<a href="https://colab.research.google.com/github/yal521/DSCI441-machine-learning-project/blob/main/Feature_Selection_PSO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Feature Selection: Particle Swarm Optimization

In [None]:
# install niapy for the use of swarm intelligence
!pip install niapy --pre

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Defining class SVMFeatureSelection

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
import pandas as pd

from niapy.problems import Problem
from niapy.task import Task
from niapy.algorithms.basic import ParticleSwarmOptimization


class SVMFeatureSelection(Problem):
    """Feature-selection problem scored by the cross-validated error of an SVC.

    A candidate solution is a vector with one entry per column of ``X_train``,
    each bounded to [0, 1]; entries greater than 0.5 mark a column as selected.

    Args:
        X_train: 2-D array of training features (indexed as ``X_train[:, mask]``).
        y_train: Training labels passed to ``cross_val_score``.
        alpha: Weight given to the classification error; ``1 - alpha`` weights
            the fraction of features that were selected.
    """

    def __init__(self, X_train, y_train, alpha=0.99):
        # One search dimension per feature column, each constrained to [0, 1].
        super().__init__(dimension=X_train.shape[1], lower=0, upper=1)
        self.X_train, self.y_train = X_train, y_train
        self.alpha = alpha

    def _evaluate(self, x):
        """Return the fitness of candidate ``x`` (lower is better).

        Returns 1.0 when no feature is selected; otherwise a weighted sum of
        the 3-fold cross-validated SVC error and the share of features kept.
        """
        mask = x > 0.5
        n_kept = mask.sum()
        if n_kept == 0:
            # An empty feature subset cannot be fitted, so assign a fixed score.
            return 1.0

        fold_scores = cross_val_score(
            SVC(), self.X_train[:, mask], self.y_train, cv=3, n_jobs=-1
        )
        error_rate = 1 - fold_scores.mean()
        kept_ratio = n_kept / self.X_train.shape[1]
        return self.alpha * error_rate + (1 - self.alpha) * kept_ratio

In [None]:
import os
import numpy as np
import pandas as pd
from google.colab import drive
from random import sample
# Mount Google Drive and make the project folder the working directory so the
# CSV files below can be opened by bare file name.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Yang_DSCI441_Final_Project')

# Load the feature table and the label table (file names indicate post-SMOTE data).
Credit_data_TR, Credit_data_Label = (
    pd.read_csv(csv_name)
    for csv_name in ("Credit_data_TR_after_SMOTE.csv", "Credit_data_Label_after_SMOTE.csv")
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Attach the labels to the feature table and draw a balanced 500/500 subsample.

# Extract the 'Class' column as a plain Python list; a list is assigned by
# position, so no index alignment takes place between the two frames.
Class = Credit_data_Label['Class'].tolist()

# Add the labels as a new column. Credit_data3 is a second name for the same
# DataFrame (not a copy), so Credit_data_TR also carries the 'Class' column.
Credit_data_TR['Class'] = Class
Credit_data3 = Credit_data_TR

# Sample 500 rows from each class. random_state fixes the draw so that a fresh
# run of the notebook selects the same rows (matching the seed used for the
# train/test split and the PSO run).
Credit_data4 = Credit_data3.loc[Credit_data3['Class'] == 0].sample(n=500, random_state=1234)
Credit_data5 = Credit_data3.loc[Credit_data3['Class'] == 1].sample(n=500, random_state=1234)
Credit_data6 = pd.concat([Credit_data4, Credit_data5])
Credit_data6.shape

(1000, 31)

In [None]:
# Separate the balanced sample into the target column and the predictor columns.
Label = Credit_data6['Class']
Transaction = Credit_data6.drop(columns=['Class'])
feature_names = Transaction.columns

In [None]:
# Convert features, labels and column names to NumPy arrays.
a, b, c = (obj.to_numpy() for obj in (Transaction, Label, feature_names))

# Hold out 20% of the rows for testing, stratified on the label.
a_train, a_test, b_train, b_test = train_test_split(
    a, b, test_size=0.20, stratify=b, random_state=1234
)

# Search for a feature subset with PSO: 10 particles, at most 100 iterations.
problem = SVMFeatureSelection(a_train, b_train)
task = Task(problem, max_iters=100)
algorithm = ParticleSwarmOptimization(population_size=10, seed=1234)
best_features, best_fitness = algorithm.run(task)

# Entries above 0.5 mark a feature as selected.
selected_features = best_features > 0.5
print('Number of selected features:', selected_features.sum())
print('Selected features:', ', '.join(c[selected_features].tolist()))

# Compare an SVC trained on the selected columns with one trained on all columns.
model_selected = SVC().fit(a_train[:, selected_features], b_train)
print('Subset accuracy:', model_selected.score(a_test[:, selected_features], b_test))

model_all = SVC().fit(a_train, b_train)
print('All Features Accuracy:', model_all.score(a_test, b_test))

Number of selected features: 13
Selected features: V1, V3, V6, V7, V9, V10, V11, V14, V17, V18, V19, V24, V26
Subset accuracy: 0.995
All Features Accuracy: 0.565
