In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import KMeansSMOTE

# Load the dataset
df = pd.read_csv('UCI_Credit_Card.csv')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Print the number of samples in each class before oversampling
print("Number of samples in each class before oversampling:")
print(y.value_counts())

# Perform KMeansSMOTE to oversample the minority class
kmeans_smote = KMeansSMOTE(random_state=42, cluster_balance_threshold=0.2)
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)

# Print the number of samples in each class after oversampling
print("Number of samples in each class after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Perform outlier detection using Isolation Forest
iso_forest = IsolationForest(random_state=42, contamination='auto')
y_outliers = iso_forest.fit_predict(X_resampled)

# Print the number of outliers detected
print("Number of outliers detected:")
print(pd.Series(y_outliers).value_counts())

# Remove the outliers
X_no_outliers = X_resampled[y_outliers == 1]
y_no_outliers = y_resampled[y_outliers == 1]

# Print the number of samples in each class after removing outliers
print("Number of samples in each class after removing outliers:")
print(pd.Series(y_no_outliers).value_counts())

Number of samples in each class before oversampling:
0    23364
1     6636
Name: default.payment.next.month, dtype: int64




Number of samples in each class after oversampling:
1    23367
0    23364
Name: default.payment.next.month, dtype: int64
Number of outliers detected:
 1    43148
-1     3583
dtype: int64
Number of samples in each class after removing outliers:
1    22281
0    20867
Name: default.payment.next.month, dtype: int64


In [2]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
     ------------------------------------ 226.0/226.0 kB 726.5 kB/s eta 0:00:00
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, imbalanced-learn, imblearn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0 joblib-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install sklearn-genetic

Collecting sklearn-genetic
  Downloading sklearn_genetic-0.5.1-py3-none-any.whl (11 kB)
Collecting deap>=1.0.2
  Downloading deap-1.3.3-cp39-cp39-win_amd64.whl (114 kB)
     ------------------------------------ 114.3/114.3 kB 738.0 kB/s eta 0:00:00
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
     -------------------------------------- 132.9/132.9 kB 1.3 MB/s eta 0:00:00
Collecting dill>=0.3.6
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
     -------------------------------------- 110.5/110.5 kB 2.1 MB/s eta 0:00:00
Installing collected packages: dill, deap, multiprocess, sklearn-genetic
  Attempting uninstall: dill
    Found existing installation: dill 0.3.4
    Uninstalling dill-0.3.4:
      Successfully uninstalled dill-0.3.4
Successfully installed deap-1.3.3 dill-0.3.6 multiprocess-0.70.14 sklearn-genetic-0.5.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier

from genetic_selection import GeneticSelectionCV
estimator = ExtraTreesClassifier()

selector = GeneticSelectionCV(estimator,cv=5,
                                  verbose=1,
                                  scoring="r2", 
                                  max_features=10,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
selector = selector.fit(X_no_outliers, y_no_outliers)

Selecting features with genetic algorithm.
gen	nevals	avg                            	std                            	min                            	max                               
0  	50    	[-0.258389  5.72      0.315346]	[ 0.529035  3.187726  0.089977]	[-1.581147  1.        0.02519 ]	[  0.407458  10.         0.460512]
1  	25    	[-399.797131    7.12      400.348406]	[ 1959.633212     2.688048  1959.520677]	[-10000.            1.            0.261438]	[     0.41943     11.       10000.     ]
2  	22    	[-799.710737    8.66      800.34238 ]	[ 2713.017295     1.893251  2712.831031]	[-10000.           3.           0.28254]   	[     0.420637     11.        10000.      ]
3  	25    	[-1199.671826     9.42      1200.3268  ]	[ 3249.736549     1.312859  3249.494683]	[-10000.            5.            0.339208]	[     0.439198     13.        10000.      ]
4  	25    	[-1399.655599     9.66      1400.31958 ]	[ 3470.009271     1.176605  3469.741373]	[-10000.            7.            0.337742]	[ 

In [3]:
print('Genetic algorithm result:')
print(selector.support_)
X_no_outliers = pd.DataFrame(X_no_outliers, columns=X.columns)
print('List of important features :')
print(X_no_outliers.columns[selector.support_])
print('List of less important features :')
print(X_no_outliers.columns[~selector.support_])

Genetic algorithm result:
[False  True False False False False  True False False False False False
  True False  True False  True  True  True False False  True  True  True]
List of important features :
Index(['LIMIT_BAL', 'PAY_0', 'BILL_AMT1', 'BILL_AMT3', 'BILL_AMT5',
       'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')
List of less important features :
Index(['ID', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_2', 'PAY_3', 'PAY_4',
       'PAY_5', 'PAY_6', 'BILL_AMT2', 'BILL_AMT4', 'PAY_AMT2', 'PAY_AMT3'],
      dtype='object')
