In [1]:
# Import the required libraries
import pandas as pd
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import KMeansSMOTE

# Load the dataset
df = pd.read_csv('australian.dat', sep=' ', header=None)

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Print the number of samples in each class before oversampling
print("Number of samples in each class before oversampling:")
print(y.value_counts())

# Perform KMeansSMOTE to oversample the minority class
kmeans_smote = KMeansSMOTE(random_state=42, cluster_balance_threshold=0.2)
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)

# Print the number of samples in each class after oversampling
print("Number of samples in each class after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Perform outlier detection using Isolation Forest
iso_forest = IsolationForest(random_state=42, contamination='auto')
y_outliers = iso_forest.fit_predict(X_resampled)

# Print the number of outliers detected
print("Number of outliers detected:")
print(pd.Series(y_outliers).value_counts())

# Remove the outliers
X_no_outliers = X_resampled[y_outliers == 1]
y_no_outliers = y_resampled[y_outliers == 1]

# Print the number of samples in each class after removing outliers
print("Number of samples in each class after removing outliers:")
print(pd.Series(y_no_outliers).value_counts())

Number of samples in each class before oversampling:
0    383
1    307
Name: 14, dtype: int64




Number of samples in each class after oversampling:
1    390
0    383
Name: 14, dtype: int64
Number of outliers detected:
 1    619
-1    154
dtype: int64
Number of samples in each class after removing outliers:
0    321
1    298
Name: 14, dtype: int64


In [2]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier

from genetic_selection import GeneticSelectionCV
estimator = ExtraTreesClassifier()

selector = GeneticSelectionCV(estimator,cv=5,
                                  verbose=1,
                                  scoring="r2", 
                                  max_features=10,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
selector = selector.fit(X_no_outliers, y_no_outliers)

Selecting features with genetic algorithm.
gen	nevals	avg                            	std                            	min                            	max                               
0  	50    	[-0.07538   4.94      0.178054]	[ 0.459679  2.781438  0.06617 ]	[-1.045155  1.        0.059859]	[  0.534269  10.         0.322818]
1  	27    	[-199.778174    6.8       200.188574]	[ 1400.03172      2.465766  1399.973062]	[-10000.            1.            0.048153]	[     0.534269     11.        10000.      ]
2  	30    	[-199.553663    8.02      200.149058]	[ 1400.063768     1.760568  1399.978706]	[-10000.            3.            0.095074]	[     0.560215     12.        10000.      ]
3  	28    	[ 0.487018  8.12      0.146276]      	[ 0.15735   1.409113  0.035252]         	[-0.300165  5.        0.051226]            	[  0.560215  10.         0.277992]         
4  	36    	[ 0.528264  8.14      0.151418]      	[ 0.114465  1.113732  0.029314]         	[-0.241808  5.        0.078037]            	[  0.

In [3]:
print('Genetic algorithm result:')
print(selector.support_)
X_no_outliers = pd.DataFrame(X_no_outliers, columns=X.columns)
print('List of important features :')
print(X_no_outliers.columns[selector.support_])
print('List of less important features :')
print(X_no_outliers.columns[~selector.support_])

Genetic algorithm result:
[False  True  True False  True False False  True False False  True  True
  True  True]
List of important features :
Int64Index([1, 2, 4, 7, 10, 11, 12, 13], dtype='int64')
List of less important features :
Int64Index([0, 3, 5, 6, 8, 9], dtype='int64')
