In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('german_credit_data.csv')

# Drop unnecessary columns
df.drop(['Unnamed: 0', 'Checking account'], axis=1, inplace=True)

# Convert categorical variables to numerical form using label encoding
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['Housing'] = le.fit_transform(df['Housing'])
df['Saving accounts'] = le.fit_transform(df['Saving accounts'])
df['Purpose'] = le.fit_transform(df['Purpose'])

# Split the dataset into input and output variables
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Print the number of samples in each class before oversampling
print("Number of samples in each class before oversampling:")
print(y.value_counts())

# Oversample the minority class using KMeansSMOTE
from imblearn.over_sampling import KMeansSMOTE
kmeans_smote = KMeansSMOTE(random_state=42, cluster_balance_threshold=0.2)
X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)

# Print the number of samples in each class after oversampling
print("Number of samples in each class after oversampling:")
print(pd.Series(y_resampled).value_counts())

# Detect outliers using Isolation Forest
from sklearn.ensemble import IsolationForest
iso_forest = IsolationForest(random_state=42, contamination='auto')
y_outliers = iso_forest.fit_predict(X_resampled)

# Print the number of outliers detected
print("Number of outliers detected:")
print(pd.Series(y_outliers).value_counts())

# Remove outliers
X_no_outliers = X_resampled[y_outliers == 1]
y_no_outliers = y_resampled[y_outliers == 1]

# Print the number of samples in each class after removing outliers
print("Number of samples in each class after removing outliers:")
print(pd.Series(y_no_outliers).value_counts())

Number of samples in each class before oversampling:
good    700
bad     300
Name: Risk, dtype: int64




Number of samples in each class after oversampling:
bad     707
good    700
Name: Risk, dtype: int64
Number of outliers detected:
 1    979
-1    428
dtype: int64
Number of samples in each class after removing outliers:
bad     519
good    460
Name: Risk, dtype: int64


In [2]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier

from genetic_selection import GeneticSelectionCV
estimator = ExtraTreesClassifier()

selector = GeneticSelectionCV(estimator,cv=5,
                                  verbose=1,
                                  scoring="r2", 
                                  max_features=5,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
selector = selector.fit(X_no_outliers, y_no_outliers)

Selecting features with genetic algorithm.
gen	nevals	avg             	std                            	min             	max             
0  	50    	[ nan  3.1  nan]	[      nan  1.431782       nan]	[ nan   1.  nan]	[ nan   5.  nan]
1  	23    	[  nan  3.38   nan]	[      nan  1.468196       nan]	[ nan   0.  nan]	[ nan   7.  nan]
2  	28    	[  nan  3.36   nan]	[      nan  1.452722       nan]	[ nan   1.  nan]	[ nan   5.  nan]
3  	37    	[  nan  3.34   nan]	[  nan  1.38   nan]            	[ nan   0.  nan]	[ nan   6.  nan]
4  	24    	[  nan  3.28   nan]	[      nan  1.371714       nan]	[ nan   0.  nan]	[ nan   6.  nan]
5  	26    	[ nan  3.5  nan]   	[      nan  1.459452       nan]	[ nan   0.  nan]	[ nan   6.  nan]
6  	26    	[  nan  3.28   nan]	[      nan  1.357056       nan]	[ nan   0.  nan]	[ nan   6.  nan]
7  	36    	[  nan  2.98   nan]	[      nan  1.174564       nan]	[ nan   1.  nan]	[ nan   6.  nan]
8  	24    	[  nan  2.84   nan]	[      nan  1.172348       nan]	[ nan   1.  nan]	[ nan   6.

In [3]:
print('Genetic algorithm result:')
print(selector.support_)
X_no_outliers = pd.DataFrame(X_no_outliers, columns=X.columns)
print('List of important features :')
print(X_no_outliers.columns[selector.support_])
print('List of less important features :')
print(X_no_outliers.columns[~selector.support_])

Genetic algorithm result:
[ True False False  True  True False  True  True]
List of important features :
Index(['Age', 'Housing', 'Saving accounts', 'Duration', 'Purpose'], dtype='object')
List of less important features :
Index(['Sex', 'Job', 'Credit amount'], dtype='object')
