In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler, minmax_scale
import pandas as pd
import numpy as np;
import scipy;
## plotting libraries
from matplotlib import pyplot as plt
import seaborn as sns
## stats Libraries
from scipy import stats
import statsmodels.api as sm
## Sklearn libraries
from sklearn import model_selection
from sklearn import metrics as metrics
from sklearn import preprocessing
from sklearn import linear_model as lm
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# load the data as dataframe
data = pd.read_csv("data/data_cleaned_credit.csv")

In [4]:
data.columns

Index(['Unnamed: 0', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'CNT_FAM_MEMBERS', 'STATUS', 'dmy_M', 'dmy_Y', 'dmy_Y.1',
       'dmy_Pensioner', 'dmy_State servant', 'dmy_Student', 'dmy_Working',
       'dmy_High school degree', 'dmy_Higher education', 'dmy_Lower secondary',
       'dmy_Married', 'dmy_Separated', 'dmy_Single / not married', 'dmy_Widow',
       'dmy_House / apartment', 'dmy_Municipal apartment',
       'dmy_Office apartment', 'dmy_Rented apartment', 'dmy_With parents',
       'dmy_Cleaning staff', 'dmy_Cooking staff', 'dmy_Core staff',
       'dmy_Drivers', 'dmy_HR staff', 'dmy_High skill tech staff',
       'dmy_IT staff', 'dmy_Laborers', 'dmy_Low-skill Laborers',
       'dmy_Managers', 'dmy_Medicine staff', 'dmy_Other',
       'dmy_Private service staff', 'dmy_Realty agents', 'dmy_Sales staff',
       'dmy_Secretaries', 'dmy_Security staff', 'dmy_Waiters/barmen staff'],
      dtype='object')

In [5]:
# drop columns
data = data.drop(columns=['Unnamed: 0', 'dmy_Y', 'dmy_Y.1','dmy_Cleaning staff', 'dmy_Cooking staff', 'dmy_Core staff',
       'dmy_Drivers', 'dmy_HR staff', 'dmy_High skill tech staff', 'dmy_IT staff', 'dmy_Laborers', 'dmy_Low-skill Laborers',
       'dmy_Managers', 'dmy_Medicine staff', 'dmy_Other', 'dmy_Private service staff', 'dmy_Realty agents', 'dmy_Sales staff',
       'dmy_Secretaries', 'dmy_Security staff', 'dmy_Waiters/barmen staff'])

In [6]:
data

Unnamed: 0,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,CNT_FAM_MEMBERS,STATUS,dmy_M,dmy_Pensioner,dmy_State servant,dmy_Student,dmy_Working,...,dmy_Lower secondary,dmy_Married,dmy_Separated,dmy_Single / not married,dmy_Widow,dmy_House / apartment,dmy_Municipal apartment,dmy_Office apartment,dmy_Rented apartment,dmy_With parents
0,427500.0,33,12,2,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,427500.0,33,12,2,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,112500.0,59,3,2,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
3,270000.0,52,8,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
4,270000.0,52,8,1,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22426,130500.0,44,26,2,1,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
22427,315000.0,48,7,2,1,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0
22428,157500.0,34,4,2,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
22429,157500.0,34,4,2,1,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [7]:
# X / y split
y = data['STATUS']
X = data.drop(['STATUS'], axis = 1)

In [8]:
# train/test split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.20, random_state=42)

In [9]:
# upsampling with smote
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X_train, y_train)
y_sm.value_counts()

0    15181
1    15181
Name: STATUS, dtype: int64

In [10]:
# defining the different hyperparameters to test
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [ 100, 150, 200],
    'min_samples_split': [2, 3, ],
    'min_samples_leaf' : [1, 2, ],
    'max_features': ['sqrt'],
    'max_depth':[5,8,12],
    'bootstrap':[True] 
    #'max_samples' : ['None', 0.5],

    }
clf = RandomForestClassifier(random_state=100)

In [11]:
# safe the parameters as a variable
grid_search = GridSearchCV(clf, param_grid,return_train_score=True,n_jobs=-1, scoring="recall")

In [12]:
# Run all models with the different parameters
grid_search.fit(X_sm,y_sm)

GridSearchCV(estimator=RandomForestClassifier(random_state=100), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [5, 8, 12],
                         'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3],
                         'n_estimators': [100, 150, 200]},
             return_train_score=True, scoring='recall')

In [13]:
# get the best parameters 
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 12,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 150}

In [14]:
# redefine the model with the best hyperparameters
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(random_state=100, max_features='sqrt', 
                             min_samples_leaf=1, min_samples_split=2, n_estimators=150, max_depth=12, bootstrap=True)
cross_val_scores_train = cross_val_score(clf, X_sm, y_sm, cv=5)
print(np.mean(cross_val_scores_train))


0.8008405256901809


In [15]:
# run the model with the upsampled data
clf.fit(X_sm, y_sm)

RandomForestClassifier(max_depth=12, max_features='sqrt', n_estimators=150,
                       random_state=100)

In [16]:
len(X_train.columns)

21

In [17]:
# store the feature names in a list
feature_names = X_train.columns
feature_names = list(feature_names)

In [18]:
# list the feature in a dataframe from most to least important feature
df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
0,AMT_INCOME_TOTAL,0.139271
1,DAYS_BIRTH,0.131133
2,DAYS_EMPLOYED,0.104996
9,dmy_High school degree,0.098302
10,dmy_Higher education,0.084961
14,dmy_Single / not married,0.055804
12,dmy_Married,0.052687
3,CNT_FAM_MEMBERS,0.050007
13,dmy_Separated,0.047552
8,dmy_Working,0.046685


In [19]:
# calculate cross validation
cross_val_scores_test = cross_val_score(clf, X_test, y_test, cv=10)
print(np.mean(cross_val_scores_train))
print(np.mean(cross_val_scores_test))

0.8008405256901809
0.8455521197900095


In [20]:
cross_val_scores_test

array([0.84632517, 0.84632517, 0.84632517, 0.85077951, 0.84855234,
       0.844098  , 0.844098  , 0.84151786, 0.84598214, 0.84151786])

In [21]:
from sklearn.metrics import recall_score

In [22]:
# predict on x test
y_pred= clf.predict(X_test)

In [23]:
y_pred

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [24]:
# calculate the recall score
recall_score(y_test, y_pred)

0.30434782608695654