In [1]:
# Importing all libaries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer, StandardScaler, minmax_scale
import pandas as pd
import numpy as np;
import scipy;
## plotting libraries
from matplotlib import pyplot as plt
import seaborn as sns
## stats Libraries
from scipy import stats
import statsmodels.api as sm
## Sklearn libraries
from sklearn import model_selection
from sklearn import metrics as metrics
from sklearn import preprocessing
from sklearn import linear_model as lm
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

### Don't run the cells below, the model takes a long computation time

In [25]:
#load the CSV data
data = pd.read_csv("data/loan_cleaned.csv")

In [26]:
# Check the target variable for inbalances
data["status"].value_counts()

0    112031
1     36639
Name: status, dtype: int64

In [3]:
data

Unnamed: 0,loan_amount,rate_of_interest,interest_rate_spread,upfront_charges,term,property_value,income,credit_score,ltv,status,...,dmy_45-54,dmy_55-64,dmy_65-74,dmy_<25,dmy_>74,dmy_to_inst,dmy_North-East,dmy_central,dmy_south,dmy_direct
0,116500,3.990000,-0.028000,0.000000,360.0,118000.000000,1740.0,758,98.728814,1,...,0,0,0,0,0,1,0,0,1,1
1,206500,4.045475,0.441651,3224.966547,360.0,497893.465696,4980.0,552,72.746457,1,...,0,1,0,0,0,1,0,0,0,1
2,406500,4.560000,0.200000,595.000000,360.0,508000.000000,9480.0,834,80.019685,0,...,0,0,0,0,0,1,0,0,1,1
3,456500,4.250000,0.681000,3224.966547,360.0,658000.000000,11880.0,587,69.376900,0,...,1,0,0,0,0,0,0,0,0,1
4,696500,4.000000,0.304200,0.000000,360.0,758000.000000,10440.0,602,91.886544,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,436500,3.125000,0.257100,9960.000000,180.0,608000.000000,7860.0,659,71.792763,0,...,0,1,0,0,0,1,0,0,1,1
148666,586500,5.190000,0.854400,0.000000,360.0,788000.000000,7140.0,569,74.428934,0,...,0,0,0,0,0,0,0,0,1,1
148667,446500,3.125000,0.081600,1226.640000,180.0,728000.000000,6900.0,702,61.332418,0,...,1,0,0,0,0,0,0,0,0,1
148668,196500,3.500000,0.582400,4323.330000,180.0,278000.000000,7140.0,737,70.683453,0,...,0,1,0,0,0,1,0,0,0,1


In [5]:
# Define y and X
y = data['status']
X = data.drop(["status"], axis = 1)

In [6]:
# Split the data in train and test set
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=.20, random_state=42)

In [7]:
# Upscale the data with Smote
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X_train, y_train)
y_sm.value_counts()

0    89537
1    89537
Name: status, dtype: int64

In [8]:
# Define the Hyperparameters to run the model with to find the best parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [ 50, 150, 200],
    'min_samples_split': [2, 3],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt'],
    'max_depth':[4,6,12],
    'bootstrap':[True] 
    #'max_samples' : ['None', 0.5],

    }
clf = RandomForestClassifier(random_state=100)

In [9]:
# Define the GridSearch function to run the model with all parameters defined in param_grid
grid_search = GridSearchCV(clf, param_grid,return_train_score=True,n_jobs=-1, scoring="recall")

In [10]:
# Run the all models with the upsacled data, this Cell runs ~ 20 min
grid_search.fit(X_sm,y_sm)

GridSearchCV(estimator=RandomForestClassifier(random_state=100), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [4, 6, 12],
                         'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3],
                         'n_estimators': [50, 150, 200]},
             return_train_score=True, scoring='recall')

In [11]:
# Get the best possible combinations of hyperparameters 
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 12,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50}

In [12]:
# Redefine the model again with the best hyperparameters and calculate the cross validation
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(random_state=100, max_features='sqrt', 
                             min_samples_leaf=1, min_samples_split=2, n_estimators=50, max_depth=12, bootstrap=True)
cross_val_scores_train = cross_val_score(clf, X_sm, y_sm, cv=5)
print(np.mean(cross_val_scores_train))


0.9999050664616707


In [13]:
# Run the new model on the scaled data
clf.fit(X_sm, y_sm)

RandomForestClassifier(max_depth=12, max_features='sqrt', n_estimators=50,
                       random_state=100)

In [14]:
# Getting the number of columns
len(X_train.columns)

45

In [15]:
# Creatating a list with the column names
feature_names = X_train.columns
feature_names = list(feature_names)

In [16]:
# Creating a dataframe with the each columns feature importance, ranked from high to low
df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
2,interest_rate_spread,0.2850252
1,rate_of_interest,0.2784423
3,upfront_charges,0.2334088
31,dmy_EQUI,0.04560415
30,dmy_CRIF,0.02128183
9,dtir1,0.02017571
8,ltv,0.01716115
20,dmy_not_neg,0.01689276
22,dmy_not_lpsm,0.01077016
32,dmy_EXP,0.01004279


In [17]:
# Calculate the cross validation for the training and test set
cross_val_scores_test = cross_val_score(clf, X_test, y_test, cv=10)
print(np.mean(cross_val_scores_train))
print(np.mean(cross_val_scores_test))

0.9999050664616707
0.9999663752521857


In [18]:
cross_val_scores_test

array([1.        , 1.        , 1.        , 0.99966375, 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [16]:
# Import the recall score function
from sklearn.metrics import recall_score

In [20]:
# Safe the predction for X_test in a variable
y_pred= clf.predict(X_test)

In [21]:
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

In [22]:
# Calculate the recall score
recall_score(y_test, y_pred)

0.9998618784530386

### Run the Cells from here on

In [38]:
# Importing the data as CSV
data2=pd.read_csv("data/loan_cleaned.csv")

In [39]:
# Defining y and X with much less feature for nicer user experience when calculating customers loan chances
y2 = data2['status']
X2 = data2[['loan_amount','income', 'credit_score', 'dmy_CRIF','dmy_EQUI', 'dmy_EXP']]
#X2 = data2[['loan_amount', 'rate_of_interest', 'interest_rate_spread','upfront_charges',
#       'income', 'credit_score', 'dmy_CRIF','dmy_EQUI', 'dmy_EXP']]

In [40]:
# Rounding the X-data to 2 decimals
X2= X2.round({'loan_amount':2,'income':2})
#X2= X2.round({'loan_amount': 2, 'rate_of_interest': 2, 'interest_rate_spread': 2, 'upfront_charges': 2,
#             'income':2})

In [41]:
# Split the data in training and test set
X_train2, X_test2, y_train2, y_test2=train_test_split(X2, y2, test_size=.30, random_state=42)

In [42]:
# Upsampling the data with smote
smote = SMOTE()
X_sm2, y_sm2 = smote.fit_resample(X_train2, y_train2)
y_sm2.value_counts()

1    78344
0    78344
Name: status, dtype: int64

In [43]:
# Define the Hyperparameters to run the model with to find the best parameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
param_grid2 = {
    'n_estimators': [ 50, 150],
    'min_samples_split': [2, 3],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt'],
    'max_depth':[12,5],
    'bootstrap':[True] 
    #'max_samples' : ['None', 0.5],

    }
clf2 = RandomForestClassifier(random_state=100)

In [44]:
# Define the GridSearch function to run the model with all parameters defined in param_grid
grid_search2 = GridSearchCV(clf2, param_grid2,return_train_score=True,n_jobs=-1, scoring="recall")

In [45]:
# Run the all models with the upsacled data
grid_search2.fit(X_sm2,y_sm2)

GridSearchCV(estimator=RandomForestClassifier(random_state=100), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [12, 5],
                         'max_features': ['sqrt'], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 3],
                         'n_estimators': [50, 150]},
             return_train_score=True, scoring='recall')

In [46]:
# Get the best possible combinations of hyperparameters 
grid_search2.best_params_

{'bootstrap': True,
 'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 150}

In [47]:
# Redefine the model again with the best hyperparameters and calculate the cross validation
from sklearn.model_selection import cross_val_score
clf2 = RandomForestClassifier(random_state=100, max_features='sqrt', 
                             min_samples_leaf=2, min_samples_split=2, n_estimators=150, max_depth=5, bootstrap=True)
cross_val_scores_train2 = cross_val_score(clf2, X_sm2, y_sm2, cv=5)
print(np.mean(cross_val_scores_train2))

0.7390680770034584


In [48]:
# # Run the new model on the scaled data
clf2.fit(X_sm2, y_sm2)


RandomForestClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=2,
                       n_estimators=150, random_state=100)

In [49]:
# Calculate the cross validation for the training and test set
cross_val_scores_test2 = cross_val_score(clf2, X_test2, y_test2, cv=10)
print(np.mean(cross_val_scores_train2))
print(np.mean(cross_val_scores_test2))

0.7390680770034584
0.8611914268453151


In [50]:
# Import the recall score function
from sklearn.metrics import recall_score

In [51]:
# Safe the predction for X_test in a variable
y_pred2 = clf2.predict(X_test2)

In [52]:
y_pred2

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [53]:
# Calculate the recall score
recall_score(y_test2, y_pred2)

0.6333150082462892

In [54]:
# Creating a new data frame to compare results
results = pd.DataFrame({"prediction":y_pred2, "real":y_test2})


In [55]:
pd.set_option('display.max_rows', 500)
results.head(300)

Unnamed: 0,prediction,real
24912,0,0
147068,0,0
123284,0,0
53610,0,0
39672,0,0
4656,0,0
38443,0,0
89423,1,0
41909,0,0
57290,1,0


### User Input for the interface

In [68]:
# Define variables with user input
#i_upfront_charges = float(input("Please enter upfront charges: "))
#i_interest_rate = float(input("Please enter interest rate: "))
#i_interest_rate_spread = float(input("Please enter interest rate spread: "))
i_EQUI = int(input("Credit Type EQUI: "))
i_CRIF = int(input("Credit Type CRIF: "))
i_EXP = int(input("Credit Type EXP: "))
i_income = float(input("Please enter the income: "))
i_loan_amount = float(input("Please enter loan_amount requested: "))
i_credit_score = int(input("Please enter credit score: "))


Credit Type EQUI: 1
Credit Type CRIF: 0
Credit Type EXP: 0
Please enter the income: 500
Please enter loan_amount requested: 1000000
Please enter credit score: 120


In [69]:
# Creating a new DF with the user input 
#new_data = pd.DataFrame(columns=["i_upfront_charges","i_interest_rate", "i_interest_rate_spread", "i_EQUI", "i_CRIF", "i_EXP",
#                                "i_income", "i_loan_amount", "i_credit_score"])
#new_data.loc[0] = pd.Series({"i_upfront_charges":i_upfront_charges,"i_interest_rate":i_interest_rate, "i_interest_rate_spread":i_interest_rate_spread,
#                             "i_EQUI":i_EQUI, "i_CRIF":i_CRIF,"i_EXP":i_EXP,"i_income":i_income, "i_loan_amount":i_loan_amount,
#                               "i_credit_score":i_credit_score})
new_data = pd.DataFrame(columns=["i_EQUI", "i_CRIF", "i_EXP","i_income", "i_loan_amount", "i_credit_score"])
new_data.loc[0] = pd.Series({"i_EQUI":i_EQUI, "i_CRIF":i_CRIF,"i_EXP":i_EXP,"i_income":i_income, "i_loan_amount":i_loan_amount,
                               "i_credit_score":i_credit_score})



In [70]:
new_data

Unnamed: 0,i_EQUI,i_CRIF,i_EXP,i_income,i_loan_amount,i_credit_score
0,1.0,0.0,0.0,500.0,1000000.0,120.0


In [71]:
# Predict the target with the user input data
new_prediction=clf2.predict(new_data)

In [72]:
# Check the predictions result
new_prediction[0]

1

In [73]:
# Check the probability
clf2.predict_proba(new_data)

array([[0.20212561, 0.79787439]])