In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import random
import time

random.seed(100)

## Data Preprocessing

In [19]:
dataset = pd.read_csv('financial_data.csv')
dataset.columns

Index(['entry_id', 'age', 'pay_schedule', 'home_owner', 'income',
       'months_employed', 'years_employed', 'current_address_year',
       'personal_account_m', 'personal_account_y', 'has_debt',
       'amount_requested', 'risk_score', 'risk_score_2', 'risk_score_3',
       'risk_score_4', 'risk_score_5', 'ext_quality_score',
       'ext_quality_score_2', 'inquiries_last_month', 'e_signed'],
      dtype='object')

## Feature Engineering

In [20]:
dataset = dataset.drop(columns = ['months_employed'])
dataset['personal_account_months'] = (dataset.personal_account_m + (dataset.personal_account_y * 12))
dataset[['personal_account_m', 'personal_account_y', 'personal_account_months']].head()
dataset = dataset.drop(columns = ['personal_account_m','personal_account_y'])

In [21]:
dataset.head()

Unnamed: 0,entry_id,age,pay_schedule,home_owner,income,years_employed,current_address_year,has_debt,amount_requested,risk_score,risk_score_2,risk_score_3,risk_score_4,risk_score_5,ext_quality_score,ext_quality_score_2,inquiries_last_month,e_signed,personal_account_months
0,7629673,40,bi-weekly,1,3135,3,3,1,550,36200,0.737398,0.903517,0.487712,0.515977,0.580918,0.380918,10,1,30
1,3560428,61,weekly,0,3180,6,3,1,600,30150,0.73851,0.881027,0.713423,0.826402,0.73072,0.63072,9,0,86
2,6934997,23,weekly,0,1540,0,0,1,450,34550,0.642993,0.766554,0.595018,0.762284,0.531712,0.531712,7,0,19
3,5682812,40,bi-weekly,0,5230,6,1,1,700,42150,0.665224,0.960832,0.767828,0.778831,0.792552,0.592552,8,1,86
4,5335819,33,semi-monthly,0,3590,5,2,1,1100,53850,0.617361,0.85756,0.613487,0.665523,0.744634,0.744634,12,0,98


## One hot Encoding

In [22]:
dataset = pd.get_dummies(dataset)

In [23]:
dataset.columns

Index(['entry_id', 'age', 'home_owner', 'income', 'years_employed',
       'current_address_year', 'has_debt', 'amount_requested', 'risk_score',
       'risk_score_2', 'risk_score_3', 'risk_score_4', 'risk_score_5',
       'ext_quality_score', 'ext_quality_score_2', 'inquiries_last_month',
       'e_signed', 'personal_account_months', 'pay_schedule_bi-weekly',
       'pay_schedule_monthly', 'pay_schedule_semi-monthly',
       'pay_schedule_weekly'],
      dtype='object')

In [24]:
dataset = dataset.drop(columns = ['pay_schedule_semi-monthly'])

## Removing Extra Columns

In [25]:
response = dataset['e_signed']
users = dataset['entry_id']
dataset = dataset.drop(columns = ['e_signed','entry_id'])

## Spliting into Train and test Set

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset, response, test_size=0.2, random_state=0)

## Feature Scaling

In [27]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_test2 = pd.DataFrame(sc_X.fit_transform(X_test))
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values
X_test2.index = X_test.index.values
X_train = X_train2
X_test = X_test2

## Building Model

## Comparing Models

### Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0, penalty='l1')
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Predicting test set

In [31]:
y_pred = classifier.predict(X_test)

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec= recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [35]:
results = pd.DataFrame([['Linear Regression (Lasso)', acc, prec, rec, f1]] , columns=['Model', 'Accuracy','Precision', 'Recall', 'F1 Score'])

### SVM (Linear)

In [36]:
from sklearn.svm import SVC
classifier = SVC(random_state = 0, kernel='linear')
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

### Predicting test set

In [37]:
y_pred = classifier.predict(X_test)

In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec= recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [40]:
model_results = pd.DataFrame([['SVM (Linear)', acc, prec, rec, f1]] , columns=['Model', 'Accuracy','Precision', 'Recall', 'F1 Score'])

In [41]:
results = results.append(model_results, ignore_index=True)

In [42]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.563372,0.577778,0.701245,0.633552
1,SVM (Linear),0.568398,0.578536,0.729772,0.645413


### SVM (rbf)

In [43]:
from sklearn.svm import SVC
classifier = SVC(random_state = 0, kernel='rbf')
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

### Predicting test set

In [44]:
y_pred = classifier.predict(X_test)

In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec= recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [46]:
model_results = pd.DataFrame([['SVM (rbf)', acc, prec, rec, f1]] , columns=['Model', 'Accuracy','Precision', 'Recall', 'F1 Score'])

In [47]:
results = results.append(model_results, ignore_index=True)

In [48]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.563372,0.577778,0.701245,0.633552
1,SVM (Linear),0.568398,0.578536,0.729772,0.645413
2,SVM (rbf),0.592686,0.607519,0.687241,0.644926


In [49]:
### Random forest (n=100)

In [51]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state = 0, n_estimators = 100, criterion='entropy')
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [52]:
### Predicting test set

In [53]:
y_pred = classifier.predict(X_test)

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec= recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [55]:
model_results = pd.DataFrame([['Random Forest (n=100)', acc, prec, rec, f1]] , columns=['Model', 'Accuracy','Precision', 'Recall', 'F1 Score'])

In [56]:
results = results.append(model_results, ignore_index=True)

In [57]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.563372,0.577778,0.701245,0.633552
1,SVM (Linear),0.568398,0.578536,0.729772,0.645413
2,SVM (rbf),0.592686,0.607519,0.687241,0.644926
3,Random Forest (n=100),0.623953,0.643741,0.674793,0.658901


## Parameter Tuning of Random Forest Classifier

### Applying Grid Search

##### Round 1 : Entropy

In [62]:
parameters = {
    "max_depth": [3, None],
    "max_features": [1, 5 ,10],
    "min_samples_split": [2, 5 ,10],
    "min_samples_leaf": [1, 5 ,10],
    "bootstrap": [True, False],
    "criterion": ['entropy'],
}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy' , cv=10, n_jobs=-1)
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds " % (t1 - t0))

Took 595.45 seconds 


In [63]:
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters

(0.6356275303643725,
 {'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 5,
  'min_samples_leaf': 1,
  'min_samples_split': 10})

##### Round 2 : Entropy

In [64]:
parameters = {
    "max_depth": [None],
    "max_features": [3, 5 ,7],
    "min_samples_split": [8, 10,12],
    "min_samples_leaf": [1, 2 ,3],
    "bootstrap": [True],
    "criterion": ['entropy'],
}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy' , cv=10, n_jobs=-1)
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds " % (t1 - t0))

Took 203.74 seconds 


In [65]:
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters

(0.6356275303643725,
 {'bootstrap': True,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 5,
  'min_samples_leaf': 1,
  'min_samples_split': 10})

### Predicting test set

In [66]:
y_pred = classifier.predict(X_test)

In [67]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec= recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [69]:
model_results = pd.DataFrame([['Random Forest (n=100 GSx2 + Entropy)', acc, prec, rec, f1]] , columns=['Model', 'Accuracy','Precision', 'Recall', 'F1 Score'])
results = results.append(model_results, ignore_index=True)
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.563372,0.577778,0.701245,0.633552
1,SVM (Linear),0.568398,0.578536,0.729772,0.645413
2,SVM (rbf),0.592686,0.607519,0.687241,0.644926
3,Random Forest (n=100),0.623953,0.643741,0.674793,0.658901
4,Random Forest (n=100),0.623953,0.643741,0.674793,0.658901
5,Random Forest (n=100 GSx2 + Entropy),0.623953,0.643741,0.674793,0.658901


##### Round 1 : Gini

In [70]:
parameters = {
    "max_depth": [3, None],
    "max_features": [1, 5 ,10],
    "min_samples_split": [2, 5 ,10],
    "min_samples_leaf": [1, 5 ,10],
    "bootstrap": [True, False],
    "criterion": ['gini'],
}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy' , cv=10, n_jobs=-1)
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds " % (t1 - t0))

Took 400.01 seconds 


In [71]:
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters

(0.6363953650705012,
 {'bootstrap': True,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': 10,
  'min_samples_leaf': 10,
  'min_samples_split': 2})

##### Round 2 : Gini

In [72]:
parameters = {
    "max_depth": [None],
    "max_features": [8, 10 ,12],
    "min_samples_split": [2, 3,4],
    "min_samples_leaf": [8, 10 ,12],
    "bootstrap": [True],
    "criterion": ['gini'],
}
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy' , cv=10, n_jobs=-1)
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
t1 = time.time()
print("Took %0.2f seconds " % (t1 - t0))

Took 193.55 seconds 


In [73]:
rf_best_accuracy = grid_search.best_score_
rf_best_parameters = grid_search.best_params_
rf_best_accuracy, rf_best_parameters

(0.6386988691888873,
 {'bootstrap': True,
  'criterion': 'gini',
  'max_depth': None,
  'max_features': 12,
  'min_samples_leaf': 12,
  'min_samples_split': 2})

### Predicting test set

In [74]:
y_pred = classifier.predict(X_test)

In [75]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec= recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [76]:
model_results = pd.DataFrame([['Random Forest (n=100 GSx2 + Gini)', acc, prec, rec, f1]] , columns=['Model', 'Accuracy','Precision', 'Recall', 'F1 Score'])
results = results.append(model_results, ignore_index=True)
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Linear Regression (Lasso),0.563372,0.577778,0.701245,0.633552
1,SVM (Linear),0.568398,0.578536,0.729772,0.645413
2,SVM (rbf),0.592686,0.607519,0.687241,0.644926
3,Random Forest (n=100),0.623953,0.643741,0.674793,0.658901
4,Random Forest (n=100),0.623953,0.643741,0.674793,0.658901
5,Random Forest (n=100 GSx2 + Entropy),0.623953,0.643741,0.674793,0.658901
6,Random Forest (n=100 GSx2 + Gini),0.623953,0.643741,0.674793,0.658901


# End of Model

## Formating Final Results

In [79]:
final_results = pd.concat([y_test, users],  axis = 1).dropna()
final_results['predictions'] = y_pred
final_results = final_results[['entry_id', 'e_signed', 'predictions']]
final_results.head()

Unnamed: 0,entry_id,e_signed,predictions
8,6493191,1.0,0
9,8908605,1.0,1
12,6889184,1.0,0
16,9375601,0.0,1
18,8515555,1.0,1
