In [1]:
import pandas as pd

# Problem 3: Credit scoring 

Credit scoring algorithms, which make a guess at the probability of default, are the method banks use to determine whether or not a loan should be granted.
In this assignment, you will build a Random Forest to predict whether the customer will repay their credit within 90 days.

In [2]:
# load the data
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/credit_scoring.csv'
credit_scoring = pd.read_csv(url)
credit_scoring.head()

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.0,0,0,,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0
3,0,43,0,0.00019,0,0,10500.0,2.0
4,1,49,0,0.27182,0,0,400.0,0.0


**Data Description**

| Feature | Description |
| :- | -: |
|SeriousDlqin2yrs (target variable) | Customer hasn't paid the loan debt within 90 days 
|age	| Customer age
|DebtRatio | Total monthly loan payments (loan, alimony, etc.) / Total monthly income percentage
|NumberOfTime30-59DaysPastDueNotWorse | The number of cases when client has overdue 30-59 days (not worse) on other loans |during the last 2 years
|NumberOfTimes90DaysLate	Input Feature | Number of cases when customer had 90+dpd overdue on other credits
|NumberOfTime60-89DaysPastDueNotWorse | 	Number of cased when customer has 60-89dpd (not worse) during the last 2 years
|NumberOfDependents | The number of customer dependents


Your **goal is to train a Random Forest classifier that predicts the target column (`SeriousDlqin2yrs`), tune the Random Forest hyperparameters, and test the performance of your classification model (use `recall` and `accuracy` to evaluate the performance.)

In [3]:
credit_scoring.head(3)

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.0,0,0,,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0


In [4]:
X = credit_scoring.drop('SeriousDlqin2yrs', axis=1)
y = credit_scoring.SeriousDlqin2yrs

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [7]:
# your code here
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

num_features = ['age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse', 'MonthlyIncome', 'NumberOfDependents']
num_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

cat_features = []
cat_processor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')) # onehot encoder can take: handle_unknwon='ignore'
])

feature_processor = ColumnTransformer(transformers=[
    ('num_processor', num_processor, num_features),
    ('cat_processor', cat_processor, cat_features)
])

pipe = Pipeline(steps=[
    ('feature_processor', feature_processor),
    ('forest', DecisionTreeClassifier(max_depth=5))
])

In [8]:
# your code here
from sklearn.model_selection import GridSearchCV
depth_list = [2,4,8,16]
param_dic = {'forest__max_depth': depth_list}

grid = GridSearchCV(pipe,param_dic, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [10]:
# your code here
from sklearn.metrics import mean_squared_error, mean_absolute_error, recall_score, accuracy_score
import numpy as np

best_pipe = grid.best_estimator_
y_test_pred = best_pipe.predict(X_test)


def TestPipe(y,y_pred):
    print('params:',grid.best_params_)
    # MSE
    print('MSE:',np.sqrt(mean_squared_error(y,y_pred)))
    # MAE
    print('MAE:',mean_absolute_error(y,y_pred))
    # Recall
    print('Recall:', recall_score(y,y_pred))
    # Accuracy
    print('Accuracy:', accuracy_score(y,y_pred))

TestPipe(y_test,y_test_pred)

params: {'forest__max_depth': 4}
MSE: 0.40741398236536547
MAE: 0.16598615302680633
Recall: 0.4294646346228996
Accuracy: 0.8340138469731937


Hmm, 83% accuracy is okay. 