In [1]:
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics

import pandas as pd

In [2]:
loans = pd.read_csv('full-loan-data.csv')
loans.sample(5)

Unnamed: 0,OCCUPATION,BUSINESS_TYPE,STATE,No_of_Mobile_No,GENDER,MARITAL_STATUS,REGION,BASIC_CURRENT,BASIC_SAVINGS,ATMCARD,TOTAL_PRODUCTS,Credit,Debit,balance,Loan Tenure,Loan Amount (Principal),Latest Known Status,Ever 90dpd+,Currently ≥ 60dpd,Bad Indicator
2214,Artisan,OTHERS,Edo,1,F,Married,WEST,N,N,Y,2,5556250.0,5390130.0,166120.0,11,350000,Current (active no arrears),0.0,0.0,0.0
1206,Self Employed,OTHERS,LAGOS,1,M,0,LAGOS_MAINLAND,Y,N,Y,2,1215896.29,1056829.49,159066.8,6,149000,In arrears,0.0,1.0,1.0
767,OTHERS,OTHERS,Lagos,1,F,Divorced,HEAD_OFFICE,N,N,Y,2,1282057.38,853095.73,428961.65,12,350000,Current (active no arrears),0.0,0.0,0.0
1025,BUSINESS,OTHERS,Kaduna,1,M,Married,LAGOS_MAINLAND,N,N,Y,2,1554000.0,1434930.5,119069.5,12,149000,Current (active no arrears),0.0,0.0,0.0
1188,Artisan,OTHERS,AKWA IBOM,1,M,0,SOUTH,N,N,Y,0,154774.67,228776.48,-74001.81,12,57000,Current (active no arrears),0.0,0.0,0.0


In [3]:
y = loans['Bad Indicator']
X = loans.drop(['Bad Indicator'], axis='columns')

## Dealing Categorical Features

In [4]:
categorical_columns = ['OCCUPATION', 'BUSINESS_TYPE', 'STATE', 'GENDER', 'MARITAL_STATUS', 'REGION', 'BASIC_CURRENT', 'BASIC_SAVINGS', 'ATMCARD']
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)]
)

# Split training and test datasets

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=31)

We will try the following:
* Logistic Regression
* Random Forest
* Gradient Boosting

and pick the best of one to deploy.

# Logistic Regression

In [6]:
lr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('poly', PolynomialFeatures(2, interaction_only=True)),
        ('classifier', LogisticRegression(C=0.2, max_iter=5000,
                                solver='liblinear', penalty='l2',
                                random_state=31))
    ])

# lr.fit(x_train, y_train)

# Random Forest

In [7]:
rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=50, random_state=10,
                                              min_samples_leaf=2, max_depth=200,
                                              min_samples_split=2, n_jobs=-1))
    ])

# rf.fit(x_train, y_train)

# Gradient Boosting

In [8]:
gb = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(n_estimators=20, min_samples_leaf=2,
                                                  max_depth=30, random_state=31,
                                                  max_leaf_nodes=100))
    ])

# gb.fit(x_train, y_train)

We will first employ cross validation to choose an appropriate model.

In [9]:
models = [('LR', lr), ('RF', rf), ('GB', gb)]
scoring = 'accuracy'

for name, model in models:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
    result = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)

    print('{} -> Mean score: {} Std: {}'.format(name, result.mean(), result.std()))

LR -> Mean score: 0.7629706555403406 Std: 0.006274600817136129
RF -> Mean score: 0.7723961731810203 Std: 0.002826955924912132
GB -> Mean score: 0.7483967613568481 Std: 0.007473974260688902


It appears Random Forest model performs the best, so let us improve on it.
We will do this using Grid Search for hyperparameter tuning.

In [10]:
params = {
    "classifier__n_estimators": range(20, 60, 5),
    "classifier__min_samples_leaf": range(1, 5),
    "classifier__max_depth": range(15, 60, 3),
    "classifier__max_leaf_nodes": range(100, 400, 30)
}
gscv = GridSearchCV(rf, param_grid=params, cv=5, scoring='roc_auc').fit(x_train, y_train)

best_model = gscv.best_estimator_

In [11]:
best_model.score(x_test, y_test)

0.7269230769230769

In [12]:
best_model.predict_proba(x_test)

array([[0.84424062, 0.15575938],
       [0.7075282 , 0.2924718 ],
       [0.68252237, 0.31747763],
       [0.75508388, 0.24491612],
       [0.78434805, 0.21565195],
       [0.75229642, 0.24770358],
       [0.81464602, 0.18535398],
       [0.828745  , 0.171255  ],
       [0.75039507, 0.24960493],
       [0.72690928, 0.27309072],
       [0.72024082, 0.27975918],
       [0.78187537, 0.21812463],
       [0.72095068, 0.27904932],
       [0.75154872, 0.24845128],
       [0.77162891, 0.22837109],
       [0.72805738, 0.27194262],
       [0.54729497, 0.45270503],
       [0.75741557, 0.24258443],
       [0.77059296, 0.22940704],
       [0.77468544, 0.22531456],
       [0.7983567 , 0.2016433 ],
       [0.75162701, 0.24837299],
       [0.75229642, 0.24770358],
       [0.71873242, 0.28126758],
       [0.75749968, 0.24250032],
       [0.76107414, 0.23892586],
       [0.66980135, 0.33019865],
       [0.76329868, 0.23670132],
       [0.78253308, 0.21746692],
       [0.74650737, 0.25349263],
       [0.

* The column with heading 0 is the probability that the loan is a good loan
* The column with heading 1 is the probability that the loan is a bad loan

## Export model artefact to file

In [13]:
pickle.dump(best_model, open('best-model', 'wb'))

## Model Improvement
The following are ways in which the model can be improved:
* Scaling the dataset (e.g Min-Max scaling)
* Providing more features, e.g. the interest to be paid on the loan.
* A larger dataset

# Model Monitoring and Maintenance in Production

Constant re-training of the model is important to keep the model updated. This can be ensured by:
* Collecting performance metrics (accuracy, roc-auc, etc.) and using that to make the model better.
* Scaling the API using Kubernetes as requests increase.
* Model versioning would be used to maintain model across versions so that rollback can be appropriately done if required.