In [None]:
import math
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Load the loss dataset from a CSV located in the notebook's working directory.
df = pd.read_csv('loss_dataset.csv')

In [None]:
# Sanity check: (rows, columns) of the frame as loaded.
df.shape

## Preprocessing 

In [None]:
# Parse both date columns so they can be subtracted from each other.
df['ChargeOffDate'] = pd.to_datetime(df['ChargeOffDate'])
df['ApprovalDate'] = pd.to_datetime(df['ApprovalDate'])

# Loan age in average months between approval and charge-off.
# np.timedelta64(1, 'M') is an ambiguous unit that recent pandas versions refuse
# to convert, so divide by the mean Gregorian month length explicitly
# (365.2425 / 12 days, the value the 'M' unit historically stood for).
AVG_DAYS_PER_MONTH = 365.2425 / 12
df['loan_age'] = (df['ChargeOffDate'] - df['ApprovalDate']) / pd.Timedelta(days=AVG_DAYS_PER_MONTH)

# Loss expressed as a fraction of the approved amount.
df['loss_pcrt'] = df['GrossChargeOffAmount'] / df['GrossApproval']

In [None]:
# Remove identifiers, raw dates, status/bookkeeping fields and indicator columns
# that are not used as model inputs. ('ChargeOffDate' was previously listed twice.)
df = df.drop(columns=['Unnamed: 0','LoanStatus','log_amount','end_date','quarter_index','ApprovalDate',
                      'ChargeOffDate','time','status','id','death','time_start','time_end','ApprovalYear','EndYear',
                      'BorrState','indicator_Leverage','indicator_HPI','indicator_IndustryGDP',
                      'indicator_UnemploymentRate', 'indicator_PersonalIncome',
                      'indicator_GSP','indicator_NaicsCode'])

In [None]:
# Drop the second leftover index column ('Unnamed: 0.1').
# NOTE(review): presumably produced by an earlier CSV round-trip; confirm against the data export.
df = df.drop(columns=['Unnamed: 0.1'])

In [None]:
# One-hot encode the remaining non-numeric columns (pd.get_dummies with default arguments).
df = pd.get_dummies(df)

In [None]:
# Inspect the column set after encoding.
df.columns

## Training 

In [None]:
def format_rocovery(amount):
    """Return 1 when the charged-off amount is exactly zero (fully recovered), otherwise 0."""
    return 1 if amount == 0 else 0

In [None]:
# Binary target: 1 when nothing was charged off, 0 otherwise.
# Vectorised comparison instead of a row-by-row df.apply(axis=1); same 0/1 values, far faster.
df['Fully_Recovery'] = df['GrossChargeOffAmount'].eq(0).astype(int)

In [None]:
# Shuffle the rows before slicing off the hold-out set.
# A seeded generator makes the train/test split reproducible across kernel restarts.
df = df.reindex(np.random.default_rng(42).permutation(df.index))

In [None]:
# Shape after preprocessing, encoding and shuffling.
df.shape

In [None]:
# Hold out the last 1000 shuffled rows as the test set; everything before them is training data.
df_train, df_test = df.iloc[:-1000, :], df.iloc[-1000:, :]

### Model 1: Direct loss regression

In [None]:
# Targets, and the amounts the targets are computed from, must not be used as features.
non_feature_cols = ['Fully_Recovery','GrossChargeOffAmount','loss_pcrt','GrossApproval']

X_train, Y_train = df_train.drop(columns=non_feature_cols), df_train['loss_pcrt']
X_test, Y_test = df_test.drop(columns=non_feature_cols), df_test['loss_pcrt']

In [None]:
# Hyper-parameter grid for the random forest (forest size fixed at 200 trees).
grid = { 
    'n_estimators': [200],
    'max_features': ['sqrt','log2'],
    'max_depth' : [7,8,10],
    'min_samples_leaf': [6,10]
}

# 5-fold grid search using the regressor's default score.
# random_state pins the forest's bootstrap/feature sampling so that the selected
# configuration and the reported error are reproducible between runs.
CV_rfr = GridSearchCV(estimator=RandomForestRegressor(random_state=42), 
                      param_grid=grid, 
                      cv= 5)
CV_rfr = CV_rfr.fit(X_train, Y_train)

In [None]:
# GridSearchCV (refit=True by default) has already refit the best configuration on
# all of X_train, so best_estimator_ is ready to use. Fitting it again only cost time
# and, for an unseeded forest, silently produced a different model.
regr = CV_rfr.best_estimator_

# Root-mean-squared error of the direct loss model on the hold-out rows.
prediction = regr.predict(X_test)
mse = mean_squared_error(Y_test, prediction)
mse**.5

### Model 2: Two-stage model (recovery classifier + loss regressor)

#### Recovery Probability Model 

In [None]:
# Features and binary target for the "fully recovered" classifier, taken from the training rows only.
target_related_cols = ['Fully_Recovery','GrossChargeOffAmount','loss_pcrt','GrossApproval']
X, Y = df_train.drop(columns=target_related_cols), df_train['Fully_Recovery']

# 80/20 train/validation split with a fixed seed.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Rebalance the classifier's training split: SMOTE first raises the minority class to
# 50% of the majority, then random under-sampling trims the majority until the
# minority/majority ratio is 0.9. Both samplers are seeded so the resampled
# training set, and everything fitted on it, is reproducible.
# NOTE(review): resampling happens before GridSearchCV, so synthetic SMOTE points can end
# up in the cross-validation folds used for scoring; consider moving the samplers into
# the estimator pipeline if unbiased CV scores are needed.
over = SMOTE(sampling_strategy=0.5, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.9, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_train, Y_train = pipeline.fit_resample(X_train, Y_train)

In [None]:
# Hyper-parameter grid for the random forest classifier (forest size fixed at 200 trees).
grid = { 
    'n_estimators': [200],
    'max_features': ['sqrt','log2'],
    'max_depth' : [7,8,10],
    'min_samples_leaf': [6,10]
}

# 5-fold grid search scored by ROC AUC.
# random_state pins the forest's randomness so the chosen configuration is reproducible.
CV_clf = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                      param_grid=grid, 
                      cv= 5,
                      scoring = 'roc_auc')
CV_clf = CV_clf.fit(X_train, Y_train)

In [None]:
# best_estimator_ has already been refit on the full (resampled) training split by
# GridSearchCV (refit=True is the default), so no additional fit call is required.
clf = CV_clf.best_estimator_

In [None]:
def calc_precision_recall(y_true, y_pred):
    """Compute precision and recall for the positive class (label 1).

    Parameters
    ----------
    y_true : pandas.Series or array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, one per element of ``y_true``. When ``y_true`` is a
        Series, ``y_pred`` is placed on ``y_true``'s index first (a Series
        ``y_pred`` is therefore aligned by label, as before).

    Returns
    -------
    tuple
        ``(precision, recall)``. A metric whose denominator is zero (no predicted
        positives for precision, no positives/false negatives for recall) is
        reported as 1.0.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of labels.
    """
    if isinstance(y_true, pd.Series):
        y_pred = pd.Series(y_pred, index=y_true.index)

    # Compare positionally on plain arrays: this avoids per-element label lookups,
    # which are slow and ambiguous when the index contains duplicate labels.
    truth = np.asarray(y_true)
    pred = np.asarray(y_pred)
    if truth.shape != pred.shape:
        raise ValueError("y_true and y_pred must have the same length")

    predicted_pos = pred == 1
    predicted_neg = pred == 0
    tp = int(np.sum(predicted_pos & (truth == 1)))
    fp = int(np.sum(predicted_pos & (truth != 1)))
    fn = int(np.sum(predicted_neg & (truth != 0)))

    # Explicit zero-denominator handling instead of a bare `except:` that could
    # hide unrelated errors.
    precision = tp / (tp + fp) if (tp + fp) else 1.0
    recall = tp / (tp + fn) if (tp + fn) else 1.0

    return precision, recall

In [None]:
# Sweep 100 evenly spaced probability cut-offs over the validation split and record
# precision/recall of the positive ("fully recovered") class at each cut-off.
y_test_probs = clf.predict_proba(X_valid)[:, 1]
probability_thresholds = np.linspace(0, 1, num=100)
precision_scores, recall_scores = [], []
for p in probability_thresholds:
    # Strictly greater than the cut-off counts as a positive prediction.
    y_test_preds = [1 if prob > p else 0 for prob in y_test_probs]
    precision, recall = calc_precision_recall(Y_valid, y_test_preds)
    precision_scores.append(precision)
    recall_scores.append(recall)

In [None]:
# Precision-recall curve of the classifier on the validation split.
fig, ax = plt.subplots(figsize=(6,6))
ax.plot(recall_scores, precision_scores, label='Random Forest Classifier')  # label typo fixed
# Baseline = share of positives in the validation split (precision of always predicting 1).
baseline = len(Y_valid[Y_valid==1]) / len(Y_valid)
ax.plot([0, 1], [baseline, baseline], linestyle='--', label='Baseline')
ax.set_xlabel('Recall',fontsize = 14)
ax.set_ylabel('Precision',fontsize=14)
ax.legend(loc='center left')
plt.title('Precision-recall Curve',fontsize=16)

In [None]:
# Area under the precision-recall curve built from the threshold sweep, rounded to 2 decimals.
print(round(auc(recall_scores, precision_scores),2))

In [None]:
# Confusion matrix on the validation split when a loan is only labelled "fully
# recovered" if its predicted probability is at least 0.9.
# (confusion_matrix is imported with the other dependencies in the first cell.)
prediction = clf.predict_proba(X_valid)[:,1]
y_pred = [0 if x < 0.9 else 1 for x in prediction]
confusion_matrix(Y_valid, y_pred)

In [None]:
# Refit the selected classifier on the full training frame (X, Y) before it is used on the test set.
# NOTE(review): this final fit uses the original, un-resampled X and Y, whereas the
# hyper-parameters and the validation metrics above came from the SMOTE/under-sampled
# training split. Confirm this difference is intended.
clf = clf.fit(X, Y)

#### Loss Model

In [None]:
# Features and continuous target (loss fraction) for the loss regressor, training rows only.
excluded_cols = ['Fully_Recovery','GrossChargeOffAmount','loss_pcrt','GrossApproval']
X, Y = df_train.drop(columns=excluded_cols), df_train['loss_pcrt']

# Same seeded 80/20 split as the classifier.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameter grid for the loss regressor (forest size fixed at 200 trees).
grid = { 
    'n_estimators': [200],
    'max_features': ['sqrt','log2'],
    'max_depth' : [7,8,10],
    'min_samples_leaf': [6,10]
}

# 5-fold grid search using the regressor's default score.
# random_state makes the selected configuration reproducible between runs.
CV_rfr = GridSearchCV(estimator=RandomForestRegressor(random_state=42), 
                      param_grid=grid, 
                      cv= 5)
CV_rfr = CV_rfr.fit(X_train, Y_train)

In [None]:
# best_estimator_ is already refit on X_train by GridSearchCV (refit=True default),
# so the extra fit call was redundant and has been removed.
regr = CV_rfr.best_estimator_

# Root-mean-squared error of the loss regressor on the validation split.
prediction = regr.predict(X_valid)
mse = mean_squared_error(Y_valid, prediction)
mse**.5

In [None]:
# Refit the selected regressor on all training rows (train + validation splits) before scoring the test set.
regr = regr.fit(X, Y)

### Combine 

In [None]:
# Hold-out features and true loss fraction used to score the combined two-stage model.
held_out_drop = ['Fully_Recovery','GrossChargeOffAmount','loss_pcrt','GrossApproval']
X_test, Y_test = df_test.drop(columns=held_out_drop), df_test['loss_pcrt']

In [None]:
def get_mean_squared_error(Y_loss_test, prediction_recovery,prediction_loss,threshold):
    """Root-mean-squared error of the combined two-stage prediction.

    A row keeps its regressed loss only when its predicted full-recovery
    probability is below ``threshold``; otherwise its predicted loss is zeroed.
    Despite the function's name, the value returned is the square root of the MSE.
    """
    # 1 where the loan is NOT predicted to recover fully, 0 where it is.
    keeps_loss = (np.asarray(prediction_recovery) < threshold).astype(int)
    combined_loss = keeps_loss * np.array(prediction_loss)
    return mean_squared_error(Y_loss_test, combined_loss) ** .5

In [None]:
# Score the two-stage model on the hold-out rows with a 0.5 recovery-probability cut-off.
prediction_loss = regr.predict(X_test)
prediction_recovery = clf.predict_proba(X_test)[:, 1]
get_mean_squared_error(
    Y_test,
    prediction_recovery,
    prediction_loss,
    0.5,
)

In [None]:
# Evaluate the combined model's RMSE at 100 evenly spaced recovery-probability cut-offs
# and keep the cut-off with the lowest error (the first one in case of ties).
# The scoring goes through get_mean_squared_error so the combination rule lives in one
# place, and argmin replaces the hand-rolled running minimum with its magic sentinel.
# NOTE(review): the cut-off is selected on the test rows themselves, so `mini` is an
# optimistic estimate; choose the threshold on validation data if an unbiased number is needed.
prediction_recovery = clf.predict_proba(X_test)[:,1]
prediction_loss = regr.predict(X_test)
probability_thresholds = np.linspace(0, 1, num=100)
rmse_scores = [
    get_mean_squared_error(Y_test, prediction_recovery, prediction_loss, p)
    for p in probability_thresholds
]
best_idx = int(np.argmin(rmse_scores))
mini = rmse_scores[best_idx]
best_p = probability_thresholds[best_idx]
mini

In [None]:
# Probability cut-off that produced the lowest combined RMSE in the sweep above.
best_p

In [None]:
# Combined-model RMSE as a function of the recovery-probability cut-off.
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(probability_thresholds, rmse_scores)
label_size = 14
ax.set_xlabel('thresholds', fontsize=label_size)
ax.set_ylabel('rmse', fontsize=label_size)
plt.title('RMSE Score by Threshold', fontsize=16)

### Save Model

In [None]:
# Persist the fitted classifier. The context manager guarantees the file is flushed
# and closed even if pickling fails (the bare open() call leaked the handle).
filename = 'classifier_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Persist the fitted loss regressor. The context manager guarantees the file is flushed
# and closed even if pickling fails (the bare open() call leaked the handle).
filename = 'regressor_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regr, model_file)