In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import pickle
sns.set()
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

Importing Packages and Dataset

In [None]:
# Show every column when a DataFrame is displayed (None lifts pandas' column limit).
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw loan export and print three quick sanity checks.
dfloan=pd.read_csv('LoanExport.csv')
print(dfloan.isnull().sum().sum())  # total number of missing cells over all columns
print(dfloan.shape)  # (rows, columns)
print(dfloan.columns)  # column names

24994
(291451, 28)
Index(['CreditScore', 'FirstPaymentDate', 'FirstTimeHomebuyer', 'MaturityDate',
       'MSA', 'MIP', 'Units', 'Occupancy', 'OCLTV', 'DTI', 'OrigUPB', 'LTV',
       'OrigInterestRate', 'Channel', 'PPM', 'ProductType', 'PropertyState',
       'PropertyType', 'PostalCode', 'LoanSeqNum', 'LoanPurpose',
       'OrigLoanTerm', 'NumBorrowers', 'SellerName', 'ServicerName',
       'EverDelinquent', 'MonthsDelinquent', 'MonthsInRepayment'],
      dtype='object')


1. EDA & Feature Engineering

a. Creating Target Variables

In [None]:
# Parse FirstPaymentDate and MaturityDate from their YYYYMM encoding into
# pandas datetimes (no day is given in the format, so each date lands on the 1st).
dfloan['FirstPaymentDate'] = pd.to_datetime(dfloan['FirstPaymentDate'], format='%Y%m')
dfloan['MaturityDate'] = pd.to_datetime(dfloan['MaturityDate'], format='%Y%m')

b. Creating New Columns for Target

In [None]:
# Bucket CreditScore, LTV and MonthsInRepayment into labelled ranges and derive LoanTenure.

count=(dfloan['CreditScore'] == 0).sum()
print('count of zeros in the column of CreditScore:', count)
print('Mean of the CreditScore :', (dfloan['CreditScore'].mean()))
# Replace the placeholder score 0 with 708 (the mean printed above, truncated;
# note that this mean is computed while the zeros are still in the column).
dfloan['CreditScore'] = dfloan['CreditScore'].replace(0,708)

# Create 'CreditRange'. pd.cut bins are right-inclusive by default:
# (0, 650] poor, (650, 700] fair, (700, 750] good, (750, 900] excellent.
dfloan['CreditRange'] = pd.cut(dfloan['CreditScore'], bins = [ 0, 650, 700, 750, 900 ], labels = ['poor', 'fair', 'good', 'excellent'])

# Create 'LTVRange': (0, 25] low, (25, 50] medium, (50, 100] high.
# An LTV of exactly 0, or above 100, falls outside every bin and becomes NaN.
dfloan['LTVRange'] = pd.cut(dfloan['LTV'], bins = [ 0, 25, 50, 100 ], labels = ['low', 'medium', 'high'])

# Create 'RepayRange': MonthsInRepayment in 48-month (4-year) buckets.
# Values of 0, or above 240 months, fall outside the bins and become NaN.
dfloan['RepayRange'] = pd.cut(dfloan['MonthsInRepayment'], bins = [ 0, 48, 96, 144, 192, 240 ], labels = ['0-4yrs', '4-8yrs', '8-12yrs', '12-16yrs', '16-20yrs'])
# Create 'LoanTenure': MaturityDate minus FirstPaymentDate in days,
# converted to years using a 365-day year.
dfloan['LoanTenure'] = (dfloan['MaturityDate'] - dfloan['FirstPaymentDate']).dt.days
dfloan = dfloan.astype({"LoanTenure": float})
dfloan['LoanTenure'] = dfloan['LoanTenure'].div(365)

count of zeros in the column of CreditScore: 1189
Mean of the CreditScore : 708.9369911237224


c. Calculating MonthlyEMI, TotalLoanAmount, TotalLoanInterest

In [None]:
# calculating monthly EMI
def emi(p, r, t):
    """Return the fixed monthly instalment (EMI) of an amortising loan.

    Parameters
    ----------
    p : principal (amount borrowed).
    r : annual interest rate in percent (e.g. 6.75 means 6.75 %).
    t : loan term in months.

    Returns
    -------
    ``p*i*(1+i)**t / ((1+i)**t - 1)`` with the monthly rate ``i = r/1200``.
    For a zero interest rate that formula degenerates to 0/0, so the
    principal is simply spread evenly over the term (``p / t``).
    """
    # Convert the annual percentage rate into a monthly fractional rate.
    monthly_rate = r / (12 * 100)
    if monthly_rate == 0:
        # Interest-free loan: avoid the 0/0 of the annuity formula.
        return p / t
    growth = (1 + monthly_rate) ** t
    return (p * monthly_rate) * growth / (growth - 1)

# Monthly EMI per loan, computed row by row from principal, annual rate (%) and term (months).
# (The column name 'Monthy_EMI' is misspelled but is referenced throughout the notebook.)
dfloan['Monthy_EMI'] = dfloan.apply(lambda row: emi(row['OrigUPB'],row['OrigInterestRate'],row['OrigLoanTerm']),axis=1)

# Total accrued amount (principal + interest) = EMI * number of instalments, rounded.
dfloan['Total_Loan_Amt'] = round(dfloan.Monthy_EMI * dfloan.OrigLoanTerm)

# Total interest payable = total repaid minus the original principal.
dfloan['Total_loan_Int'] = dfloan.Total_Loan_Amt - dfloan.OrigUPB

d. Calculating Monthly and Annual income

In [None]:
# Back out the monthly income from the DTI ratio: income = EMI / (DTI / 100).
# NOTE(review): rows with DTI == 0 (they exist - the prepayment step filters them out)
# divide by zero here, so their income is inf/NaN rather than a real figure.
dfloan['monthly_income'] = round(dfloan.Monthy_EMI / (dfloan.DTI/100))

# Annual income is twelve times the derived monthly income.
dfloan['Annual_income'] = round(dfloan.monthly_income * 12)

e. Calculating prepayment Amount

In [None]:
# Expected prepayment amount: work only on the loans whose DTI is non-zero.
dfloan_dti=dfloan[dfloan["DTI"]!=0]

# defining DTI ratio for prepayment
def dti(ratio,income):
  """Return the part of `income` counted towards prepayment.

  Borrowers with a DTI ratio below 40 contribute half of `income`;
  everyone else contributes three quarters of it.
  """
  return income/2 if ratio < 40 else income*0.75

# Share of 24 months of income (50 % or 75 %, chosen by dti()) for the non-zero-DTI loans.
# The result is assigned by index, so loans missing from dfloan_dti get NaN.
dfloan['Prepayment_amt'] = dfloan_dti.apply(lambda row: dti(row['DTI'], row['monthly_income']*24), axis=1)
# Subtract the 24 EMIs paid over the same period and round.
dfloan['Prepayment_amt'] = round(dfloan['Prepayment_amt']-(dfloan['Monthy_EMI'])*24)

f. Calculating Change in Payment

In [None]:
# Monthly fractional interest rate (annual percentage / 12 / 100), used below
# to calculate the current unpaid balance.
dfloan['mon_int_rate'] = dfloan['OrigInterestRate']/(12*100)

def ubp(mon_int_rate,loanamt,emi,month):
  """Return the unpaid balance left after `month` instalments of `emi`.

  Starting from `loanamt`, each instalment first covers that month's
  interest (balance * mon_int_rate); the remainder reduces the balance.
  """
  balance = loanamt
  for _ in range(month):
    # Principal repaid this month = EMI minus the interest accrued on the balance.
    balance -= emi - balance * mon_int_rate
  return balance

# Current unpaid balance: amortise the original principal over the months already repaid.
dfloan['current_UBP'] = round(dfloan.apply(lambda row: ubp(row['mon_int_rate'],row['OrigUPB'],row['Monthy_EMI'],row['MonthsInRepayment']),axis=1))

# Principal left once the expected prepayment is applied (NaN wherever Prepayment_amt is NaN).
dfloan["new_principal"]= round(dfloan["current_UBP"]-dfloan["Prepayment_amt"])
# Print the column means (pandas skips NaN when averaging).
print(dfloan['Prepayment_amt'].mean())
print(dfloan['new_principal'].mean())

18744.219502658456
97844.52504796186


In [None]:
# Impute the missing values with each column's own mean. The mean is computed
# from the data instead of being typed in by hand, so the fill value cannot
# drift from the actual column mean when the input data changes.
dfloan['Prepayment_amt'] = dfloan['Prepayment_amt'].fillna(dfloan['Prepayment_amt'].mean())
dfloan['new_principal'] = dfloan['new_principal'].fillna(dfloan['new_principal'].mean())

g. Calculating Tenure Before Prepayment and After Prepayment

In [None]:
import cmath
from math import log

def tenure(mon_int_rate,amount,emi):
  """Return the number of monthly instalments needed to repay `amount`.

  Parameters
  ----------
  mon_int_rate : monthly interest rate as a fraction (e.g. 0.005).
  amount       : outstanding principal.
  emi          : fixed monthly instalment.

  Returns
  -------
  The (fractional) number of months
  ``(log(emi) - log(emi - amount*rate)) / log(1 + rate)``, or ``None``
  when the loan can never be repaid with this instalment (the instalment
  does not exceed the monthly interest, or is not positive).
  With a zero rate the formula would divide by ``log(1) == 0``, so the
  tenure is simply ``amount / emi``.
  """
  if mon_int_rate == 0:
    # Interest-free: every instalment repays principal only.
    return amount / emi if emi > 0 else None
  try:
    return (log(emi) - log(emi-(amount*mon_int_rate))) / (log(1+mon_int_rate))
  except ValueError:
    # log() of a non-positive number: the instalment cannot amortise the loan.
    return None

# Remaining tenure (in months, rounded) needed to repay the post-prepayment principal with the same EMI.
dfloan['pres_tenure']= round(dfloan.apply(lambda row: tenure(row['mon_int_rate'], row['new_principal'], row['Monthy_EMI']),axis=1))

# Loans for which tenure() returned None (NaN here) get a remaining tenure of 0.
dfloan['pres_tenure'] = dfloan['pres_tenure'].fillna(0)
# Convert the 'pres_tenure' column from float to integer.
dfloan['pres_tenure'] = dfloan['pres_tenure'].astype(int)

# New total tenure = remaining months after prepayment + months already repaid.
dfloan['new_tenure']=dfloan['pres_tenure']+dfloan['MonthsInRepayment']

# Defining a function to create new interest with change in tenure
def new_int1(mon_int_rate,amount,emi,month):
  """Return the total interest charged over the first `month` instalments.

  The balance starts at `amount`; every month the interest on the current
  balance is added to the total and the rest of `emi` repays principal.
  """
  total_interest = 0
  balance = amount
  for _ in range(month):
    charge = mon_int_rate*balance
    total_interest += charge
    balance -= emi - charge
  return total_interest
def new_int2(mon_int_rate,amount,emi,month):
  """Return the total interest charged over `month - 1` instalments.

  `month` is truncated to an integer first. Apart from running one
  instalment fewer, the amortisation is the same as in new_int1: interest
  on the current balance is accumulated and the rest of `emi` repays principal.
  """
  instalments = int(month) - 1
  total_interest = 0
  balance = amount
  for _ in range(instalments):
    charge = mon_int_rate*balance
    total_interest += charge
    balance -= emi - charge
  return total_interest

# Interest paid before the prepayment (original principal, months already repaid) and
# after it (post-prepayment principal, remaining tenure minus one instalment - see new_int2).
dfloan['int_B_pre']= round(dfloan.apply(lambda row: new_int1(row['mon_int_rate'], row['OrigUPB'], row['Monthy_EMI'],row['MonthsInRepayment']),axis=1))
dfloan['int_A_pre']= round(dfloan.apply(lambda row: new_int2(row['mon_int_rate'], row['new_principal'],row['Monthy_EMI'], row['pres_tenure']),axis=1))

# Total interest over the whole loan when the prepayment is made.
dfloan['new_tot_int']=dfloan['int_B_pre']+dfloan['int_A_pre']


h. Calculating ROI and PROI

In [None]:
# ROI: the originally scheduled interest as a percentage of the total amount repaid.
dfloan['ROI']= (dfloan['Total_loan_Int']/ dfloan['Total_Loan_Amt'])*100

# Interest lost through prepayment: scheduled interest minus the interest with prepayment.
dfloan["loss_amt"]=dfloan['Total_loan_Int']-dfloan['new_tot_int']

# Preferred ROI: the post-prepayment interest plus half of the lost interest,
# as a percentage of that same amount plus the original principal.
dfloan['Prefered_ROI']=((dfloan['new_tot_int']+(dfloan['loss_amt']/2))/(dfloan['new_tot_int']+(dfloan['loss_amt']/2)+dfloan['OrigUPB']))*100

2. X , y split

In [None]:
# Drop the raw columns (dates, identifiers, categorical loan attributes, LoanTenure,
# MonthsDelinquent) that are not used from here on.
dfloan = dfloan.drop(['FirstPaymentDate','FirstTimeHomebuyer','LoanPurpose','MaturityDate','MIP','Units','LoanTenure','MSA','NumBorrowers','Occupancy','Channel','PPM','OCLTV','PropertyState','PropertyType','PostalCode','ProductType','LoanSeqNum','SellerName','ServicerName','MonthsDelinquent'],axis = 1)

In [None]:
# Preliminary feature / target split; this X is what the correlation check further
# down receives. X and y are rebuilt from df_loan before the train/test split.
X=dfloan.drop(['EverDelinquent','Prefered_ROI','Prepayment_amt'],axis=1)
y=dfloan[['EverDelinquent','Prefered_ROI','Prepayment_amt']]

# Replace every float64 column of X by the integer codes returned by factorize().
for colname in X.select_dtypes('float64'):
    X[colname], _ = X[colname].factorize()
# Boolean mask of the int64 columns.
# NOTE(review): this mask is overwritten in the feature-selection cell and is not used before that.
discrete_features = X.dtypes == np.int64

In [None]:
# Label-encode the three range columns in place. The same encoder object is re-fitted
# for each column, so it only remembers the classes of the last one (LTVRange).
# LabelEncoder numbers classes in sorted label order, so the codes are alphabetical
# (e.g. 'good' -> 2 in CreditRange), not ordered from low to high.
le=LabelEncoder()
dfloan['CreditRange'] = le.fit_transform(dfloan['CreditRange'])
dfloan['RepayRange'] = le.fit_transform(dfloan['RepayRange'])
dfloan['LTVRange'] = le.fit_transform(dfloan['LTVRange'])

3. Feature Selection

In [None]:
# Mutual-information (MI) score of every column against the EverDelinquent flag.
from sklearn.feature_selection import mutual_info_classif

xx=dfloan.drop(['EverDelinquent'],axis=1)
yy=dfloan['EverDelinquent']

# Replace every float64 column by the integer codes returned by factorize().
for colname in xx.select_dtypes('float64'):
    xx[colname], _ = xx[colname].factorize()
# Boolean mask of the int64 columns.
# NOTE(review): the mask is computed but never passed to mutual_info_classif, so the
# estimator's default discrete_features='auto' applies - confirm that is intended.
discrete_features = xx.dtypes == np.int64

# The estimator adds random noise to continuous features; a fixed seed (the same one
# used for the train/test split) keeps the ranking reproducible between runs.
mi_scores = mutual_info_classif(xx, yy, random_state=56)
mi_scores = pd.Series(mi_scores,name="MI Scores", index=xx.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

In [None]:
# A function to select highly correlated features.
def Correlation(dataset, threshold):
    """Return the names of columns highly correlated with an earlier column.

    Parameters
    ----------
    dataset   : DataFrame whose columns are compared pairwise.
    threshold : a column is reported when the absolute correlation with any
                column positioned before it is strictly greater than this.

    Returns
    -------
    set of column names. For each correlated pair only the later column is
    reported, so dropping the returned columns keeps one of every pair.

    Only numeric and boolean columns take part: recent pandas versions raise
    in ``DataFrame.corr()`` when a column cannot be converted to a number,
    whereas older versions silently skipped such columns.
    """
    numeric = dataset.select_dtypes(include=['number', 'bool'])
    abs_corr = numeric.corr().abs()
    names = abs_corr.columns
    correlated_features = set()
    for i in range(len(names)):
        for j in range(i):
            # NaN correlations compare as False and are therefore never reported.
            if abs_corr.iloc[i, j] > threshold:
                correlated_features.add(names[i])
                break  # one hit is enough for this column
    return correlated_features

# Columns of X whose absolute correlation with an earlier column exceeds 0.8.
# NOTE(review): the float columns of X were replaced by factorize() codes above, so
# these correlations are computed on the codes, not on the original values - confirm.
Correlation(X,0.8)

{'Annual_income', 'ROI', 'Total_Loan_Amt', 'Total_loan_Int', 'mon_int_rate'}

In [None]:
# Droping these features from dataset
df_loan = dfloan.drop(['mon_int_rate','monthly_income','current_UBP','LTV','LTVRange', 'RepayRange','new_principal', 'pres_tenure','int_B_pre','Annual_income',
       'int_A_pre','loss_amt','new_tot_int'],axis=1)

In [None]:
# Columns remaining in the modelling frame.
df_loan.columns

Index(['CreditScore', 'DTI', 'OrigUPB', 'OrigInterestRate', 'OrigLoanTerm',
       'EverDelinquent', 'MonthsInRepayment', 'CreditRange', 'Monthy_EMI',
       'Total_Loan_Amt', 'Total_loan_Int', 'Prepayment_amt', 'new_tenure',
       'ROI', 'Prefered_ROI'],
      dtype='object')

In [None]:
# Preview the first two rows of the modelling frame.
df_loan.head(2)

Unnamed: 0,CreditScore,DTI,OrigUPB,OrigInterestRate,OrigLoanTerm,EverDelinquent,MonthsInRepayment,CreditRange,Monthy_EMI,Total_Loan_Amt,Total_loan_Int,Prepayment_amt,new_tenure,ROI,Prefered_ROI
0,708,27,117000,6.75,360,0,52,2,758.859773,273190.0,156190.0,15519.0,271,57.172664,52.666552
1,708,17,109000,6.5,360,0,144,2,688.954146,248023.0,139023.0,32101.0,250,56.052463,51.815751


4. Train, Test Split

In [None]:
# Split df_loan into features (X) and the three targets (y), then into train / test sets.
X=df_loan.drop(['EverDelinquent','Prefered_ROI','Prepayment_amt'],axis=1)
y=df_loan[['EverDelinquent','Prefered_ROI','Prepayment_amt']]

# 80 / 20 split with a fixed seed so the partition is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2, random_state=56)

# Classification target: the first column of y (EverDelinquent).
y_class_train = y_train.iloc[:,0]
y_class_test = y_test.iloc[:,0]

# Regression targets: the remaining columns of y (Prefered_ROI and Prepayment_amt).
y_reg_test = y_test.iloc[:,1:]
y_reg_train = y_train.iloc[:,1:]

In [None]:
# Feature columns that go into the models.
X_train.columns

Index(['CreditScore', 'DTI', 'OrigUPB', 'OrigInterestRate', 'OrigLoanTerm',
       'MonthsInRepayment', 'CreditRange', 'Monthy_EMI', 'Total_Loan_Amt',
       'Total_loan_Int', 'new_tenure', 'ROI'],
      dtype='object')

5. Data Preprocessing

In [None]:
# Create the resampling and scaling objects.
# NOTE(review): neither `smote` nor `imbPipeline` is referenced by the pipelines in the
# cells shown, so no oversampling is actually applied there - confirm whether that is intended.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline
smote= SMOTE()
# The same MinMaxScaler object is used as the 'scaler' step of both pipelines below.
scaler = MinMaxScaler()

6. Classification Pipeline

In [None]:
# Random-forest classification pipeline for the EverDelinquent target.
# The pipeline owns a dedicated MinMaxScaler instead of the module-level `scaler`
# object (a Pipeline fits the very object it is given, so a scaler shared with
# another pipeline would be silently re-fitted by it). A fixed random_state - the
# same seed as the train/test split - makes the forest reproducible between runs.
classifier_Pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('randomforest', RandomForestClassifier(class_weight="balanced", max_depth=9, random_state=56))
     ])


In [None]:
# Fit scaler + random forest on the training features and the delinquency labels.
classifier_Pipe.fit(X_train, y_class_train)

In [None]:
# Predict the delinquency flag for the held-out set and report accuracy as a percentage.
pred_class = classifier_Pipe.predict(X_test)
print('test accuracy = ', round(accuracy_score(y_class_test, pred_class)*100, 2), '%')

test accuracy =  69.94 %


In [None]:
# Print accuracy, confusion matrix (rows = true class, columns = predicted class)
# and the per-class precision / recall / F1 report for the test set.
print("Classification Report:")

print("\nAccuracy score:\n", round(accuracy_score(y_class_test, pred_class)*100,2), '%')
print('*'*40)
print("\nConfusion Matrix:\n", confusion_matrix(y_class_test, pred_class))
print('*'*40)
print("\nClassification Report:\n", classification_report(y_class_test, pred_class))

Classification Report:

Accuracy score:
 69.94 %
****************************************

Confusion Matrix:
 [[33177 13571]
 [ 3952  7591]]
****************************************

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.71      0.79     46748
           1       0.36      0.66      0.46     11543

    accuracy                           0.70     58291
   macro avg       0.63      0.68      0.63     58291
weighted avg       0.79      0.70      0.73     58291



7. Regression Pipeline

In [None]:
from sklearn.linear_model import LinearRegression
# Linear-regression pipeline for the two regression targets.
# The pipeline owns a dedicated MinMaxScaler instead of the module-level `scaler`
# object: a Pipeline fits the very object it is given, so an instance shared with
# the classification pipeline would be re-fitted underneath it.
Regression_pipe = Pipeline([
     ('scaler', MinMaxScaler()),
     ('Regressor',LinearRegression())
    ])

In [None]:
from sklearn.metrics import mean_squared_error

# Fit the pipeline (scaler + linear regression) on both regression targets at once.
Regression_pipe.fit(X_train, y_reg_train)

# Predict both targets for the held-out set.
# (The name 'pred_test_lass' is historical; the model is LinearRegression.)
pred_test_lass = Regression_pipe.predict(X_test)

# NOTE(review): dead line left over from a Lasso experiment - `Lasso` and `pred_train_lass` are not defined here.
#print('R squared training set', round(Lasso.score(y_reg_train,pred_train_lass)*100, 2))
# With two target columns r2_score returns the uniform average of the per-target R² (its default).
print('R squared test set', round(r2_score(y_reg_test,pred_test_lass)*100, 2))

R squared test set 67.63


In [None]:
# RMSE over both regression targets.
# NOTE(review): the two targets have very different scales (a percentage and an
# amount), so this pooled RMSE is dominated by the larger one - confirm that is wanted.
print(np.sqrt(mean_squared_error(y_reg_test,pred_test_lass)))
# R² as a percentage rounded to two decimals. The rounding digit belongs inside
# round(); passed to print() it was just echoed as a stray "2" after the score.
print(round(r2_score(y_reg_test, pred_test_lass)*100, 2))

12733.57781385281
67.6295473276995 2


8. Saving Model

In [None]:
pip install mgzip

In [None]:
import mgzip
# Pickle the fitted classification pipeline through mgzip's file object.
# NOTE(review): the target is an absolute path on one specific Windows machine; the
# cell fails wherever that directory does not exist - consider a relative/configurable path.
with mgzip.open(r'C:\Users\lenovo\prepaymentrisk\classifierPipe', 'wb') as f:
    pickle.dump(classifier_Pipe, f)

In [None]:
# Pickle the fitted regression pipeline. The context manager guarantees the file
# is flushed and closed, even if pickling raises, instead of leaking the handle.
with open('Regression_pipe.pkl', 'wb') as f:
    pickle.dump(Regression_pipe, f)