In [174]:
###Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from IPython.display import Image
from IPython.core.display import HTML
import datetime
import math
import scipy.optimize as optimize
import statistics
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy import stats
import scipy
import warnings
from scipy.stats import norm
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split, KFold
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
warnings.filterwarnings('ignore')
plt.style.use('ggplot')

In [3]:
### data loading 
df = pd.read_csv('./data/loan.csv', low_memory=False)

In [4]:
###changing data type to appropriate date type 
df['last_pymnt_d']=pd.to_datetime(df.last_pymnt_d)
df['issue_d']=pd.to_datetime(df.issue_d)

In [5]:
###fill missing last_payments(no payments) with issue date, may want a separete column for this treatment
###This is for calculation of IRRs later
df['last_pymnt_d'].fillna(df.issue_d,inplace=True)

In [6]:
###Some pre-processing for ease of calculations
df['issue_yr'] = df.issue_d.dt.year
df['issue_mo']= df.issue_d.dt.month
df['last_pymnt_yr'] = df.last_pymnt_d.dt.year
df['last_pymnt_mo']= df.last_pymnt_d.dt.month
df['mo_diff'] = pd.to_numeric((df['last_pymnt_yr'] - 
                          df['issue_yr'])*12 + df['last_pymnt_mo'] -df['issue_mo'])

In [7]:
###Flag for completed loans
searchfor = ['Fully Paid', 'Charged Off', 'Default']
defaults = ['Charged Off', 'Default']
df['loan_completion_flag']=  np.where(df['loan_status'].str.contains('|'.join(searchfor)) ,1, np.nan)
###Flag for fully paid loans
df['fully_paid'] = np.where(df['loan_status'].str.contains('Fully Paid') ,1, 
                                  np.where(df['loan_status'].str.contains('|'.join(defaults)) ,0,np.nan))


In [8]:
###Example output of above manipulation
df.head()[['loan_status','loan_completion_flag','fully_paid']]

Unnamed: 0,loan_status,loan_completion_flag,fully_paid
0,Fully Paid,1.0,1.0
1,Charged Off,1.0,0.0
2,Fully Paid,1.0,1.0
3,Fully Paid,1.0,1.0
4,Current,,


In [9]:
###Average payment = Total payment - recoveries - last payment amount over the life -1 month of the investment
df['avg_pymnt'] = (df['total_pymnt']-df['recoveries']-df['last_pymnt_amnt'])/(np.maximum((df['mo_diff']-1),0))
###Treating infinites that appear when there is no payment or only 1 payment 
df['avg_pymnt'] = (df['avg_pymnt']).replace(np.Inf,0)
df['avg_pymnt'] = (df['avg_pymnt']).replace(-np.Inf,0)

In [10]:
###IRR calculations
###Input: a row of a dataframe with lending data 
def irr_calc(x):  
    ##varible initialization
    initial_invest = -x['funded_amnt']
    avg_payment = x['avg_pymnt']
    num_payments = np.max(int(x['mo_diff'])-1,0)
    recovery = x['recoveries'] -x['collection_recovery_fee']
    recovery_duration = np.maximum(36 - num_payments + 1 + 12,12)
    avg_recovery = recovery/recovery_duration
    last_payment_amount = x['last_pymnt_amnt']
    ###IRR calculation, input: series of cash flows, total payment - recoveries
    ###evenly divided and spread across the life of the loan and finally recovery and chargeoff fees
    return ((np.irr([initial_invest]+[avg_payment]*num_payments + [last_payment_amount] +
                    [avg_recovery]*recovery_duration)+1)**12-1)

In [11]:
###Calculating at a row level, individual security IRRs. Method will be faulty for loans that didn't mature.
###Warning: the calculation takes a fair amount of time ~few minutes
df['irr']=df.apply(irr_calc, axis=1)

In [12]:
##NaNs returned from IRRs with 0 payments should be -100% return 
df['irr']=df['irr'].replace(np.NaN,-1)

In [124]:
####Filter down to completed loans and has at least 36 months of possible history
df_filtered = df[df['loan_status'].str.contains('|'.join(searchfor))].query("term == ' 36 months' and issue_yr <=2012").copy()

# Linear Regression

In [129]:
##making grade flags
grade_flags = pd.get_dummies(df_filtered.grade) 
home_flag = pd.get_dummies(df_filtered.home_ownership) 
df_filtered=pd.concat([df_filtered,grade_flags,home_flag], axis=1)

In [131]:
df_filtered_clean = df_filtered[np.isfinite(df_filtered['annual_inc'])].copy()

In [158]:
columns= ["int_rate","annual_inc","funded_amnt","A","B","C","D","E","F","G"]

In [159]:
df_features = df_filtered_clean[columns]

In [160]:
y= df_filtered_clean.irr

In [161]:
X_train, X_test, y_train, y_test = train_test_split(df_features, y, test_size=0.2)

In [162]:
X_train.head()

Unnamed: 0,int_rate,annual_inc,funded_amnt,A,B,C,D,E,F,G
201470,14.33,70000.0,10850.0,0,0,1,0,0,0,0
23366,12.98,13200.0,4500.0,0,0,1,0,0,0,0
198574,17.77,35000.0,9750.0,0,0,0,1,0,0,0
13557,14.82,52000.0,2500.0,0,0,1,0,0,0,0
40981,12.53,30000.0,9000.0,0,1,0,0,0,0,0


In [163]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)

In [164]:
predictions = lm.predict(X_test)

In [165]:
model.score(X_test, y_test)

0.0071486877941574489

# Regression GBM

In [170]:
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)

In [171]:
clf.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [172]:
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

MSE: 0.0472


In [None]:
model.score(X_test, y_test)

In [175]:
nn = MLPRegressor(
    hidden_layer_sizes=(10,),  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)


In [176]:
n = nn.fit(X_test, y_test)

In [177]:
mse = mean_squared_error(y_test, nn.predict(X_test))
print("MSE: %.4f" % mse)

MSE: 14.5887


In [178]:
mse = mean_squared_error(y_test, lm.predict(X_test))
print("MSE: %.4f" % mse)

MSE: 0.0475
