# Importing required Python packages 


In [1]:
import matplotlib.pylab as plt
import numpy as np
from scipy import sparse
from sklearn.datasets import make_classification, make_blobs, load_boston
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit, train_test_split,learning_curve, KFold, cross_val_score, GridSearchCV
from sklearn import metrics, utils, preprocessing
#from sklearn.model_selection.learning_curve import learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from pprint import pprint
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import urllib
import seaborn
import statsmodels.api as sm
%matplotlib inline
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

# Reading the dataset

In [2]:
loan = pd.read_csv('loan.csv', low_memory=False)
loanbk=loan.copy() 
loan.describe()

bad_loan = ["Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off", "In Grace Period", 
            "Late (16-30 days)", "Late (31-120 days)"]

# Defining a new column loan_condition to qualify good/ bad loans
loan['loan_condition'] = np.nan


def loan_condition(status):
    if status in bad_loan:
        return 'Bad Loan'
    else:
        return 'Good Loan'
    
#calling function using data frame  
loan['loan_condition'] = loan['loan_status'].apply(loan_condition)

loan = loan[loan['loan_condition']=='Bad Loan']


# Data Cleansing, Pre-Processing and Feature Engineering

In [3]:
total_num = loan.isnull().sum().sort_values(ascending=False)

perc = loan.isnull().sum()/loan.isnull().count() *100
perc1 = (round(perc,2).sort_values(ascending=False))

# Creating a data frame:
df_miss = pd.concat([total_num, perc1], axis =1 , keys =["Total Missing Values", "Percentage %"]).sort_values(by ="Percentage %", ascending = False)
top_mis = df_miss[df_miss["Percentage %"]>0]
top_mis.reset_index(inplace=True)

loan.drop(top_mis[top_mis["Percentage %"]>75]["index"], axis = 1, inplace=True)
loan.drop(["next_pymnt_d","mths_since_last_delinq"], axis = 1, inplace=True)
loan.drop(["last_pymnt_d", "last_credit_pull_d", "earliest_cr_line", "issue_d", "addr_state"], axis =1 , inplace = True)
loan["tot_cur_bal"] = loan["tot_cur_bal"].fillna(loan["tot_cur_bal"].mean())

loan["tot_coll_amt"] = loan["tot_coll_amt"].fillna(0)

loan["emp_length"]=loan["emp_length"].fillna(0)

loan["revol_util"]= loan["revol_util"].fillna(loan["revol_util"].mean())

loan["collections_12_mths_ex_med"]= loan["collections_12_mths_ex_med"].fillna(loan["collections_12_mths_ex_med"].mean())

loan["acc_now_delinq"]=loan["acc_now_delinq"].fillna(loan["acc_now_delinq"].mode())

loan["delinq_2yrs"]=loan["delinq_2yrs"].fillna(0)

loan["inq_last_6mths"]= loan["inq_last_6mths"].fillna(0)

loan["pub_rec"]= loan["pub_rec"].fillna(0)

loan["acc_now_delinq"]= loan["acc_now_delinq"].fillna(0)

loan["open_acc"]= loan["open_acc"].fillna(loan["open_acc"].mean())

loan["total_rev_hi_lim"] = loan["total_rev_hi_lim"].fillna(loan["total_rev_hi_lim"].mean())

loan["total_acc"]=loan["total_acc"].fillna(loan["total_acc"].mean())

loan["annual_inc"] = loan["annual_inc"].fillna(loan["annual_inc"].mean())

#OK let's take a look at the columns and see if there are any we can drop any before we get started.
loan.columns.values

#There're plenty that don't seem very relevant. Let's drop them.
loan=loan.drop(['member_id', 'grade', 'sub_grade', 'emp_title',
          'pymnt_plan', 'url', 'title', 'initial_list_status',
           'last_pymnt_amnt','policy_code', 'emp_length','zip_code','desc', 'loan_condition'], axis=1)

loan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67429 entries, 1 to 887371
Data columns (total 35 columns):
id                            67429 non-null int64
loan_amnt                     67429 non-null float64
funded_amnt                   67429 non-null float64
funded_amnt_inv               67429 non-null float64
term                          67429 non-null object
int_rate                      67429 non-null float64
installment                   67429 non-null float64
home_ownership                67429 non-null object
annual_inc                    67429 non-null float64
verification_status           67429 non-null object
loan_status                   67429 non-null object
purpose                       67429 non-null object
dti                           67429 non-null float64
delinq_2yrs                   67429 non-null float64
inq_last_6mths                67429 non-null float64
open_acc                      67429 non-null float64
pub_rec                       67429 non-null floa

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [4]:
# Get rid of non-numeric values throughout the DataFrame:
for col in loan.columns.values:
    loan[col] = loan[col].replace('[^0-9]+.-', '', regex=True)
loan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67429 entries, 1 to 887371
Data columns (total 35 columns):
id                            67429 non-null int64
loan_amnt                     67429 non-null float64
funded_amnt                   67429 non-null float64
funded_amnt_inv               67429 non-null float64
term                          67429 non-null object
int_rate                      67429 non-null float64
installment                   67429 non-null float64
home_ownership                67429 non-null object
annual_inc                    67429 non-null float64
verification_status           67429 non-null object
loan_status                   67429 non-null object
purpose                       67429 non-null object
dti                           67429 non-null float64
delinq_2yrs                   67429 non-null float64
inq_last_6mths                67429 non-null float64
open_acc                      67429 non-null float64
pub_rec                       67429 non-null floa

In [5]:
#Remove "months" from the loan period
loan.term=loan.term.astype(str).str.split(' ',2).str[1]

loan['application_type'] = loan['application_type'].map({'INDIVIDUAL': 0, 'JOINT': 1})

In [6]:
#Let's change the Income Verified column, which currently has textual labels to numeric.
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
loan.verification_status = le.fit_transform(loan.verification_status.values)
loan.home_ownership=le.fit_transform(loan.home_ownership.values)
loan.loan_status=le.fit_transform(loan.loan_status.values)
loan.purpose=le.fit_transform(loan.purpose.values)

#Finally let's be sure we convert all fields to numeric
loan.apply(pd.to_numeric, errors = 'coerce')
loan.head(2)
loan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67429 entries, 1 to 887371
Data columns (total 35 columns):
id                            67429 non-null int64
loan_amnt                     67429 non-null float64
funded_amnt                   67429 non-null float64
funded_amnt_inv               67429 non-null float64
term                          67429 non-null object
int_rate                      67429 non-null float64
installment                   67429 non-null float64
home_ownership                67429 non-null int32
annual_inc                    67429 non-null float64
verification_status           67429 non-null int32
loan_status                   67429 non-null int32
purpose                       67429 non-null int32
dti                           67429 non-null float64
delinq_2yrs                   67429 non-null float64
inq_last_6mths                67429 non-null float64
open_acc                      67429 non-null float64
pub_rec                       67429 non-null float64


# Finding correlations

In [7]:
X = loan.drop("int_rate",1)   #Feature Matrix
y = loan["int_rate"]          #Target Variable
loan.head()

cor = loan.corr()

#Correlation with output variable
cor_target = abs(cor["int_rate"])

#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.5]
print(relevant_features)

int_rate    1.0
Name: int_rate, dtype: float64


# Filtering through p-value

In [8]:
#Adding constant column of ones, mandatory for sm.OLS model
X_1 = sm.add_constant(X)

#Fitting sm.OLS model
model = sm.OLS(y,X_1.astype(float)).fit()


p = model.pvalues

In [9]:
#Backward Elimination
cols = list(X.columns)
pmax = 1
while (len(cols)>0):
    p= []
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y,X_1.astype(float)).fit()
    p = pd.Series(model.pvalues.values[1:],index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05):
        cols.remove(feature_with_p_max)
    else:
        break

selected_features_BE = cols
print(selected_features_BE)

['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'installment', 'home_ownership', 'annual_inc', 'verification_status', 'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']


In [10]:
print(len(selected_features_BE))

loanbk = loan.copy()
for column in loanbk:
    if(column not in selected_features_BE):
        if(column not in 'int_rate'):
            loan=loan.drop([column], axis=1)

print(loan.info())

31
<class 'pandas.core.frame.DataFrame'>
Int64Index: 67429 entries, 1 to 887371
Data columns (total 32 columns):
id                            67429 non-null int64
loan_amnt                     67429 non-null float64
funded_amnt                   67429 non-null float64
funded_amnt_inv               67429 non-null float64
term                          67429 non-null object
int_rate                      67429 non-null float64
installment                   67429 non-null float64
home_ownership                67429 non-null int32
annual_inc                    67429 non-null float64
verification_status           67429 non-null int32
purpose                       67429 non-null int32
dti                           67429 non-null float64
delinq_2yrs                   67429 non-null float64
inq_last_6mths                67429 non-null float64
open_acc                      67429 non-null float64
pub_rec                       67429 non-null float64
revol_bal                     67429 non-null flo

# Splitting data into train dataset and validate dataset 

In [11]:
#OK great, let's now get our X and y. We know that interest rate is y.
#Pandas is fantastic, all you need to do is use .values to get the data in numpy format
y = loan["int_rate"]          #Target Variable

#Let's remove y from the df so we can get X
X = loan.drop("int_rate",1)   #Feature Matrix

#Now, the train test split
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Linear Regression Model

In [12]:

#Alright let's now fit a linear regression model to the training set
linr=LinearRegression().fit(X_train, y_train)
predictions_regression_model = linr.predict(X_test)

#Alright let's get the parameters we've learned
print("Coefficients (theta_1..theta_n)" ) 
print (linr.coef_)
print()
print ("Y Intercept(theta0)")
print (linr.intercept_)
print()

#Let's also spit out the R-squared values for Train and Test
print ("R-squared for Train: %.2f" %linr.score(X_train, y_train))
print ("R-squared for Test: %.2f" %linr.score(X_test, y_test))

#There we have it, the R-squared value on the test set is about 64%, which is not great but understandable considering
#the data must be much more sophisticated than a straight line. The only other thing we can do with this regressor is
#to normalize the data before training (value - mean /std) so all values are in the same range from 0 to 1. 
#Let's try this in the next step

Coefficients (theta_1..theta_n)
[ 9.47011721e-09 -1.80064414e-04 -1.24662871e-03  1.28832994e-04
  3.27729551e-01  4.18168090e-02  7.88827782e-02 -1.36456398e-06
  2.15567758e-01  1.60299682e-01  2.49902263e-02  1.38133077e-01
  3.70841527e-01  4.50305095e-02  1.56501964e-01  6.06628537e-06
  1.96759802e-02 -2.62057199e-02  3.94109003e-03 -3.95953148e-03
  9.56396586e-04 -4.67733859e-05 -1.37671788e-03 -3.22952999e-03
 -7.94560585e-04 -5.03808834e-04  4.67034232e-01  6.61508913e-01
  2.82539035e-05 -1.35052453e-06 -1.22428685e-05]

Y Intercept(theta0)
-1.1007685337479653

R-squared for Train: 0.65
R-squared for Test: 0.65


In [13]:
#Let's now fit a linear regression model with normalize=True to the training set
linr=LinearRegression(normalize=True).fit(X_train, y_train)

#Alright let's get the parameters we've learned

print("Coefficients (theta_1..theta_n)" ) 
print (linr.coef_)
print()
print ("Y Intercept(theta0)")
print (linr.intercept_)
print()

#Let's also spit out the R-squared values for Train and Test

print ("R-squared for Train: %.2f" %linr.score(X_train, y_train))
print ("R-squared for Test: %.2f" %linr.score(X_test, y_test))
#There we have it, the R-squared value on the test set is about 71%, actually a bit less than what we managed before.

#I will explore more sophisticated regressors in a seperate post.

Coefficients (theta_1..theta_n)
[ 9.47011721e-09 -1.80064414e-04 -1.24662871e-03  1.28832994e-04
  3.27729551e-01  4.18168090e-02  7.88827782e-02 -1.36456398e-06
  2.15567758e-01  1.60299682e-01  2.49902263e-02  1.38133077e-01
  3.70841527e-01  4.50305095e-02  1.56501964e-01  6.06628537e-06
  1.96759802e-02 -2.62057199e-02  3.94109003e-03 -3.95953148e-03
  9.56396586e-04 -4.67733859e-05 -1.37671788e-03 -3.22952999e-03
 -7.94560585e-04 -5.03808834e-04  4.67034232e-01  6.61508913e-01
  2.82539035e-05 -1.35052453e-06 -1.22428685e-05]

Y Intercept(theta0)
-1.1007685337480009

R-squared for Train: 0.65
R-squared for Test: 0.65


# Random Forest Model

In [14]:
from sklearn.ensemble import RandomForestRegressor


regr = RandomForestRegressor(max_depth=2, random_state=0,
                              n_estimators=100)
regr.fit(X_train, y_train)
predictions_rf_model = regr.predict(X_test)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)
print(regr.feature_importances_)

print(predictions_rf_model)

[0.         0.         0.         0.         0.79706907 0.0988875
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.10404343]
[17.71065982 17.71065982 13.47923073 ... 15.26143107 13.47923073
 15.26143107]


In [15]:
print(regr.score(X_train,y_train))
print(regr.score(X_test,y_test))

0.2642395326968977
0.24943463576526015


# MAPE (Mean Absolute Percentage Error)

In [16]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [17]:
mape_regr = mean_absolute_percentage_error(predictions_regression_model, y_test)
print("MAPE of Regression Model",mape_regr)

mape_rf = mean_absolute_percentage_error(predictions_rf_model, y_test)
print("MAPE of Random Forest",mape_rf)

MAPE of Regression Model 13.392537891962132
MAPE of Random Forest 19.264766225789618


#  5-fold cross validation

In [18]:
#Linear Regresssion
x_lr = loan.drop("int_rate",1)
y_lr = loan["int_rate"] 

loans_full_linear = cross_val_score(LinearRegression(), x_lr, y_lr,  cv = KFold(5))
print("Accuracy of all 5 Linear Regression models:", loans_full_linear)
print("Accuracy of Linear Regression: %.2f"%(np.mean(loans_full_linear)*100), "%")


#Random Forest
regr = RandomForestRegressor(max_depth=2, random_state=0,n_estimators=10)
loans_full_rf = cross_val_score(regr, x_lr, y_lr,  cv = KFold(5))
print("Accuracy of all 5 Random Forest models:", loans_full_rf)
print("Accuracy of Random Forest: %.2f"%(np.mean(loans_full_rf)*100), "%")

Accuracy of all 5 Linear Regression models: [0.34114449 0.63965562 0.67435791 0.66025695 0.54421775]
Accuracy of Linear Regression: 57.19 %
Accuracy of all 5 Random Forest models: [0.1852108  0.2594334  0.26181715 0.23357963 0.22598916]
Accuracy of Random Forest: 23.32 %


#  Hyper-parameter optimization - Regression

In [19]:
#Regression
from sklearn.linear_model import Lasso, Ridge, ElasticNet

lm_lasso = Lasso()
lm_ridge =Ridge()
lm_elasticnet =ElasticNet()

linr_lasso=lm_lasso.fit(X_train, y_train)
print ("R-squared for Train: %.2f" %linr_lasso.score(X_train, y_train))
print ("R-squared for Test: %.2f" %linr_lasso.score(X_test, y_test))

linr_ridge=lm_ridge.fit(X_train, y_train)
print ("R-squared for Train: %.2f" %linr_ridge.score(X_train, y_train))
print ("R-squared for Test: %.2f" %linr_ridge.score(X_test, y_test))

linr_elasticnet=lm_elasticnet.fit(X_train, y_train)
print ("R-squared for Train: %.2f" %linr_elasticnet.score(X_train, y_train))
print ("R-squared for Test: %.2f" %linr_elasticnet.score(X_test, y_test))

predict_test_lasso = linr_lasso.predict(X_test)
predict_test_ridge = linr_ridge.predict(X_test)
predict_test_elasticnet = linr_elasticnet.predict(X_test)

print("Lasso Regression Mean Square Error for TEST Data is: ")
print(np.round(metrics.mean_squared_error(y_test, predict_test_lasso), 2))
print("Ridge Regression Mean Square Error for TEST Data is: ")
print(np.round(metrics.mean_squared_error(y_test, predict_test_ridge), 2))
print("Elastic Net Regression Mean Square Error for TEST Data is: ")
print(np.round(metrics.mean_squared_error(y_test, predict_test_elasticnet), 2))



R-squared for Train: 0.63
R-squared for Test: 0.62


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.994333e-17
  overwrite_a=True).T


R-squared for Train: 0.65
R-squared for Test: 0.65




R-squared for Train: 0.64
R-squared for Test: 0.63
Lasso Regression Mean Square Error for TEST Data is: 
7.18
Ridge Regression Mean Square Error for TEST Data is: 
6.63
Elastic Net Regression Mean Square Error for TEST Data is: 
7.02


#  Hyper-parameter optimization - Random Forest

In [None]:
rfc=RandomForestRegressor(random_state=42)
param_grid = { 
    'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8]
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)

lab_enc = preprocessing.LabelEncoder()
encoded = lab_enc.fit_transform(y_train)
print(utils.multiclass.type_of_target(encoded))

CV_rfc.fit(X_train, encoded)
CV_rfc.best_params_


multiclass


In [None]:
CV_rfc.fit(X_train, encoded)
CV_rfc.best_params_