# Linear regression for predicting interest rates

This notebook uses all the accepted loans, i.e. no segmenting/clustering.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn import linear_model
from sklearn import model_selection
from sklearn import preprocessing

data_directory = os.path.join('.', 'cleaned_data') 

cleaned_accepted_data_path = os.path.join(data_directory, 'Cleaned_AcceptedLoanData.csv')

In [10]:
accepted_df = pd.read_csv(cleaned_accepted_data_path, low_memory=False, encoding='UTF-8')

# Temporarily sample until we have an idea how long this will take.
#accepted_df = accepted_df.sample(n=100000, random_state=4321, replace=False, axis=0)
#print(accepted_df.info())

# Remove non-numeric columns and those we don't care about.
accepted_df.drop(['Unnamed: 0', 'purpose', 'issue_d', 'fico_range', 'last_fico_range', 'timestamp'], 
                 axis=1, inplace=True)

# Replace the string 'term' with a number.
accepted_df['term'].replace(to_replace='[^0-9]+', value='', inplace=True, regex=True)
accepted_df['term'] = accepted_df['term'].astype(int)

# Split home ownership values into separate variables:
# home_mortgage, home_rent and home_own.
accepted_df['home_mortgage'] = 0
accepted_df['home_rent'] = 0
accepted_df['home_own'] = 0

accepted_df.loc[accepted_df['home_ownership']=='MORTGAGE', 'home_mortgage'] = 1
accepted_df.loc[accepted_df['home_ownership']=='RENT', 'home_rent'] = 1
accepted_df.loc[accepted_df['home_ownership']=='OWN', 'home_own'] = 1

# Drop home_ownership now.
accepted_df.drop(['home_ownership'], axis=1, inplace=True)

# Split application_type into separate variables: individual (0/1)
accepted_df['individual'] = 0
accepted_df.loc[accepted_df['application_type']=='INDIVIDUAL', 'individual'] = 1

# Drop application_type now.
accepted_df.drop(['application_type'], axis=1, inplace=True)

# Turn verification status into a numeric column.
accepted_df['income_verified'] = 1
accepted_df.loc[accepted_df['verification_status']=='Not Verified', 'income_verified'] = 0

# Drop verification status now.
accepted_df.drop(['verification_status', 'verification_status_joint'], axis=1, inplace=True)

# Drop the current credit score, since it wasn't available at the time the interest rate was set.
accepted_df.drop(['last_mean_fico'], axis=1, inplace=True)

print(accepted_df.info())
accepted_df.describe()
accepted_df.to_csv("accepted.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131893 entries, 0 to 131892
Data columns (total 28 columns):
loan_amnt                     131893 non-null float64
term                          131893 non-null int32
int_rate                      131893 non-null float64
emp_length                    131893 non-null float64
annual_inc                    131893 non-null float64
dti                           131893 non-null float64
delinq_2yrs                   131893 non-null float64
earliest_cr_line              131893 non-null int64
inq_last_6mths                131893 non-null float64
mths_since_last_delinq        131893 non-null float64
pub_rec                       131893 non-null float64
revol_bal                     131893 non-null float64
revol_util                    131893 non-null float64
total_acc                     131893 non-null float64
collections_12_mths_ex_med    131893 non-null float64
acc_open_past_24mths          131893 non-null float64
avg_cur_bal                  

In [3]:
# Split the loan data: use half for training and half for testing.
accepted_df_y = accepted_df[['int_rate']]
accepted_df_x = accepted_df.drop(['int_rate'], axis=1)

train_df_x, test_df_x, train_df_y, test_df_y = model_selection.train_test_split(
    accepted_df_x, accepted_df_y, test_size=0.5, random_state=42)

print("train_df_x:")
print(train_df_x.shape)

print("train_df_y:")
print(train_df_y.shape)

print("test_df_x:")
print(test_df_x.shape)

print("test_df_y:")
print(test_df_y.shape)

# Scale the x values.
train_df_x = preprocessing.scale(train_df_x)
test_df_x = preprocessing.scale(test_df_x)

train_df_x:
(65946, 27)
train_df_y:
(65946, 1)
test_df_x:
(65947, 27)
test_df_y:
(65947, 1)


In [9]:
# Create linear regression object.
lm = linear_model.LinearRegression()

# Train the model.
lm.fit(train_df_x, train_df_y)

# Print the coefficients.
print('Coefficients: \n', lm.coef_)

# Calculate the mean squared error.
print("Mean squared error: %.2f" % np.mean((lm.predict(test_df_x) - test_df_y) ** 2))
print("Root mean squared error: %.2f" % (np.mean((lm.predict(test_df_x) - test_df_y) ** 2))**0.5)

# Print the explained variance score (1 is perfect prediction).
print('Variance score: %.2f' % lm.score(test_df_x, test_df_y))

print("Mean Absolute Percent Error: %.2f" % (np.mean(np.abs((lm.predict(test_df_x) - test_df_y) / lm.predict(test_df_x))) * 100))

Coefficients: 
 [[ 0.46728469  1.70537976 -0.02228413 -0.19746662  0.52047471  0.08683277
  -0.19135679  0.83562216 -0.09196297 -0.05546849 -0.24220846  0.3096084
  -0.70188735 -0.00500134  0.81432747 -0.08992683 -0.40736942 -0.02532263
   0.01828071  0.01658771  0.03755726 -1.20824699 -0.21832669  0.00521397
   0.02148635 -0.02096725  0.39450338]]
Mean squared error: 11.62
Root mean squared error: 3.41
Variance score: 0.45
Mean Absolute Percent Error: 21.43
