In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pylab as pl
import seaborn as sns
import statsmodels.api as sm

#%matplotlib inline
# Seaborn appearance: 'white' style with the font scaled to 0.9.
sns.set(font_scale=0.9, style='white')

# Directory that holds the cleaned CSV files, relative to the working directory.
data_directory = os.path.join('.', 'cleaned_data')

# Paths of the cleaned accepted-loan and rejected-loan CSV files.
cleaned_accepted_data_path, cleaned_rejected_data_path = [
    os.path.join(data_directory, csv_name)
    for csv_name in ('Cleaned_AcceptedLoanData.csv', 'Cleaned_RejectedLoanData.csv')
]

In [2]:
# Logistic regression to predict whether a loan will be made or not.

# Need to keep only the common columns: Amount Requested, Risk_Score, DTI, Employment Length
accepted_raw_df = pd.read_csv(cleaned_accepted_data_path, low_memory=False, encoding='UTF-8')
# info() writes its report itself and returns None, so wrapping it in print()
# only appends a stray "None" to the output.
accepted_raw_df.info()

# Keep the selected columns and flag every row as an accepted loan (accepted = 1).
# assign() returns a new frame, so the label column is never written onto a
# slice of the raw frame (the pattern behind pandas' SettingWithCopyWarning).
accepted_df = accepted_raw_df[['loan_amnt', 'mean_fico', 'dti', 'emp_length']].assign(accepted=1)
accepted_df.info()
accepted_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131893 entries, 0 to 131892
Data columns (total 34 columns):
Unnamed: 0                    131893 non-null int64
loan_amnt                     131893 non-null float64
term                          131893 non-null object
int_rate                      131893 non-null float64
emp_length                    131893 non-null float64
home_ownership                131893 non-null object
annual_inc                    131893 non-null float64
verification_status           131893 non-null object
issue_d                       131893 non-null object
purpose                       131893 non-null object
dti                           131893 non-null float64
delinq_2yrs                   131893 non-null float64
earliest_cr_line              131893 non-null int64
inq_last_6mths                131893 non-null float64
mths_since_last_delinq        131893 non-null float64
pub_rec                       131893 non-null float64
revol_bal                     1318

Unnamed: 0,loan_amnt,mean_fico,dti,emp_length,accepted
count,131893.0,131893.0,131893.0,131893.0,131893.0
mean,14770.019827,697.036977,18.242888,5.779071,1.0
std,8637.705924,30.746097,8.296326,3.724609,0.0
min,600.0,622.0,0.0,0.0,1.0
25%,8000.0,672.0,12.04,2.0,1.0
50%,13000.0,692.0,17.76,6.0,1.0
75%,20000.0,712.0,24.11,10.0,1.0
max,40000.0,847.5,46.52,10.0,1.0


In [3]:
# Amount Requested,Application Date,Loan Title,Risk_Score,Debt-To-Income Ratio,Zip Code,State,Employment Length,Policy Code,timestamp
rejected_raw_df = pd.read_csv(cleaned_rejected_data_path, low_memory=False, encoding='UTF-8')
# info() writes its report itself and returns None, so wrapping it in print()
# only appends a stray "None" to the output.
rejected_raw_df.info()

# Keep the selected columns (no renaming happens here; the names are used as
# read from the CSV) and flag every row as a rejected loan (accepted = 0).
# assign() returns a new frame, so the label column is never written onto a
# slice of the raw frame (the pattern behind pandas' SettingWithCopyWarning).
rejected_df = rejected_raw_df[['loan_amnt', 'mean_fico', 'dti', 'emp_length']].assign(accepted=0)

rejected_df.info()
rejected_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107939 entries, 0 to 1107938
Data columns (total 5 columns):
Unnamed: 0    1107939 non-null int64
loan_amnt     1107939 non-null float64
mean_fico     467802 non-null float64
dti           1107939 non-null float64
emp_length    1107939 non-null float64
dtypes: float64(4), int64(1)
memory usage: 42.3 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107939 entries, 0 to 1107938
Data columns (total 5 columns):
loan_amnt     1107939 non-null float64
mean_fico     467802 non-null float64
dti           1107939 non-null float64
emp_length    1107939 non-null float64
accepted      1107939 non-null int64
dtypes: float64(4), int64(1)
memory usage: 42.3 MB
None


Unnamed: 0,loan_amnt,mean_fico,dti,emp_length,accepted
count,1107939.0,467802.0,1107939.0,1107939.0,1107939.0
mean,13424.88,623.460058,195.6712,1.705757,0.0
std,16380.31,108.185247,9295.243,1.883706,0.0
min,0.0,0.0,-1.0,0.0,0.0
25%,4500.0,590.0,6.1,1.0,0.0
50%,10000.0,640.0,18.02,1.0,0.0
75%,20000.0,678.0,33.6,1.0,0.0
max,1400000.0,990.0,7130330.0,10.0,0.0


In [4]:
# Many of the rejected loans have no credit score; record those as a score of 0.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# a column selection is a chained in-place write, which newer pandas versions
# warn about and which does not update the frame under copy-on-write.
# NOTE(review): 0 is outside the range of real scores, so the model sees
# "missing" as "extremely low" -- confirm this is the intended encoding.
rejected_df['mean_fico'] = rejected_df['mean_fico'].fillna(0.0)

# info() prints its report itself and returns None, so it is not wrapped in print().
rejected_df.info()
rejected_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107939 entries, 0 to 1107938
Data columns (total 5 columns):
loan_amnt     1107939 non-null float64
mean_fico     1107939 non-null float64
dti           1107939 non-null float64
emp_length    1107939 non-null float64
accepted      1107939 non-null int64
dtypes: float64(4), int64(1)
memory usage: 42.3 MB
None


Unnamed: 0,loan_amnt,mean_fico,dti,emp_length,accepted
count,1107939.0,1107939.0,1107939.0,1107939.0,1107939.0
mean,13424.88,263.2418,195.6712,1.705757,0.0
std,16380.31,315.8581,9295.243,1.883706,0.0
min,0.0,0.0,-1.0,0.0,0.0
25%,4500.0,0.0,6.1,1.0,0.0
50%,10000.0,0.0,18.02,1.0,0.0
75%,20000.0,624.0,33.6,1.0,0.0
max,1400000.0,990.0,7130330.0,10.0,0.0


In [7]:
# Randomly split the accepted and rejected loan data in half: one half of each
# is used for training, the other half for testing.

# Fixed seed so that every shuffle below, and therefore the split, is repeatable.
SPLIT_SEED = 42


def _split_in_half(df, seed):
    """Shuffle the rows of ``df`` and return ``(first_half, second_half)``.

    The first half holds ``int(0.5 * len(df))`` rows and the second half holds
    the rest, so an odd row count leaves the extra row in the second half.
    Plain ``iloc`` slicing is used so the split does not rely on ``np.split``
    accepting a DataFrame.
    """
    shuffled_df = df.sample(frac=1, random_state=seed)
    midpoint = int(0.5 * len(df))
    return shuffled_df.iloc[:midpoint], shuffled_df.iloc[midpoint:]


print(accepted_df.shape)
print(rejected_df.shape)

# info() prints its report itself and returns None, so it is not wrapped in print().
accepted_df.info()
rejected_df.info()

accepted_train_df, accepted_test_df = _split_in_half(accepted_df, SPLIT_SEED)
print("accepted_train_df:")
print(accepted_train_df.shape)
print("accepted_test_df:")
print(accepted_test_df.shape)

rejected_train_df, rejected_test_df = _split_in_half(rejected_df, SPLIT_SEED)
print("rejected_train_df:")
print(rejected_train_df.shape)
print("rejected_test_df:")
print(rejected_test_df.shape)

# Create the training dataframe and shuffle the rows so accepted and rejected
# loans are interleaved.
train_df = pd.concat([accepted_train_df, rejected_train_df])
train_df = train_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
# Written after the shuffle so train.csv matches train_df row for row, the same
# way test.csv matches test_df below.
train_df.to_csv("train.csv", index=False)
print("train_df:")
print(train_df.shape)

# Create the test dataframe and shuffle the rows.
test_df = pd.concat([accepted_test_df, rejected_test_df])
test_df = test_df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
test_df.to_csv("test.csv", index=False)
print("test_df:")
print(test_df.shape)

(131893, 5)
(1107939, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131893 entries, 0 to 131892
Data columns (total 5 columns):
loan_amnt     131893 non-null float64
mean_fico     131893 non-null float64
dti           131893 non-null float64
emp_length    131893 non-null float64
accepted      131893 non-null int64
dtypes: float64(4), int64(1)
memory usage: 5.0 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107939 entries, 0 to 1107938
Data columns (total 5 columns):
loan_amnt     1107939 non-null float64
mean_fico     1107939 non-null float64
dti           1107939 non-null float64
emp_length    1107939 non-null float64
accepted      1107939 non-null int64
dtypes: float64(4), int64(1)
memory usage: 42.3 MB
None
accepted_train_df:
(65946, 5)
accepted_test_df:
(65947, 5)
rejected_train_df:
(553969, 5)
rejected_test_df:
(553970, 5)
train_df:
(619915, 5)
test_df:
(619917, 5)


In [8]:
# Manually add the intercept: a constant column of 1.0 on both frames.
for frame in (train_df, test_df):
    frame['intercept'] = 1.0

# Predictors passed to the model, with the intercept column last.
predictor_columns = ['loan_amnt', 'mean_fico', 'dti', 'emp_length', 'intercept']

# Fit 'accepted' against the predictors on the training half.
logit_model = sm.Logit(train_df['accepted'], train_df[predictor_columns])
result = logit_model.fit()

print(result.summary())

  return 1/(1+np.exp(-X))


Optimization terminated successfully.
         Current function value: 0.156435
         Iterations 14
                           Logit Regression Results                           
Dep. Variable:               accepted   No. Observations:               619915
Model:                          Logit   Df Residuals:                   619910
Method:                           MLE   Df Model:                            4
Date:                Thu, 13 Apr 2017   Pseudo R-squ.:                  0.5384
Time:                        13:28:49   Log-Likelihood:                -96976.
converged:                       True   LL-Null:                   -2.1008e+05
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
loan_amnt  -1.495e-05   6.12e-07    -24.425      0.000     -1.62e-05 -1.38e-05
mean_fico      0.0187      0

In [9]:
# Model output for every row of the test set.
test_predictions = result.predict(test_df[['loan_amnt', 'mean_fico', 'dti', 'emp_length', 'intercept']])

# np.asarray() makes the construction independent of whether predict() returned
# an ndarray or a pandas Series; the values are keyed by column name and placed
# on test_df's index so the label column below lines up row for row.
test_predictions_df = pd.DataFrame({'predicted': np.asarray(test_predictions)}, index=test_df.index)
# Single brackets select the label column as a Series rather than a one-column frame.
test_predictions_df['actual'] = test_df['accepted']
# Rounding to 0 decimals maps each prediction to 0.0 or 1.0.
test_predictions_df['mapped'] = test_predictions_df['predicted'].round(0)
test_predictions_df.head(20)

  return 1/(1+np.exp(-X))


Unnamed: 0,predicted,actual,mapped
0,6.691107000000001e-106,0,0.0
1,5.318409e-07,0,0.0
2,0.07121803,0,0.0
3,9.383251e-07,0,0.0
4,0.03170916,0,0.0
5,1.728196e-06,0,0.0
6,4.67866e-07,0,0.0
7,8.741394e-07,0,0.0
8,0.7511548,1,1.0
9,0.127407,0,0.0


In [10]:
# Try and find the threshold value for determining whether a result is 'accepted' or 'declined'.
def find_threshold(predictions_df=None, thresholds=None):
    """Count the incorrect predictions produced by each candidate threshold.

    Parameters
    ----------
    predictions_df : pandas.DataFrame, optional
        Frame with a 'predicted' column (model output) and an 'actual' column
        (true label).  When omitted, the notebook-level ``test_predictions_df``
        is used, which keeps a bare ``find_threshold()`` call working.
    thresholds : iterable of float, optional
        Candidate cut-offs to try.  When omitted, 0.30, 0.31, ..., 0.49.

    Returns
    -------
    list of dict
        One ``{'threshold': t, 'incorrect': n}`` entry per threshold, in the
        order tried.  ``n`` is the sum of ``abs(label - actual)`` over all
        rows, where ``label`` is 1 when ``predicted > t`` and 0 otherwise.
        It is returned as an int, which assumes integer 0/1 labels.
    """
    if predictions_df is None:
        predictions_df = test_predictions_df
    if thresholds is None:
        thresholds = [round(t / 100, 2) for t in range(30, 50)]

    # Pull the columns out once; they do not change between thresholds.
    predicted = predictions_df['predicted']
    actual = predictions_df['actual']

    result_list = []
    for threshold in thresholds:
        print("testing with a threshold of " + str(threshold))
        # One vectorised pass per threshold instead of a Python-level
        # iterrows() loop over every row.
        labels = (predicted > threshold).astype(int)
        wrong = int((labels - actual).abs().sum())
        result_list.append({'threshold': threshold, 'incorrect': wrong})

    return result_list
    
# Uncomment to re-run the threshold search.
#result_list = find_threshold()

# Cut-off applied to the model output.
# NOTE(review): presumably chosen from a find_threshold() search -- confirm.
threshold = 0.47

# 1 where the prediction is above the cut-off, 0 otherwise.
test_predictions_df['mapped'] = (test_predictions_df['predicted'] > threshold).astype(int)

print(test_predictions_df.head(10))

# Count the incorrect predictions in one vectorised pass instead of a
# Python-level iterrows() loop; the result is an integer count.
wrong = int((test_predictions_df['mapped'] - test_predictions_df['actual']).abs().sum())

print(wrong)
lr_accuracy = (test_predictions_df.shape[0] - wrong) / test_predictions_df.shape[0]
print("Logistic regression accuracy: " + str(lr_accuracy))

       predicted  actual  mapped
0  6.691107e-106       0       0
1   5.318409e-07       0       0
2   7.121803e-02       0       0
3   9.383251e-07       0       0
4   3.170916e-02       0       0
5   1.728196e-06       0       0
6   4.678660e-07       0       0
7   8.741394e-07       0       0
8   7.511548e-01       1       1
9   1.274070e-01       0       0
44078.0
Logistic regression accuracy: 0.928896932977


In [None]:
# Row count used to turn the incorrect-prediction counts below into accuracies.
# NOTE(review): this differs from the test_df row count printed earlier in the
# notebook (619917); presumably the counts below come from an earlier run with
# a different split -- confirm before relying on these figures.
TOTAL_TEST_ROWS = 620062


def _accuracy_from_incorrect(incorrect_counts, total=TOTAL_TEST_ROWS):
    """Return the accuracy, rounded to 4 decimals, for each incorrect count."""
    return [round((total - count) / total, 4) for count in incorrect_counts]


# Recorded incorrect-prediction counts.
# NOTE(review): presumably one per threshold 0.1, 0.2, ..., 0.9 -- confirm.
incorrect = [89555, 53530, 46164, 44274, 44314, 45012, 46256, 52652, 63028]
accuracy = _accuracy_from_incorrect(incorrect)
print(accuracy)

# Recorded incorrect-prediction counts for a finer sweep.
# NOTE(review): presumably one per threshold 0.30, 0.31, ..., 0.49 -- confirm.
incorrect = [46164, 45759, 45436, 45217, 44958, 44723, 44558, 44494, 44405, 44299, 44274,
             44250, 44182, 44214, 44186, 44210, 44211, 44171, 44217, 44236]
accuracy = _accuracy_from_incorrect(incorrect)
print(accuracy)

In [None]:
# Try a model where FICO is replaced by a boolean: has_fico
