In [88]:
import requests
import numpy as np
import pandas as pd
import pandas_profiling
import json
import os
import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from scipy.stats import norm
from datetime import datetime

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Remove pandas' cap on the number of columns displayed, so wide frames print in full.
pd.options.display.max_columns = None

import statsmodels.api as sm # Provides cross-sectional models and methods
from statsmodels.graphics.api import abline_plot # Plots line given intercept and slope
from sklearn.metrics import mean_squared_error, r2_score # Rates how close regression line is to data
from sklearn.model_selection import train_test_split #  Splits data into training and test groups
from sklearn import linear_model, preprocessing # Ordinary least square regression and standardizes the data
import warnings # For handling error messages.

In [89]:
# Load the raw GJGNY residential loan export into `df`.
# The data root defaults to the original author's machine path; set the
# CAPSTONE2_DATA_DIR environment variable (to an absolute path) to run the
# notebook on another machine without editing code.
DATA_ROOT = os.environ.get(
    'CAPSTONE2_DATA_DIR',
    r'C:\Users\2bogu\OneDrive\Desktop\Sringboard_Materials\capstone2\data',
)
RAW_DIR = os.path.join(DATA_ROOT, 'external')
RAW_CSV_NAME = 'Green_Jobs_-_Green_New_York__GJGNY__Residential_Loan_Portfolio___Beginning_November_2010.csv'

# The working directory is still switched to the raw-data folder, in case
# other cells rely on paths relative to it.
os.chdir(RAW_DIR)

df = pd.read_csv(RAW_CSV_NAME)

# Initial Cleaning

In [90]:
# Show every column label of the freshly loaded frame.
df.columns

Index(['Reporting Period', 'CONTRACT DATE', 'SETUP DATE',
       'ORIGINAL LOAN AMOUNT', 'ORIGINAL TERM', 'ORIGINAL MATURITY DATE',
       'CURRENT MATURITY DATE', 'INTEREST RATE', 'CREDIT SCORE CURRENT HIGH',
       'CREDIT SCORE', 'CREDIT SCORE NAME 2', 'DEBT TO INCOME',
       'FIRST PAYMENT DATE', 'LAST PAYMENT DATE', 'LAST PAYMENT AMOUNT',
       'NEXT PAYMENT DUE DATE', 'PAYMENT AMOUNT', 'CURRENT BALANCE',
       'UNAPPLIED CASH', 'DELINQUENT AMOUNT', 'TOTAL INTEREST PAID',
       'TOTAL LATE CHARGE PAID', 'PAYMENTS REMAINING', 'DAYS DELINQUENT',
       'PROPERTY COUNTY', 'UTILITY', 'INSTALLATION CITY', 'INSTALLATION ZIP',
       'Contractor Name', 'SUCCESSOR NUMBER', 'ACCOUNT CODE',
       'ACCOUNT CODE DATE', 'CANCEL REASON', 'TYPE OF BANKRUPTCY',
       'Months Since Origination', 'Payments Made', 'Purpose', 'Loan Type',
       'Underwriting', 'Pledged', 'Georeference'],
      dtype='object')

In [91]:
# Remove the columns that are not carried forward into the cleaned data set.
# NOTE(review): most of these look like post-origination payment/status fields
# or location/contractor identifiers -- confirm that is the selection rule.
unused_columns = [
    'LAST PAYMENT AMOUNT',
    'LAST PAYMENT DATE',
    'FIRST PAYMENT DATE',
    'DELINQUENT AMOUNT',
    'DAYS DELINQUENT',
    'Reporting Period',
    'CREDIT SCORE CURRENT HIGH',
    'NEXT PAYMENT DUE DATE',
    'PAYMENTS REMAINING',
    'PROPERTY COUNTY',
    'UTILITY',
    'INSTALLATION CITY',
    'INSTALLATION ZIP',
    'Contractor Name',
    'Georeference',
    'UNAPPLIED CASH',
    'TOTAL LATE CHARGE PAID',
    'CURRENT BALANCE',
    'TOTAL INTEREST PAID',
    'Payments Made',
]
df = df.drop(columns=unused_columns)

In [92]:
# Summarise dtypes and non-null counts of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27385 entries, 0 to 27384
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CONTRACT DATE             27385 non-null  object 
 1   SETUP DATE                27385 non-null  object 
 2   ORIGINAL LOAN AMOUNT      27385 non-null  float64
 3   ORIGINAL TERM             27385 non-null  int64  
 4   ORIGINAL MATURITY DATE    27385 non-null  object 
 5   CURRENT MATURITY DATE     27385 non-null  object 
 6   INTEREST RATE             27385 non-null  float64
 7   CREDIT SCORE              26863 non-null  float64
 8   CREDIT SCORE NAME 2       12149 non-null  float64
 9   DEBT TO INCOME            26938 non-null  float64
 10  PAYMENT AMOUNT            27385 non-null  float64
 11  SUCCESSOR NUMBER          770 non-null    object 
 12  ACCOUNT CODE              27385 non-null  object 
 13  ACCOUNT CODE DATE         27385 non-null  object 
 14  CANCEL

In [93]:
# Replace missing values in these three text columns with the literal 'NONE'.
# Later cells compare against that sentinel (e.g. SUCCESSOR NUMBER == 'NONE').
# The result is assigned back to `df`: calling fillna(..., inplace=True) on a
# column selection mutates a temporary Series, which pandas warns about and
# which leaves `df` unchanged once copy-on-write is enabled.
df = df.fillna({
    'CANCEL REASON': 'NONE',
    'TYPE OF BANKRUPTCY': 'NONE',
    'SUCCESSOR NUMBER': 'NONE',
})


In [94]:
# indicates if loan has a cosigner: 1 when a second credit score is present, else 0
df['co-signed'] = df['CREDIT SCORE NAME 2'].notna().astype(int)

# Row-wise mean of the two credit scores. NaNs are skipped, so a loan with a
# single score keeps that score and the result is NaN only when both are missing.
avg_score = df[['CREDIT SCORE', 'CREDIT SCORE NAME 2']].mean(axis=1)

# Fill the remaining gaps with the mean of the per-loan averages. The filled
# Series is assigned directly instead of using fillna(..., inplace=True) on a
# column selection, which is unreliable under pandas copy-on-write.
df['avg_credit_score'] = avg_score.fillna(avg_score.mean())

# The two raw score columns are replaced by the features above.
df = df.drop(columns=['CREDIT SCORE', 'CREDIT SCORE NAME 2'])

In [95]:
# Count how many loans carry each ACCOUNT CODE value.
df['ACCOUNT CODE'].value_counts()

SurePay/Automatic Debit Payment              12061
Check/Money Order Payment (with coupon)       9612
Paid in Full                                  4903
Legal Category                                 368
Notice of Bankruptcy (written)                 186
Canceled                                       160
Soft Hold                                       66
On Hold                                         16
Management Hold                                  5
Notice of Bankruptcy (verbal)                    4
Collections Terminated (by customer)             2
Dispute of Debt                                  1
SurePay/Automatic Debit Payment (on hold)        1
Name: ACCOUNT CODE, dtype: int64

In [96]:
# FILTERING FOR FINISHED LOANS
# Keep rows whose ACCOUNT CODE contains one of the keywords below. Matching is
# case-sensitive, so 'Hold' does not match the lowercase '(on hold)' code.
# .copy() makes the result an independent frame, so the next cell can add
# columns to it without triggering pandas' SettingWithCopyWarning.
finished_mask = df['ACCOUNT CODE'].str.contains('Hold|Canceled|Terminated|Bankruptcy|Full', regex=True)
df = df.loc[finished_mask].copy()

In [97]:
# creates dependent var: bad = 1 when ACCOUNT CODE contains one of the
# keywords below, otherwise 0.
# na=False makes a missing ACCOUNT CODE count as not bad (0).
is_bad = df['ACCOUNT CODE'].str.contains('Hold|Canceled|Terminated|Bankruptcy', regex=True, na=False)

# The boolean mask is cast in one vectorised step. assign() returns a new
# frame, so this is safe even when `df` is a filtered slice of another frame.
df = df.assign(bad=is_bad.astype('int64'))

In [98]:
# Summary statistics of the numeric columns for the finished loans.
df.describe()

Unnamed: 0,ORIGINAL LOAN AMOUNT,ORIGINAL TERM,INTEREST RATE,DEBT TO INCOME,PAYMENT AMOUNT,Months Since Origination,co-signed,avg_credit_score,bad
count,5342.0,5342.0,5342.0,5216.0,5342.0,5342.0,5342.0,5342.0,5342.0
mean,10076.636887,150.875515,3.769296,0.297223,91.328793,65.619993,0.415013,750.423501,0.082179
std,6337.052334,47.812924,0.902968,0.525935,59.209793,23.999321,0.49277,48.068659,0.274663
min,61.0,26.0,2.99,0.0,11.0,2.0,0.0,527.0,0.0
25%,5020.25,120.0,3.49,0.16,48.0,49.0,0.0,724.0,0.0
50%,8386.0,180.0,3.49,0.28,78.0,63.0,0.0,763.5,0.0
75%,13431.5,180.0,3.49,0.4,123.0,85.0,1.0,788.0,0.0
max,25000.0,180.0,8.49,36.09,455.0,114.0,1.0,833.0,1.0


In [99]:
# removing outlier: keep only loans whose DEBT TO INCOME is below 30
# NOTE(review): rows with a missing DEBT TO INCOME also fail this comparison
# and are dropped here -- confirm that is intended.
dti_below_cutoff = df['DEBT TO INCOME'] < 30
df = df.loc[dti_below_cutoff]

In [100]:
# Re-check the summary statistics after the DEBT TO INCOME filter.
df.describe()

Unnamed: 0,ORIGINAL LOAN AMOUNT,ORIGINAL TERM,INTEREST RATE,DEBT TO INCOME,PAYMENT AMOUNT,Months Since Origination,co-signed,avg_credit_score,bad
count,5215.0,5215.0,5215.0,5215.0,5215.0,5215.0,5215.0,5215.0,5215.0
mean,10156.118696,150.841035,3.779453,0.290359,91.858688,65.46767,0.42512,750.800824,0.07977
std,6315.746301,48.268816,0.910801,0.175798,59.469249,24.126659,0.494409,48.586646,0.270963
min,1472.0,60.0,2.99,0.0,11.0,2.0,0.0,527.0,0.0
25%,5076.5,120.0,3.49,0.16,48.0,49.0,0.0,723.0,0.0
50%,8510.0,180.0,3.49,0.28,79.0,63.0,0.0,765.0,0.0
75%,13535.5,180.0,3.49,0.4,124.0,85.0,1.0,788.5,0.0
max,25000.0,180.0,8.49,1.0,455.0,114.0,1.0,833.0,1.0


In [101]:
# Number of loans flagged bad at this point (the flag is 0/1, so the sum is a count).
df['bad'].sum()

416

In [102]:
# Tally the CANCEL REASON values among the remaining loans.
df['CANCEL REASON'].value_counts()

NONE                        5077
DEATH                         91
SETTLEMENT                    14
CANCEL DUE TO BANKRUPTCY      13
CUSTOMER DEFAULTED            12
INVOLUNTARY REPOSESSION        7
VOLUNTARY REPOSESSION          1
Name: CANCEL REASON, dtype: int64

In [103]:
# cancellations due to death, as a share of bad loans (hard-coded counts)
# NOTE(review): 91 matches the DEATH count in the CANCEL REASON tally, but 439
# differs from the 416 that df['bad'].sum() returns at this point -- presumably
# it is the bad-loan count from before the DEBT TO INCOME filter. Confirm, and
# consider computing both numbers from df instead of typing them in.
91 / 439

0.2072892938496583

In [104]:
# no data collected on health, so loans cancelled because of DEATH are excluded
not_death = df['CANCEL REASON'].ne('DEATH')
df = df.loc[not_death]

In [105]:
# Bad-loan count after excluding the DEATH cancellations.
df['bad'].sum()

325

In [106]:
# Tally SUCCESSOR NUMBER values; 'NONE' is the fill value used earlier for missing entries.
df['SUCCESSOR NUMBER'].value_counts()

NONE    4813
S-0      310
s-0        1
Name: SUCCESSOR NUMBER, dtype: int64

In [107]:
# Subset containing only the loans flagged bad.
dfb = df.loc[df['bad'].eq(1)]

In [108]:
# Same SUCCESSOR NUMBER tally, restricted to the bad loans.
dfb['SUCCESSOR NUMBER'].value_counts()

NONE    314
S-0      10
s-0       1
Name: SUCCESSOR NUMBER, dtype: int64

In [109]:
# Per-column non-null counts for bad loans whose SUCCESSOR NUMBER is anything other than 'NONE'.
dfb[dfb['SUCCESSOR NUMBER']!='NONE'].count()

CONTRACT DATE               11
SETUP DATE                  11
ORIGINAL LOAN AMOUNT        11
ORIGINAL TERM               11
ORIGINAL MATURITY DATE      11
CURRENT MATURITY DATE       11
INTEREST RATE               11
DEBT TO INCOME              11
PAYMENT AMOUNT              11
SUCCESSOR NUMBER            11
ACCOUNT CODE                11
ACCOUNT CODE DATE           11
CANCEL REASON               11
TYPE OF BANKRUPTCY          11
Months Since Origination    11
Purpose                     11
Loan Type                   11
Underwriting                11
Pledged                     11
co-signed                   11
avg_credit_score            11
bad                         11
dtype: int64

In [110]:
# proportion of original bad loans that are from successors (hard-coded counts)
# NOTE(review): neither literal is computed here. The previous cell shows 11
# bad successor loans in the current frame, not 34; presumably both numbers
# come from the data before the DEBT TO INCOME and DEATH filters -- confirm.
34/439

0.0774487471526196

In [111]:
# only dealing with people who went through the application process:
# keep loans without a successor number, then remove the column, which is
# constant ('NONE') after the filter.
# Filter and drop are chained into one reassignment rather than dropping in
# place on a filtered slice, which can trigger SettingWithCopyWarning.
no_successor = df['SUCCESSOR NUMBER'] == 'NONE'
df = df.loc[no_successor].drop(columns='SUCCESSOR NUMBER')

In [112]:
# most, not all, 'customer defaulted' loans were successors: the recorded
# tallies show 12 before the successor filter and 2 after it
df['CANCEL REASON'].value_counts()

NONE                        4777
CANCEL DUE TO BANKRUPTCY      13
SETTLEMENT                    13
INVOLUNTARY REPOSESSION        7
CUSTOMER DEFAULTED             2
VOLUNTARY REPOSESSION          1
Name: CANCEL REASON, dtype: int64

In [113]:
# Check dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4813 entries, 0 to 27357
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CONTRACT DATE             4813 non-null   object 
 1   SETUP DATE                4813 non-null   object 
 2   ORIGINAL LOAN AMOUNT      4813 non-null   float64
 3   ORIGINAL TERM             4813 non-null   int64  
 4   ORIGINAL MATURITY DATE    4813 non-null   object 
 5   CURRENT MATURITY DATE     4813 non-null   object 
 6   INTEREST RATE             4813 non-null   float64
 7   DEBT TO INCOME            4813 non-null   float64
 8   PAYMENT AMOUNT            4813 non-null   float64
 9   ACCOUNT CODE              4813 non-null   object 
 10  ACCOUNT CODE DATE         4813 non-null   object 
 11  CANCEL REASON             4813 non-null   object 
 12  TYPE OF BANKRUPTCY        4813 non-null   object 
 13  Months Since Origination  4813 non-null   float64
 14  Purpose

In [114]:
# Bad-loan count in the cleaned frame.
df.bad.sum()

314

In [115]:
# Write the cleaned frame, without the index, to the interim data folder.
# The data root defaults to the original author's machine path; set the
# CAPSTONE2_DATA_DIR environment variable (to an absolute path) to write
# elsewhere without editing code.
DATA_ROOT = os.environ.get(
    'CAPSTONE2_DATA_DIR',
    r'C:\Users\2bogu\OneDrive\Desktop\Sringboard_Materials\capstone2\data',
)

# The output file name 'clean_fl' has no extension; it is kept unchanged so
# anything that reads this file by name keeps working.
df.to_csv(os.path.join(DATA_ROOT, 'interim', 'clean_fl'), index = False)