In [1]:
import requests
import numpy as np
import pandas as pd
import pandas_profiling
import json
import os
import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from scipy.stats import norm
from datetime import datetime

%matplotlib inline

pd.options.display.max_columns = None

import statsmodels.api as sm # Provides cross-sectional models and methods
from statsmodels.graphics.api import abline_plot # Plots line given intercept and slope
from sklearn.metrics import mean_squared_error, r2_score # Rates how close regression line is to data
from sklearn.model_selection import train_test_split #  Splits data into training and test groups
from sklearn import linear_model, preprocessing # Ordinary least square regression and standardizes the data
import warnings # For handling error messages.

In [2]:
# Make the raw-data folder the working directory, then read the GJGNY
# residential loan portfolio export by its bare filename.
raw_data_dir = r'C:\Users\2bogu\OneDrive\Desktop\Sringboard_Materials\capstone2\data\external'
os.chdir(raw_data_dir)

loan_csv_name = 'Green_Jobs_-_Green_New_York__GJGNY__Residential_Loan_Portfolio___Beginning_November_2010.csv'
df = pd.read_csv(loan_csv_name)

# Initial Cleaning

In [3]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['Reporting Period', 'CONTRACT DATE', 'SETUP DATE',
       'ORIGINAL LOAN AMOUNT', 'ORIGINAL TERM', 'ORIGINAL MATURITY DATE',
       'CURRENT MATURITY DATE', 'INTEREST RATE', 'CREDIT SCORE CURRENT HIGH',
       'CREDIT SCORE', 'CREDIT SCORE NAME 2', 'DEBT TO INCOME',
       'FIRST PAYMENT DATE', 'LAST PAYMENT DATE', 'LAST PAYMENT AMOUNT',
       'NEXT PAYMENT DUE DATE', 'PAYMENT AMOUNT', 'CURRENT BALANCE',
       'UNAPPLIED CASH', 'DELINQUENT AMOUNT', 'TOTAL INTEREST PAID',
       'TOTAL LATE CHARGE PAID', 'PAYMENTS REMAINING', 'DAYS DELINQUENT',
       'PROPERTY COUNTY', 'UTILITY', 'INSTALLATION CITY', 'INSTALLATION ZIP',
       'Contractor Name', 'SUCCESSOR NUMBER', 'ACCOUNT CODE',
       'ACCOUNT CODE DATE', 'CANCEL REASON', 'TYPE OF BANKRUPTCY',
       'Months Since Origination', 'Payments Made', 'Purpose', 'Loan Type',
       'Underwriting', 'Pledged', 'Georeference'],
      dtype='object')

In [4]:
# Columns removed from the working frame; drop() returns a new frame that is
# bound back to df.
columns_to_drop = [
    'LAST PAYMENT AMOUNT',
    'LAST PAYMENT DATE',
    'FIRST PAYMENT DATE',
    'DELINQUENT AMOUNT',
    'DAYS DELINQUENT',
    'Reporting Period',
    'CREDIT SCORE CURRENT HIGH',
    'NEXT PAYMENT DUE DATE',
    'PAYMENTS REMAINING',
    'PROPERTY COUNTY',
    'UTILITY',
    'INSTALLATION CITY',
    'INSTALLATION ZIP',
    'Contractor Name',
    'Georeference',
    'UNAPPLIED CASH',
    'TOTAL LATE CHARGE PAID',
    'CURRENT BALANCE',
    'TOTAL INTEREST PAID',
    'Payments Made',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Summarise dtypes and non-null counts of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27385 entries, 0 to 27384
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CONTRACT DATE             27385 non-null  object 
 1   SETUP DATE                27385 non-null  object 
 2   ORIGINAL LOAN AMOUNT      27385 non-null  float64
 3   ORIGINAL TERM             27385 non-null  int64  
 4   ORIGINAL MATURITY DATE    27385 non-null  object 
 5   CURRENT MATURITY DATE     27385 non-null  object 
 6   INTEREST RATE             27385 non-null  float64
 7   CREDIT SCORE              26863 non-null  float64
 8   CREDIT SCORE NAME 2       12149 non-null  float64
 9   DEBT TO INCOME            26938 non-null  float64
 10  PAYMENT AMOUNT            27385 non-null  float64
 11  SUCCESSOR NUMBER          770 non-null    object 
 12  ACCOUNT CODE              27385 non-null  object 
 13  ACCOUNT CODE DATE         27385 non-null  object 
 14  CANCEL

In [6]:
# Replace missing values in these three columns with the literal string 'NONE'
# so later cells can filter and count on that value.
#
# The result is assigned back to df[col] rather than calling
# fillna(inplace=True) on a column selection: the in-place form is chained
# assignment, which pandas warns about and which no longer updates df when
# Copy-on-Write is enabled.
none_fill_columns = ['CANCEL REASON', 'TYPE OF BANKRUPTCY', 'SUCCESSOR NUMBER']
for fill_col in none_fill_columns:
    df[fill_col] = df[fill_col].fillna('NONE')


In [7]:
# indicates if loan has a cosigner: 1 when 'CREDIT SCORE NAME 2' is populated
df['co-signed'] = np.where(df['CREDIT SCORE NAME 2'].notna(), 1, 0)

# averages the two credit score columns row by row; mean() skips NaN by
# default, so a row with only one score keeps that score
df['avg_credit_score'] = df[['CREDIT SCORE', 'CREDIT SCORE NAME 2']].mean(axis=1)

# rows with neither score are filled with the mean of the row averages.
# Assigned back instead of fillna(inplace=True) on df[col]: the in-place form
# is chained assignment and does not update df under Copy-on-Write.
df['avg_credit_score'] = df['avg_credit_score'].fillna(df['avg_credit_score'].mean())

# the raw score columns are no longer needed once the two features exist
df = df.drop(columns=['CREDIT SCORE', 'CREDIT SCORE NAME 2'])

In [8]:
# Frequency of each ACCOUNT CODE value.
df['ACCOUNT CODE'].value_counts()

SurePay/Automatic Debit Payment              12061
Check/Money Order Payment (with coupon)       9612
Paid in Full                                  4903
Legal Category                                 368
Notice of Bankruptcy (written)                 186
Canceled                                       160
Soft Hold                                       66
On Hold                                         16
Management Hold                                  5
Notice of Bankruptcy (verbal)                    4
Collections Terminated (by customer)             2
Dispute of Debt                                  1
SurePay/Automatic Debit Payment (on hold)        1
Name: ACCOUNT CODE, dtype: int64

In [9]:
# creates dependent var 'bad': 1 when ACCOUNT CODE contains any of the keywords
# below, 0 otherwise (an earlier draft derived it from CANCEL REASON instead).
# NOTE(review): the match is case-sensitive, so a code spelled with a lowercase
# "hold" is not flagged — confirm that is intended.
bad_code_pattern = 'Hold|Canceled|Terminated|Bankruptcy'
matches_bad_code = df['ACCOUNT CODE'].str.contains(bad_code_pattern, regex=True)
# Comparing with True maps any non-True entry (including NaN) to False before
# the cast to a 0/1 integer column.
df['bad'] = (matches_bad_code == True).astype('int64')

In [11]:
# Summary statistics for the numeric columns, including the new features.
df.describe()

Unnamed: 0,ORIGINAL LOAN AMOUNT,ORIGINAL TERM,INTEREST RATE,DEBT TO INCOME,PAYMENT AMOUNT,Months Since Origination,co-signed,avg_credit_score,bad
count,27385.0,27385.0,27385.0,26938.0,27385.0,27385.0,27385.0,27385.0,27385.0
mean,11651.190126,168.875991,3.872746,0.335567,91.99346,51.724495,0.443637,734.659116,0.016031
std,6688.314789,30.736999,1.049811,2.678848,53.368043,25.555436,0.496822,53.601481,0.125596
min,61.0,1.0,2.99,0.0,0.0,0.0,0.0,512.0,0.0
25%,6095.0,180.0,3.49,0.19,49.0,36.0,0.0,700.0,0.0
50%,10473.0,180.0,3.49,0.31,83.0,51.0,0.0,744.5,0.0
75%,16167.0,180.0,3.49,0.43,127.0,66.0,1.0,779.0,0.0
max,25000.0,180.0,8.49,437.5,455.0,114.0,1.0,833.0,1.0


In [12]:
# Number of loans flagged bad (the column is 0/1, so the sum is a count).
df['bad'].sum()

439

In [13]:
# Frequency of each CANCEL REASON; 'NONE' is the placeholder for missing values.
df['CANCEL REASON'].value_counts()

NONE                          27225
DEATH                            91
CUSTOMER DEFAULTED               25
SETTLEMENT                       16
CANCEL DUE TO BANKRUPTCY         13
INVOLUNTARY REPOSESSION          12
VOLUNTARY CANCEL                  1
VOLUNTARY REPOSESSION             1
DEVELOPER RELEASED ACCOUNT        1
Name: CANCEL REASON, dtype: int64

In [14]:
# cancellations due to death as a share of all bad loans, computed from the
# frame instead of hand-copied counts so the figure follows the data
death_mask = df['CANCEL REASON'] == 'DEATH'
df.loc[death_mask, 'bad'].sum() / df['bad'].sum()

0.2072892938496583

In [15]:
# no data collected on health, so loans cancelled because of death are removed
not_death = df['CANCEL REASON'].ne('DEATH')
df = df.loc[not_death]

In [16]:
# Bad-loan count after the DEATH rows were removed.
df['bad'].sum()

348

In [17]:
# Frequency of each SUCCESSOR NUMBER; 'NONE' is the placeholder for missing values.
df['SUCCESSOR NUMBER'].value_counts()

NONE    26527
S-1       334
S-0       319
S-2        53
s-1        29
S-3        19
S-4         4
s-2         2
S-6         2
S-5         2
s-3         2
s-0         1
Name: SUCCESSOR NUMBER, dtype: int64

In [18]:
# Subset containing only the loans flagged bad.
is_bad = df['bad'].eq(1)
dfb = df.loc[is_bad]

In [19]:
# SUCCESSOR NUMBER frequencies among the bad loans only.
dfb['SUCCESSOR NUMBER'].value_counts()

NONE    314
S-1      17
S-0      10
S-2       4
S-3       1
S-4       1
s-0       1
Name: SUCCESSOR NUMBER, dtype: int64

In [20]:
# Per-column non-null counts for bad loans that have a successor number.
has_successor = dfb['SUCCESSOR NUMBER'].ne('NONE')
dfb.loc[has_successor].count()

CONTRACT DATE               34
SETUP DATE                  34
ORIGINAL LOAN AMOUNT        34
ORIGINAL TERM               34
ORIGINAL MATURITY DATE      34
CURRENT MATURITY DATE       34
INTEREST RATE               34
DEBT TO INCOME              11
PAYMENT AMOUNT              34
SUCCESSOR NUMBER            34
ACCOUNT CODE                34
ACCOUNT CODE DATE           34
CANCEL REASON               34
TYPE OF BANKRUPTCY          34
Months Since Origination    34
Purpose                     34
Loan Type                   34
Underwriting                34
Pledged                     34
co-signed                   34
avg_credit_score            34
bad                         34
dtype: int64

In [21]:
# proportion of original bad loans that are from successors
# NOTE(review): 34 was counted after the DEATH rows were removed, while 439 is
# the bad-loan total from before that removal — confirm the intended denominator.
34/439

0.0774487471526196

In [22]:
# only dealing with people who went through the application process:
# keep loans without a successor, then discard the now-constant column.
# Chaining .loc with drop(columns=...) builds a new frame instead of mutating
# a boolean-filtered slice in place, the pattern that triggers
# SettingWithCopyWarning.
df = df.loc[df['SUCCESSOR NUMBER'] == 'NONE'].drop(columns='SUCCESSOR NUMBER')

In [23]:
# most 'customer defaulted' loans were successors: 25 before the successor
# filter, 2 remaining after it
df['CANCEL REASON'].value_counts()

NONE                        26491
SETTLEMENT                     13
CANCEL DUE TO BANKRUPTCY       13
INVOLUNTARY REPOSESSION         7
CUSTOMER DEFAULTED              2
VOLUNTARY REPOSESSION           1
Name: CANCEL REASON, dtype: int64

In [24]:
# maybe needed: convert every date column from month/day/year text to datetime
date_format = "%m/%d/%Y"
date_columns = [
    'CONTRACT DATE',
    'SETUP DATE',
    'ORIGINAL MATURITY DATE',
    'CURRENT MATURITY DATE',
    'ACCOUNT CODE DATE',
]
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], format=date_format)

In [25]:
#df['s_y'] = df['SETUP DATE'].apply(lambda x: x.year)

In [26]:
#df[df['s_y']==2020]

In [27]:
# Re-check dtypes and non-null counts after filtering and date parsing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26527 entries, 0 to 27384
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   CONTRACT DATE             26527 non-null  datetime64[ns]
 1   SETUP DATE                26527 non-null  datetime64[ns]
 2   ORIGINAL LOAN AMOUNT      26527 non-null  float64       
 3   ORIGINAL TERM             26527 non-null  int64         
 4   ORIGINAL MATURITY DATE    26527 non-null  datetime64[ns]
 5   CURRENT MATURITY DATE     26527 non-null  datetime64[ns]
 6   INTEREST RATE             26527 non-null  float64       
 7   DEBT TO INCOME            26527 non-null  float64       
 8   PAYMENT AMOUNT            26527 non-null  float64       
 9   SUCCESSOR NUMBER          26527 non-null  object        
 10  ACCOUNT CODE              26527 non-null  object        
 11  ACCOUNT CODE DATE         26527 non-null  datetime64[ns]
 12  CANCEL REASON     

In [28]:
# Bad-loan count in the final cleaned frame.
df.bad.sum()

314

In [29]:
# Write the cleaned frame to the interim data folder without the index column.
clean_output_path = r'C:\Users\2bogu\OneDrive\Desktop\Sringboard_Materials\capstone2\data\interim\clean_al'
df.to_csv(clean_output_path, index=False)