# Data Analysis

Create Path to Import *.py files (not needed in "index.ipynb")

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

#### Import Libraries

In [2]:
import time
from codes.preprocess import read_csv
import pandas as pd
pd.options.display.max_columns = None

In [3]:
pd.options.display.max_rows = 100

Time to load "Acquisition" (sec):  4.5963
<br>Time to load "Performance" (sec):  78.4553

In [4]:
start_time = time.time()
acquisition_df = read_csv('Acquisition')
print(f'Time to load "Acquisition" {round(time.time()-start_time,4)} seconds')
start_time = time.time()
performance_df = read_csv('Performance')
print(f'Time to load "Performance" {round(time.time()-start_time,4)} seconds')

Time to load "Acquisition" 4.7261 seconds
Time to load "Performance" 72.0113 seconds


In [5]:
display(acquisition_df.head(5))
display(performance_df.head(5))

Unnamed: 0,id,channel,seller,interest_rate,balance,loan_term,origination_date,ltv,cltv,borrower_count,dti,borrower_score,first_time_homebuyer,loan_purpose,property_type,unit_count,occupancy_type,property_state,insurance_percentage,product_type,coborrower_score,insurance_type,relocation_flag
0,100001040173,R,QUICKEN LOANS INC.,4.25,453000,360,01/2018,65,65.0,1,28.0,791.0,N,C,PU,1,P,OH,,FRM,,,N
1,100002370993,C,"WELLS FARGO BANK, N.A.",4.25,266000,360,01/2018,80,80.0,2,41.0,736.0,N,R,PU,1,P,IN,,FRM,793.0,,N
2,100005405807,R,PMTT4,3.99,233000,360,12/2017,79,79.0,2,48.0,696.0,N,R,SF,1,P,CA,,FRM,665.0,,N
3,100008071646,R,OTHER,4.25,184000,360,01/2018,80,80.0,1,48.0,767.0,Y,P,PU,1,P,FL,,FRM,,,N
4,100010739040,R,OTHER,4.25,242000,360,02/2018,49,49.0,1,22.0,727.0,N,R,SF,1,P,CA,,FRM,,,N


Unnamed: 0,id,reporting_period,servicer_name,interest_rate,balance,loan_age,months_to_maturity,adj_months_to_maturity,maturity_date,msa,delinquency_status,modification_flag,zero_balance_code,zero_balance_date,last_paid_installment_date,foreclosure_date,disposition_date,foreclosure_costs,property_repair_costs,recovery_costs,misc_costs,sale_proceeds,repurchase_proceeds,make_whole_flag,foreclosure_writeoff,activity_flag
0,100001040173,02/01/2018,QUICKEN LOANS INC.,4.25,,0,360,360.0,02/2048,18140,0,N,,,,,,,,,,,,,,N
1,100001040173,03/01/2018,,4.25,,1,359,359.0,02/2048,18140,0,N,,,,,,,,,,,,,,N
2,100001040173,04/01/2018,,4.25,,2,358,358.0,02/2048,18140,0,N,,,,,,,,,,,,,,N
3,100001040173,05/01/2018,,4.25,,3,357,357.0,02/2048,18140,0,N,,,,,,,,,,,,,,N
4,100001040173,06/01/2018,,4.25,,4,356,356.0,02/2048,18140,0,N,,,,,,,,,,,,,,N


## SQLite DataBase
NOTE: Uncomment to creat a SQLite Database on computer

In [6]:
# import sqlite3
# conn = sqlite3.connect('Secondary_Mortgage_Loans.db')
# acquisition_df

In [7]:
# acquisition_df.to_sql('Acquisition', con=conn)
# performance_df.to_sql('Performance', con=conn)

## EDA

In [8]:
acquisition_df.info()
performance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1625195 entries, 0 to 1625194
Data columns (total 23 columns):
id                      1625195 non-null int64
channel                 1625195 non-null object
seller                  1625195 non-null object
interest_rate           1625195 non-null float64
balance                 1625195 non-null int64
loan_term               1625195 non-null int64
origination_date        1625195 non-null object
ltv                     1625195 non-null int64
cltv                    1625194 non-null float64
borrower_count          1625195 non-null int64
dti                     1624702 non-null float64
borrower_score          1623501 non-null float64
first_time_homebuyer    1625195 non-null object
loan_purpose            1625195 non-null object
property_type           1625195 non-null object
unit_count              1625195 non-null int64
occupancy_type          1625195 non-null object
property_state          1625195 non-null object
insurance_percentage    5

#### Filter rows outside of 50 states + DC. 
In both Acquisition and Performance.
- Use Acquisition's Loan ID to filter out rows in Performance.

In [9]:
acquisition_df.property_state.where(~acquisition_df.property_state.isin(['PR','GU','VI'])).isna().sum()

2298

Drop State Codes with 'PR','GU', and 'VI' from both Acquisition and Performance.

In [10]:
acquisition_df.property_state = acquisition_df.property_state.where(~acquisition_df.property_state.isin(['PR','GU','VI']))
acquisition_df = acquisition_df.dropna(subset=['property_state'])
acquisition_df.shape

(1622897, 23)

In [11]:
print('Before removing: ', performance_df.shape[0])
performance_df.id = performance_df.id.where(performance_df.id.isin(acquisition_df.id))
performance_df = performance_df.dropna(subset=['id'])
print('After removing: ', performance_df.shape[0])

Before removing:  24459263
After removing:  24423998


<p>behavior of payers up to point of deliquency
<br>(using balance)
</p>

In [12]:
print(performance_df.delinquency_status.isna().sum(), ' rows of NaN in "Delinquency Status"')
performance_df['delinquency_status'].value_counts()

199  rows of NaN in "Delinquency Status"


0     24022577
X       222454
1       131824
2        21958
3         8630
4         5127
5         3621
6         2367
7         1620
8         1173
9          797
10         593
11         405
12         270
13         164
14          96
15          60
16          35
17          17
18           7
19           4
Name: delinquency_status, dtype: int64

In [13]:
performance_df[performance_df['delinquency_status'] == '19'].id

4563857    5.986537e+11
4579447    6.004036e+11
5481370    6.991913e+11
7234078    8.907957e+11
Name: id, dtype: float64

In [14]:
performance_df[performance_df.id == 598653679058]

Unnamed: 0,id,reporting_period,servicer_name,interest_rate,balance,loan_age,months_to_maturity,adj_months_to_maturity,maturity_date,msa,delinquency_status,modification_flag,zero_balance_code,zero_balance_date,last_paid_installment_date,foreclosure_date,disposition_date,foreclosure_costs,property_repair_costs,recovery_costs,misc_costs,sale_proceeds,repurchase_proceeds,make_whole_flag,foreclosure_writeoff,activity_flag
4563838,598653700000.0,02/01/2018,OTHER,5.5,,1,359,359.0,01/2048,26420,0,N,,,,,,,,,,,,,,N
4563839,598653700000.0,03/01/2018,,5.5,,2,358,359.0,01/2048,26420,1,N,,,,,,,,,,,,,,N
4563840,598653700000.0,04/01/2018,,5.5,,3,357,359.0,01/2048,26420,2,N,,,,,,,,,,,,,,N
4563841,598653700000.0,05/01/2018,,5.5,,4,356,359.0,01/2048,26420,3,N,,,,,,,,,,,,,,N
4563842,598653700000.0,06/01/2018,,5.5,,5,355,359.0,01/2048,26420,4,N,,,,,,,,,,,,,,N
4563843,598653700000.0,07/01/2018,,5.5,,6,354,359.0,01/2048,26420,5,N,,,,,,,,,,,,,,N
4563844,598653700000.0,08/01/2018,,5.5,142344.02,7,353,359.0,01/2048,26420,6,N,,,,,,,,,,,,,,N
4563845,598653700000.0,09/01/2018,,5.5,142344.02,8,352,359.0,01/2048,26420,7,N,,,,,,,,,,,,,,N
4563846,598653700000.0,10/01/2018,,5.5,142344.02,9,351,359.0,01/2048,26420,8,N,,,,,,,,,,,,,,N
4563847,598653700000.0,11/01/2018,,5.5,142344.02,10,350,359.0,01/2048,26420,9,N,,,,,,,,,,,,,,N


In [15]:
performance_df[performance_df.id == 600403591410]

Unnamed: 0,id,reporting_period,servicer_name,interest_rate,balance,loan_age,months_to_maturity,adj_months_to_maturity,maturity_date,msa,delinquency_status,modification_flag,zero_balance_code,zero_balance_date,last_paid_installment_date,foreclosure_date,disposition_date,foreclosure_costs,property_repair_costs,recovery_costs,misc_costs,sale_proceeds,repurchase_proceeds,make_whole_flag,foreclosure_writeoff,activity_flag
4579427,600403600000.0,01/01/2018,OTHER,4.5,,0,360,360.0,01/2048,12940,0,N,,,,,,,,,,,,,,N
4579428,600403600000.0,02/01/2018,,4.5,,1,359,359.0,01/2048,12940,0,N,,,,,,,,,,,,,,N
4579429,600403600000.0,03/01/2018,,4.5,,2,358,359.0,01/2048,12940,1,N,,,,,,,,,,,,,,Y
4579430,600403600000.0,04/01/2018,,4.5,,3,357,359.0,01/2048,12940,2,N,,,,,,,,,,,,,,N
4579431,600403600000.0,05/01/2018,,4.5,,4,356,359.0,01/2048,12940,3,N,,,,,,,,,,,,,,N
4579432,600403600000.0,06/01/2018,,4.5,,5,355,359.0,01/2048,12940,4,N,,,,,,,,,,,,,,N
4579433,600403600000.0,07/01/2018,,4.5,135786.25,6,354,359.0,01/2048,12940,5,N,,,,,,,,,,,,,,N
4579434,600403600000.0,08/01/2018,,4.5,135786.25,7,353,359.0,01/2048,12940,6,N,,,,,,,,,,,,,,N
4579435,600403600000.0,09/01/2018,,4.5,135786.25,8,352,359.0,01/2048,12940,7,N,,,,,,,,,,,,,,N
4579436,600403600000.0,10/01/2018,,4.5,135786.25,9,351,359.0,01/2048,12940,8,N,,,,,,,,,,,,,,N


In [16]:
performance_df[performance_df['delinquency_status'] == '15'].id

185947      1.203418e+11
903514      1.990587e+11
1126450     2.230249e+11
1258907     2.375554e+11
1313689     2.434090e+11
1967223     3.144568e+11
2065952     3.252803e+11
2154642     3.349092e+11
2154643     3.349092e+11
2412369     3.630166e+11
2558821     3.787819e+11
2618849     3.852559e+11
2773824     4.021281e+11
2779797     4.027824e+11
2793243     4.042172e+11
3147380     4.434090e+11
3792345     5.138131e+11
4079628     5.456308e+11
4268798     5.664015e+11
4293255     5.692489e+11
4361883     5.767547e+11
4563853     5.986537e+11
4579443     6.004036e+11
4701406     6.134849e+11
4845505     6.291765e+11
5145889     6.622171e+11
5413793     6.918763e+11
5481366     6.991913e+11
5495910     7.008639e+11
5524039     7.040706e+11
5615377     7.141056e+11
5660227     7.191576e+11
6094242     7.664357e+11
6174040     7.752416e+11
6213608     7.796380e+11
6581822     8.199716e+11
6604522     8.223038e+11
7234074     8.907957e+11
7469093     9.163825e+11
7813233     9.543566e+11


In [17]:
performance_df[performance_df.id == 120341848961]

Unnamed: 0,id,reporting_period,servicer_name,interest_rate,balance,loan_age,months_to_maturity,adj_months_to_maturity,maturity_date,msa,delinquency_status,modification_flag,zero_balance_code,zero_balance_date,last_paid_installment_date,foreclosure_date,disposition_date,foreclosure_costs,property_repair_costs,recovery_costs,misc_costs,sale_proceeds,repurchase_proceeds,make_whole_flag,foreclosure_writeoff,activity_flag
185931,120341800000.0,02/01/2018,OTHER,5.25,,0,360,359.0,02/2048,19100,0,N,,,,,,,,,,,,,,N
185932,120341800000.0,03/01/2018,,5.25,,1,359,359.0,02/2048,19100,0,N,,,,,,,,,,,,,,N
185933,120341800000.0,04/01/2018,,5.25,,2,358,359.0,02/2048,19100,1,N,,,,,,,,,,,,,,N
185934,120341800000.0,05/01/2018,,5.25,,3,357,359.0,02/2048,19100,2,N,,,,,,,,,,,,,,Y
185935,120341800000.0,06/01/2018,,5.25,,4,356,359.0,02/2048,19100,3,N,,,,,,,,,,,,,,N
185936,120341800000.0,07/01/2018,,5.25,,5,355,359.0,02/2048,19100,4,N,,,,,,,,,,,,,,N
185937,120341800000.0,08/01/2018,,5.25,114868.1,6,354,359.0,02/2048,19100,5,N,,,,,,,,,,,,,,N
185938,120341800000.0,09/01/2018,,5.25,114868.1,7,353,359.0,02/2048,19100,6,N,,,,,,,,,,,,,,N
185939,120341800000.0,10/01/2018,,5.25,114868.1,8,352,359.0,02/2048,19100,7,N,,,,,,,,,,,,,,N
185940,120341800000.0,11/01/2018,,5.25,114868.1,9,351,359.0,02/2048,19100,8,N,,,,,,,,,,,,,,N


In [18]:
performance_df[performance_df.months_to_maturity < 300].months_to_maturity.value_counts()

177    165396
176    165115
175    164608
178    164183
174    163849
        ...  
46          2
45          2
44          1
43          1
42          1
Name: months_to_maturity, Length: 258, dtype: int64

In [19]:
performance_df['delinquency_bool'] = performance_df.delinquency_status.map(lambda x: False if x in ['0','X'] else True)

In [29]:
performance_df['delinquency_bool'].value_counts()

False    24245031
True       178967
Name: delinquency_bool, dtype: int64

In [32]:
delinquencies = performance_df[performance_df['delinquency_bool']]
delinquencies.id.value_counts()

3.572108e+11    21
5.697724e+11    20
8.907957e+11    20
6.991913e+11    19
6.004036e+11    19
                ..
2.288106e+11     1
4.232614e+11     1
5.888204e+11     1
6.747186e+11     1
7.816866e+11     1
Name: id, Length: 69827, dtype: int64