# DS-GA 1001 Project
## Training Set and Feature Engineering (Part 2)

This script creates training sample for Project Part 2 (Please refer to readme for details)
***

## 1. Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics 
%matplotlib inline
from sklearn.model_selection import train_test_split

In [2]:
Data = pd.read_csv("mortgage.csv")
Data.head()

Unnamed: 0,id,time,orig_time,first_time,mat_time,balance_time,LTV_time,interest_rate_time,hpi_time,gdp_time,...,REtype_SF_orig_time,investor_orig_time,balance_orig_time,FICO_orig_time,LTV_orig_time,Interest_Rate_orig_time,hpi_orig_time,default_time,payoff_time,status_time
0,1,25,-7,25,113,41303.42,24.498336,9.2,226.29,2.899137,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
1,1,26,-7,25,113,41061.95,24.483867,9.2,225.1,2.151365,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
2,1,27,-7,25,113,40804.42,24.626795,9.2,222.39,2.361722,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
3,1,28,-7,25,113,40483.89,24.735883,9.2,219.67,1.229172,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0
4,1,29,-7,25,113,40367.06,24.925476,9.2,217.37,1.692969,...,1,0,45000.0,715,69.4,9.2,87.03,0,0,0


### Remove Duplicate

In [15]:
Data.drop_duplicates(keep = False, inplace = True)

## 2. Fill NA Values

There are two variables containing missing values: LTV_time and Interest_Rate_Orig_Time. <br>
Since both variables are highly correlated with other variables, we fill in NA from an estimation of polynomial regression from other existing variables, with "LinearRegression" in sklearn package. We will not adopt all variables in feature engineering in later steps, so our filling NA method won't cause multicolinearity issue in the main model training. 

### 2.1 Fill NA for LTV_time

In [5]:
Data['LTV_time'][Data['LTV_time']==0] = None

ltv_sample = Data[~Data['LTV_time'].isnull()]
corr = ltv_sample.corr()
features = ltv_sample.columns[abs(corr.loc['LTV_time',:]) > 0.3]
ltv_sample = ltv_sample[features]
ltv_result_x = Data[features][Data['LTV_time'].isnull()].drop(['LTV_time'], axis = 1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [6]:
train, test = train_test_split(ltv_sample, test_size=0.2)
X_train, X_test = train.drop(['LTV_time'], axis = 1), test.drop(['LTV_time'], axis = 1)
y_train, y_test = train['LTV_time'], test['LTV_time']

In [7]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# score = {}
# for degree in [1,2,3]:
#     model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
#     model.fit(X_train,y_train)
#     score[degree] = model.score(X_test, y_test)
# display(score)

In [8]:
from sklearn.metrics import mean_squared_error, r2_score
model = make_pipeline(PolynomialFeatures(3), LinearRegression())
model.fit(X_train,y_train)
# Make predictions using the testing set
y_pred = model.predict(X_test)

# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Mean squared error: 66.25
Variance score: 0.89


In [9]:
ltv_result_y = model.predict(ltv_result_x)
Data.loc[Data['LTV_time'].isnull(), 'LTV_time'] = ltv_result_y

### 2.2 Fill NA for Interest_Rate_orig_time

In [11]:
Data['Interest_Rate_orig_time'][Data['Interest_Rate_orig_time']==0] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
temp = Data['id'][Data['Interest_Rate_orig_time'].isnull()].value_counts()
fill_val = {}
for i in temp.index:
  fill_val[Data[Data['id']==i].index[0]] = Data[Data['id'] == i]['interest_rate_time'].iloc[0]

In [14]:
Data['Interest_Rate_orig_time'] = Data['Interest_Rate_orig_time'].fillna(fill_val)
Data.fillna(method = 'ffill', inplace=True)

## 3. Feature Engineering Functions (for time-series features)

### 3.1 Get monthly payment history for certain borrower AT A GIVEN TIME (earlest period until given time)

Function: **MonthlyPaymentTime(ID, timeGiven)**
- Input: ID, timeGiven (timeGiven is the casting time point)
- Return: a list containing monthly payment for all available months
- Method: detect starting and ending (first and last) available observation time, calculate difference between two remaining balances and store into list. 

In [23]:
def MonthlyPaymentTime(ID, timeGiven):
    Payment = []
    
    timeStart = Data.groupby(['id'])['time'].min()
    timeStart = timeStart.to_frame()
    
    for time in range(timeStart.loc[ID].values[0], timeGiven):
        condition1 = (time+1 <= timeGiven)
        condition2 = (time in set(Data[Data['id']==ID]['time']))
        condition3 = (time+1 in set(Data[Data['id']==ID]['time']))
        
        if condition1 and condition2 and condition3:
            Balance1 = Data[Data['id']==ID][Data['time']==time]['balance_time'].values[0]
            Balance2 = Data[Data['id']==ID][Data['time']==(time+1)]['balance_time'].values[0]
            monthlyPay = Balance1 - Balance2
            Payment = Payment + [monthlyPay]
        else:
            continue
    return Payment

In [24]:
MonthlyPaymentTime(4,35)

  del sys.path[0]
  


[148.6800000000003,
 152.61999999999534,
 153.6600000000035,
 151.75,
 155.90999999999622,
 160.20000000000437,
 163.5699999999997,
 166.02999999999884,
 174.91999999999825,
 188.29000000000087]

### 3.2 Mean, Std, Min, Max for total payment history or UP TO GIVEN TIME

A combined function for all features above: **CombinedMonthlyPaymentTime(ID, time)**
- Input: Borrower ID, time at casting point
- Return: A list of following values:
    - avg: average payment per month
    - maxi: maximum payment
    - mini: minimum payment
    - std: standard deviation of peyment
    - record0: number of 0 amount payment throughout history
    - recordLow: number of payments lower than mean - 1*std in history
- Method: 
    - Call MonthlyPaymentTime to get a series for history
    - Calculate avg, maxi, mini, std from built-in functions
    - record0 and recordLow are counted from for loop

In [26]:
# This is a combined function, to improve efficiency

def CombinedMonthlyPaymentTime(ID, time):
    sr = MonthlyPaymentTime(ID, time)
    
    if len(sr)>=1:
        avg = statistics.mean(sr)
        maxi = max(sr)
        mini = min(sr)
        if len(sr)>=2:
            std = statistics.stdev(sr)
        else:
            std = 0
    else:
        avg = 0
        maxi = 0
        mini = 0
        std = 0
    
    record0 = 0
    recordLow = 0
    for i in range(len(sr)):
        if sr[i] <= 0:
            record0 += 1
        if sr[i] <= avg-std:
            recordLow += 1
    
    return (avg, std, maxi, mini, record0, recordLow)

## 4. Create Training Set (by borrower * time)

**List of features**
(Bold terms are those different from Part1)
- mat_time: maturity time for loan (as original)
- balance_orig: initial balance, from balance_orig_time
- **balance_time: remaining balance at observing period (casting point)**
- **LTV_time: last LTV_time at casting point**
- LTV_orig: same as LTV_orig_time
- **interest_time: last interest_rate_time at casting point**
- interest_orig: same as Interest_Rate_orig_time
- **hpi_time: last hpi_time at casting point**
- hpi_orig: same as hpi_orig_time
- **gdp_time: last gdp_time at casting point**
- **uer_time: last uer_time at casting point**
- REtype_CO, REtype_PU, REtype_SF: as original
- investor, FICO: same as _orig_time

**Created features**
- **payment_hist: total length of history up to casting point, time - orig_time**
- avg_payment: average amount of payment every month (up to casting point)
- std_payment: std of payment every month (up to casting point)
- max_payment: max of payment every month (up to casting point)
- min_payment: min of payment every month (up to casting point)
- count_zero_payment: number of 0 payments (up to casting point)
- count_low_payment: number of payments < mean - 1*std (up to casting point)

### 4.1 Casting Sample Function

**This is a try-out case to explain the logic for CastSample function**
1. Get all default result from casting time (24) up to time = 26 (windowLength=2)
2. transfer all default result to pivot table
3. sum columns of pivot table, to get default result UP TO time=26 (instead of exactly at time 26)
4. Join result with casting period features

In [27]:
filterData = Data[(Data['time']>=25) & (Data['time']<=26)][['id','time','default_time']]

table_default = pd.pivot_table(filterData, values='default_time', 
                       index=['id'],columns=['time'], fill_value=0)

resultY = table_default[table_default.columns].sum(axis=1)
resultY = pd.DataFrame(resultY, columns = ['default'])
Xtry = Data[Data['time'] == 24][['id','time']]
final = Xtry.join(resultY, on = 'id', how='inner')
final.head()

Unnamed: 0,id,time,default
2222,209,24,0
2225,210,24,0
2255,211,24,0
2265,212,24,0
2273,213,24,0


Main Function **CastSample(castTime, windowLength, featureList):**
- Input: 
    - castTime: casting time period for observation. All features are taken up to this point
    - windowLength: number of periods for prediction, the outcome on castTime+windowLength will be target label
    - featureList: list of features to extract
- Return: 
    - A dataframe that contains features up to castTime, and outcome within windowLength (from castTime to castTime + windowLength)
- Method: 
    - See try out case above

In [29]:
# Input: casting time period, windowLength for prediction, feature list

def CastSample(castTime, windowLength, featureList):
    X = Data[Data['time'] == castTime][['id']+featureList]
    
    windowData = Data[(Data['time']>castTime) & (Data['time']<=castTime+windowLength)]
    filterData = windowData[['id','time','default_time','payoff_time','status_time']]
    
    default = pd.pivot_table(filterData, values='default_time', index=['id'],columns=['time'], fill_value=0)
    payoff = pd.pivot_table(filterData, values='payoff_time', index=['id'],columns=['time'], fill_value=0)
    status = pd.pivot_table(filterData, values='status_time', index=['id'],columns=['time'], fill_value=0)
    
    defaultResult = default[default.columns].sum(axis=1)
    payoffResult = payoff[payoff.columns].sum(axis=1)
    statusResult = status[status.columns].sum(axis=1)
    defaultResult = pd.DataFrame(defaultResult, columns = ['default'])
    payoffResult = pd.DataFrame(payoffResult, columns = ['payoff'])
    statusResult = pd.DataFrame(statusResult, columns = ['status'])
    
    Sample = X.join(defaultResult, on='id', how='inner').join(payoffResult,on='id').join(statusResult,on ='id')
    
    return Sample

In [30]:
#A try out
featureListTry = ['time','balance_time']
CastSample(24,2,featureListTry).head()

Unnamed: 0,id,time,balance_time,default,payoff,status
2222,209,24,305400.0,0,1,2
2225,210,24,139200.0,0,0,0
2255,211,24,161500.0,0,0,0
2265,212,24,220000.0,0,0,0
2273,213,24,89388.0,0,0,0


### 4.2 Create DataSet with Non-Time-Series Features

Because different window length will by nature generate different base rate for default (target), we create different sizes of samples for different window lengths. In later steps (scale_split.ipynb) we will do up sampling for each window length, in order to keep a relatively consistent base rate. More detailed explanations can be seen in report.

Casting time period: 
- Window Length 1: (3, 8, 13, 18, 23, 28, 33, 38, 43, 48, 53, 58)
- Window Length 3: (3, 13, 18, 28, 33, 43, 48)
- Window Length 6: (3, 13, 18, 33, 43, 48)
- Window Length 12: (3, 18, 33, 48)

In [31]:
NonTimefeature = ['mat_time', 'balance_orig_time', 'balance_time','LTV_orig_time', 
       'LTV_time', 'interest_rate_time', 'Interest_Rate_orig_time', 'hpi_time', 'hpi_orig_time', 
       'gdp_time', 'uer_time',
       'REtype_CO_orig_time', 'REtype_PU_orig_time', 'REtype_SF_orig_time',
       'investor_orig_time', 'FICO_orig_time'] + ['time','orig_time'] 
#time & orig_time column helps for later feature engineering, eventually we'll drop it

Time3Win1 = CastSample(3, 1, NonTimefeature)
Time3Win3 = CastSample(3, 3, NonTimefeature)
Time3Win6 = CastSample(3, 6, NonTimefeature)
Time3Win12 = CastSample(3, 12, NonTimefeature)

Time18Win1 = CastSample(18, 1, NonTimefeature)
Time18Win3 = CastSample(18, 3, NonTimefeature)
Time18Win6 = CastSample(18, 6, NonTimefeature)
Time18Win12 = CastSample(18, 12, NonTimefeature)

Time33Win1 = CastSample(33, 1, NonTimefeature)
Time33Win3 = CastSample(33, 3, NonTimefeature)
Time33Win6 = CastSample(33, 6, NonTimefeature)
Time33Win12 = CastSample(33, 12, NonTimefeature)

Time48Win1 = CastSample(48, 1, NonTimefeature)
Time48Win3 = CastSample(48, 3, NonTimefeature)
Time48Win6 = CastSample(48, 6, NonTimefeature)
Time48Win12 = CastSample(48, 12, NonTimefeature)

Time8Win1 = CastSample(8, 1, NonTimefeature)
Time13Win1 = CastSample(13, 1, NonTimefeature)
Time23Win1 = CastSample(23, 1, NonTimefeature)
Time28Win1 = CastSample(28, 1, NonTimefeature)
Time38Win1 = CastSample(38, 1, NonTimefeature)
Time43Win1 = CastSample(43, 1, NonTimefeature)
Time53Win1 = CastSample(53, 1, NonTimefeature)
Time58Win1 = CastSample(58, 1, NonTimefeature)

Time13Win3 = CastSample(13, 3, NonTimefeature)
Time28Win3 = CastSample(28, 3, NonTimefeature)
Time43Win3 = CastSample(43, 3, NonTimefeature)

Time13Win6 = CastSample(13, 6, NonTimefeature)
Time43Win6 = CastSample(43, 6, NonTimefeature)

### 4.3 Fill in Time-Series Features

In [130]:
# Payment History Length
sampleList = [Time3Win1, Time3Win3, Time3Win6, Time3Win12,
              Time18Win1, Time18Win3, Time18Win6, Time18Win12,
              Time33Win1, Time33Win3, Time33Win6, Time33Win12,
              Time48Win1, Time48Win3, Time48Win6, Time48Win12,
              Time8Win1, Time13Win1, Time23Win1, Time28Win1, 
              Time38Win1, Time43Win1, Time53Win1, Time58Win1, 
              Time13Win3, Time28Win3, Time43Win3, 
              Time13Win6, Time43Win6]

for data in sampleList:
    data['payment_hist'] = data['time'] - data['orig_time']

In [32]:
# Everything else in the list

def Fill_Time_Features(data, castTime):
    avg_payment = []
    std_payment = []
    max_payment = []
    min_payment = []
    count_zero_payment = []
    count_low_payment = []
    IDcount = 0
    
    for ID in list(data['id']):
        v1, v2, v3, v4, v5, v6 = CombinedMonthlyPaymentTime(ID, castTime)
        avg_payment.append(v1)
        std_payment.append(v2)
        max_payment.append(v3)
        min_payment.append(v4)
        count_zero_payment.append(v5)
        count_low_payment.append(v6)
        IDcount +=1
        
        # The print line is only for helping track process
        print(IDcount)
        
    data['avg_payment'] = avg_payment
    data['std_payment'] = std_payment
    data['max_payment'] = max_payment
    data['min_payment'] = min_payment
    data['count_zero_payment'] = count_zero_payment
    data['count_low_payment'] = count_low_payment

Then we ran function for all dataframes listed above, the process is skipped since it's too long <br>
To fill time-series features for any dataset, can just call the following format: <br>
- Fill_Time_Features(TimeXWinY, X)

### 4.4 Final concatenation with same window length, drop time & orig_time, export dataset

In [168]:
Part2_Win1 = pd.concat([Time3Win1, Time18Win1, Time33Win1, Time48Win1,
                        Time8Win1, Time13Win1, Time23Win1, Time28Win1, 
                        Time38Win1, Time43Win1, Time53Win1, Time58Win1])

Part2_Win3 = pd.concat([Time3Win3, Time18Win3, Time33Win3, Time48Win3,
                        Time13Win3, Time28Win3, Time43Win3])

Part2_Win6 = pd.concat([Time3Win6, Time18Win6, Time33Win6, Time48Win6,
                        Time13Win6, Time43Win6])

Part2_Win12 = pd.concat([Time3Win12, Time18Win12, Time33Win12, Time48Win12])

In [177]:
drop = ['time','orig_time']
Part2_Win1 = Part2_Win1.drop(columns=drop)
Part2_Win3 = Part2_Win3.drop(columns=drop)
Part2_Win6 = Part2_Win6.drop(columns=drop)
Part2_Win12 = Part2_Win12.drop(columns=drop)

In [180]:
Part2_Win1.to_csv('Part2_Win1.csv')
Part2_Win3.to_csv('Part2_Win3.csv')
Part2_Win6.to_csv('Part2_Win6.csv')
Part2_Win12.to_csv('Part2_Win12.csv')

Check number of lines in one dataset

In [92]:
!wc -l Part2_Win12.csv

   35729 Part2_Win12.csv
