# Introduction
In this notebook, we carry out the following preprocessing steps:
- Drop irrelevant columns.
- Create dummy or indicator features for categorical variables
- Standardize the magnitude of numeric features using a scaler
- Split your data into testing and training datasets

In [54]:
#Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [55]:
#Load the 2019 loan feature table
loan_csv_path = 'Data/loan_2019_feature.csv'
df_2019 = pd.read_csv(loan_csv_path)

In [56]:
#Preview the first rows, transposed so each column reads as one line
df_2019.head().T

Unnamed: 0,0,1,2,3,4
Year,2019,2019,2019,2019,2019
LoanCharacteristicsID,2407198,2407199,2407200,2407201,2407202
Bank,Boston,Boston,Boston,Boston,Boston
FIPSStateNumericCode,50,50,50,50,23
FIPSCountyCode,7,7,15,17,31
CoreBasedStatisticalAreaCode,15540,15540,99999,30100,38860
CensusTractIdentifier,34,29,9532,9594,254
CensusTractMinorityRatioPercent,4.577,3.421,6.152,3.935,6.031
CensusTractMedFamIncomeAmount,132474,100795,49625,62643,87679
LocalAreaMedianIncomeAmount,84290,84290,67566,67105,76445


In [57]:
#Load the lookup table that tags each column with a handling type
col_dtype_path = 'Data/col_dtype.csv'
col_dtype = pd.read_csv(col_dtype_path)

In [58]:
# Display the Column_Name / Type lookup table loaded from col_dtype.csv.
col_dtype

Unnamed: 0,Column_Name,Type
0,Year,drop
1,LoanCharacteristicsID,drop
2,Bank,cat
3,FIPSStateNumericCode,no_cn
4,FIPSCountyCode,no_cn
5,CoreBasedStatisticalAreaCode,no_cn
6,CensusTractIdentifier,no_cn
7,CensusTractMinorityRatioPercent,num
8,CensusTractMedFamIncomeAmount,drop
9,LocalAreaMedianIncomeAmount,num


In [59]:
#Function to convert col into categorical
def convert_to_cat(df_2019, col):
    """Cast column ``col`` of ``df_2019`` to the pandas ``category`` dtype.

    The frame passed in is modified in place; the same object is also
    returned so the call can be used in an assignment.
    """
    as_category = df_2019[col].astype('category')
    df_2019[col] = as_category
    return df_2019

In [60]:
#Drop rows with No race info.
# Keep only rows whose Borrower1Race1Type is something other than 'No Information'.
has_race_info = df_2019["Borrower1Race1Type"].ne('No Information')
df_2019 = df_2019.loc[has_race_info]
df_2019.shape

(82415, 55)

In [61]:
#Convert Categorical variables
# Work on a copy so df_2019 itself is not modified by the steps below.
df_2019_dumm = df_2019.copy()
# Walk the (column name, type tag) pairs of the col_dtype lookup table.
for col, kind in zip(col_dtype["Column_Name"], col_dtype["Type"]):
    if kind == 'drop':
        df_2019_dumm = df_2019_dumm.drop(columns=[col])
    elif kind == 'cat' and col != "Borrower1Race1Type":
        # Borrower1Race1Type is deliberately skipped and keeps its original dtype.
        df_2019_dumm = convert_to_cat(df_2019_dumm, col)

In [62]:
# Display the frame after the 'drop' columns were removed and the 'cat' columns were cast.
df_2019_dumm

Unnamed: 0,Bank,FIPSStateNumericCode,FIPSCountyCode,CoreBasedStatisticalAreaCode,CensusTractIdentifier,CensusTractMinorityRatioPercent,LocalAreaMedianIncomeAmount,TotalMonthlyIncomeAmount,HUDMedianIncomeAmount,LoanAcquisitionActualUPBAmt,...,Borrower1CreditScoreValue,Borrower2CreditScoreValue,PMICoveragePercent,EmploymentBorrowerSelfEmployed,PropertyType,Borrower1EthnicityType,HOEPALoanStatusType,LienPriorityType,TotalYearlyIncomeAmount,Diff_median_yearly_income
0,Boston,50,7,15540,34.0,4.577,84290.0,7308,93000,319113,...,1,9,0.0,1,PT01,2,2,1,87696,3406.0
1,Boston,50,7,15540,29.0,3.421,84290.0,7416,93000,248005,...,3,2,0.0,1,PT01,2,2,1,88992,4702.0
2,Boston,50,15,99999,9532.0,6.152,67566.0,5143,71900,211105,...,2,3,0.0,1,PT01,2,2,1,61716,-5850.0
3,Boston,50,17,30100,9594.0,3.935,67105.0,8866,71900,210000,...,4,2,0.0,1,PT01,2,2,1,106392,39287.0
4,Boston,23,31,38860,254.0,6.031,76445.0,15240,90100,233200,...,5,9,0.0,1,PT01,2,2,1,182880,106435.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89762,Topeka,31,111,35820,9599.0,17.119,69210.0,3275,70000,133000,...,5,9,30.0,0,PT01,2,2,1,39300,-29910.0
89763,Topeka,31,111,35820,9606.0,5.698,69210.0,11500,70000,295000,...,5,5,6.0,0,PT01,2,2,1,138000,68790.0
89764,Topeka,31,13,99999,9512.0,13.887,73964.0,4298,75200,77250,...,5,5,0.0,0,PT01,2,2,1,51576,-22388.0
89765,Topeka,31,111,35820,9598.0,6.373,69210.0,6810,70000,207100,...,4,4,30.0,1,PT01,2,2,1,81720,12510.0


In [63]:
# Inspect the dtype of every remaining column to confirm the casts took effect.
df_2019_dumm.dtypes

Bank                               category
FIPSStateNumericCode                  int64
FIPSCountyCode                        int64
CoreBasedStatisticalAreaCode          int64
CensusTractIdentifier               float64
CensusTractMinorityRatioPercent     float64
LocalAreaMedianIncomeAmount         float64
TotalMonthlyIncomeAmount              int64
HUDMedianIncomeAmount                 int64
LoanAcquisitionActualUPBAmt           int64
LTVRatioPercent                     float64
NoteDate                           category
LoanAcquistionDate                 category
LoanPurposeType                    category
ProductCategoryName                category
MortgageType                       category
ScheduledTotalPaymentCount            int64
MortgageLoanSellerInstType         category
BorrowerCount                      category
BorrowerFirstTimeHomebuyer         category
Borrower1Race1Type                   object
Borrower1GenderType                category
Borrower2GenderType             

In [65]:
#Check NA values
import numpy as np
# Count missing values per column. DataFrame.sum() reduces column-wise by
# default, whereas np.sum(frame) forwards axis=None, which recent pandas
# versions deprecate and will eventually reduce to a single scalar.
df_2019_dumm.isna().sum()

Bank                               0
FIPSStateNumericCode               0
FIPSCountyCode                     0
CoreBasedStatisticalAreaCode       0
CensusTractIdentifier              0
CensusTractMinorityRatioPercent    0
LocalAreaMedianIncomeAmount        6
TotalMonthlyIncomeAmount           0
HUDMedianIncomeAmount              0
LoanAcquisitionActualUPBAmt        0
LTVRatioPercent                    0
NoteDate                           0
LoanAcquistionDate                 0
LoanPurposeType                    0
ProductCategoryName                0
MortgageType                       0
ScheduledTotalPaymentCount         0
MortgageLoanSellerInstType         0
BorrowerCount                      0
BorrowerFirstTimeHomebuyer         0
Borrower1Race1Type                 0
Borrower1GenderType                0
Borrower2GenderType                0
Borrower1AgeAtApplicationYears     0
Borrower2AgeAtApplicationYears     0
PropertyUsageType                  0
PropertyUnitCount                  0
N

In [66]:
#Drop NA rows
# Only rows missing LocalAreaMedianIncomeAmount are removed.
data = df_2019_dumm.dropna(subset=["LocalAreaMedianIncomeAmount"])

In [67]:
#Import Libraries
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [68]:
#Train test Split
# Separate the target column from the features, then split with a fixed seed.
target_col = 'Borrower1Race1Type'
y = data[target_col]
X = data.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [72]:
# One-hot encode only the category-dtype columns and pass every other column
# through unchanged. Fitting a bare OneHotEncoder on the whole frame also
# encodes the continuous columns, producing one indicator per distinct value.
# handle_unknown='ignore' lets the fitted encoder transform rows whose
# category was not present during fit instead of raising.
enc = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'),
      make_column_selector(dtype_include='category'))],
    remainder='passthrough',
)
X_train_end = enc.fit_transform(X_train)

In [74]:
# Fit the scaler on the encoded training matrix, then apply it.
# with_mean=False scales to unit variance without centering the features.
scaler = StandardScaler(with_mean=False).fit(X_train_end)
X_train_scaled = scaler.transform(X_train_end)

In [77]:
# Preprocess by column type instead of one-hot encoding every column:
#   - category-dtype columns -> one-hot indicators; categories unseen during
#     fit are ignored, so scoring on the test split does not raise
#   - numeric columns        -> standardized
# NOTE(review): integer code columns (e.g. FIPS codes) are selected as numeric
# here — confirm that is the intended treatment.
preprocess = ColumnTransformer([
    ('enc', OneHotEncoder(handle_unknown='ignore'),
     make_column_selector(dtype_include='category')),
    ('scaler', StandardScaler(),
     make_column_selector(dtype_include='number')),
])
pipe = Pipeline([('preprocess', preprocess), ('svc', SVC())])

In [None]:
# Fit the whole pipeline on the training split.
# NOTE(review): this cell has no execution count in the saved notebook, so the
# fit may never have completed — confirm before relying on the score cell.
pipe.fit(X_train, y_train)

In [53]:
# Score the fitted pipeline on the held-out split.
# NOTE(review): the output saved under this cell is a sparse-matrix repr, not
# a score — it looks stale; re-run to refresh it.
pipe.score(X_test, y_test)

<82409x184917 sparse matrix of type '<class 'numpy.float64'>'
	with 3378769 stored elements in Compressed Sparse Row format>

In [13]:
#Save the dataframe
# NOTE(review): this writes df_2019_dumm rather than the NA-filtered `data`
# frame, and a CSV round-trip does not retain category dtypes — confirm this
# is the intended artifact.
df_2019_dumm.to_csv("Preprocessed_data.csv",index=False)

# Train Test Split

In [15]:

# List every feature column name. Only a cell's last expression is rendered,
# so the column list is the sole statement in this cell.
list(X.columns)

['FIPSStateNumericCode',
 'FIPSCountyCode',
 'CoreBasedStatisticalAreaCode',
 'CensusTractIdentifier',
 'CensusTractMinorityRatioPercent',
 'CensusTractMedFamIncomeAmount',
 'LocalAreaMedianIncomeAmount',
 'TotalMonthlyIncomeAmount',
 'HUDMedianIncomeAmount',
 'LoanAcquisitionActualUPBAmt',
 'LTVRatioPercent',
 'ScheduledTotalPaymentCount',
 'Borrower1AgeAtApplicationYears',
 'Borrower2AgeAtApplicationYears',
 'PropertyUnitCount',
 'NoteRatePercent',
 'NoteAmount',
 'HousingExpenseRatioPercent',
 'TotalDebtExpenseRatioPercent',
 'PMICoveragePercent',
 'TotalYearlyIncomeAmount',
 'Diff_median_yearly_income',
 'Boston',
 'Chicago',
 'Cincinnati',
 'Dallas',
 'Des Moines',
 'Indianapolis',
 'New York',
 'Pittsburgh',
 'San Francisco',
 'Topeka',
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2019,
 1,
 2,
 6,
 1,
 0,
 1,
 2,
 3,
 1,
 9,
 1,
 2,
 3,
 4,
 5,
 0,
 1,
 1,
 2,
 3,
 6,
 1,
 2,
 3,
 4,
 6,
 1,
 2,
 1,
 2,
 3,
 4,
 5,
 9,
 1,
 2,
 3,
 4,
 5,
 9,
 0,
 1,
 'PT01',
 'PT02',
 'PT03',
 'P

In [16]:
# Split features and target into train and test sets with a fixed seed.
# NOTE(review): this repeats an earlier split, and the column list printed
# above suggests X is a dummy-encoded frame here — confirm which X is in scope.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardization

In [17]:
from sklearn import preprocessing
# Learn the scaling parameters from the training split only, then apply them.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [18]:
# (rows, columns) of the scaled training matrix.
X_train_scaled.shape

(61811, 101)

In [19]:
#Scale test data
# Reuse the scaler fitted on the training split (transform only, no refit) so
# the test data is scaled with the training statistics.
X_test_scaled = scaler.transform(X_test)

In [20]:
# (rows, columns) of the scaled test matrix.
X_test_scaled.shape

(20604, 101)

In [21]:
# Display the target series.
y

0        White
1        White
2        White
3        White
4        White
         ...  
89762    White
89763    White
89764    White
89765    White
89766    White
Name: Borrower1Race1Type, Length: 82415, dtype: object