# Introduction
In this notebook , we are going to do following steps. 
- Create dummy or indicator features for categorical variables
- Standardize the magnitude of numeric features using a scaler
- Split your data into testing and training datasets

In [105]:
#Import Libraries
import pandas as pd

In [106]:
#Load the data
df_2019=pd.read_csv('Data/2019_data.csv')

In [107]:
#Exploring the data
df_2019.head().transpose()

Unnamed: 0,0,1,2,3,4
Year,2019,2019,2019,2019,2019
LoanCharacteristicsID,2407198,2407199,2407200,2407201,2407202
Bank,Boston,Boston,Boston,Boston,Boston
FIPSStateNumericCode,50,50,50,50,23
FIPSCountyCode,7,7,15,17,31
CoreBasedStatisticalAreaCode,15540,15540,99999,30100,38860
CensusTractIdentifier,34,29,9532,9594,254
CensusTractMinorityRatioPercent,4.577,3.421,6.152,3.935,6.031
CensusTractMedFamIncomeAmount,132474,100795,49625,62643,87679
LocalAreaMedianIncomeAmount,84290,84290,67566,67105,76445


In [115]:
#Load the data
col_dtype=pd.read_csv('Data/col_dtype.csv')

In [116]:
col_dtype.T.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
Column_Name,Year,LoanCharacteristicsID,Bank,FIPSStateNumericCode,FIPSCountyCode,CoreBasedStatisticalAreaCode,CensusTractIdentifier,CensusTractMinorityRatioPercent,CensusTractMedFamIncomeAmount,LocalAreaMedianIncomeAmount,...,Borrower1Race3Type,Borrower1Race4Type,Borrower1Race5Type,Borrower2EthnicityType,Borrower2Race2Type,Borrower2Race3Type,Borrower2Race4Type,Borrower2Race5Type,HOEPALoanStatusType,LienPriorityType
Type,no_cn,no_cn,cat,no_cn,no_cn,no_cn,no_cn,num,num,num,...,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat


In [117]:
#Function to convert col into categorical
def convert_to_cat(df_2019,col):
    df_2019[col]=df_2019[col].astype('category')
    return df_2019

In [118]:
df_2019_dumy=df_2019.copy()
for i in range(col_dtype.shape[0]):
    if col_dtype.iloc[i,:].Type == 'cat':
        col=col_dtype.iloc[i,:].Column_Name
        print(col)
        print(df_2019[col].unique())

Bank
['Boston', 'Chicago', 'Cincinnati', 'Dallas', 'Des Moines', 'Indianapolis', 'New York', 'Pittsburgh', 'San Francisco', 'Topeka']
Categories (10, object): ['Boston', 'Chicago', 'Cincinnati', 'Dallas', ..., 'New York', 'Pittsburgh', 'San Francisco', 'Topeka']
NoteDate
[2018, 2019, 2017, 2016, 2014, 2015]
Categories (6, int64): [2018, 2019, 2017, 2016, 2014, 2015]
LoanAcquistionDate
[2019]
Categories (1, int64): [2019]
LoanPurposeType
[1, 6, 2]
Categories (3, int64): [1, 6, 2]
ProductCategoryName
[1]
Categories (1, int64): [1]
MortgageType
[1, 2, 0, 3]
Categories (4, int64): [1, 2, 0, 3]
MortgageLoanSellerInstType
[1, 9]
Categories (2, int64): [1, 9]
BorrowerCount
[1, 2, 3, 4, 5]
Categories (5, int64): [1, 2, 3, 4, 5]
BorrowerFirstTimeHomebuyer
[1, 0]
Categories (2, int64): [1, 0]
Borrower1Race1Type
[5, 3, 6, 2, 4, 1]
Categories (6, int64): [5, 3, 6, 2, 4, 1]
Borrower2Race1Type
[8, 5, 6, 2, 3, 1, 4, 7]
Categories (8, int64): [8, 5, 6, 2, 3, 1, 4, 7]
Borrower1GenderType
[2, 1, 3, 6]
C

In [119]:
#Convert Categorical variables to dummy variable
df_2019_dumm=df_2019.copy()
for i in range(col_dtype.shape[0]):
    if col_dtype.iloc[i,:].Type == 'cat':
        col=col_dtype.iloc[i,:].Column_Name
        df_2019=convert_to_cat(df_2019,col)
        just_dummies = pd.get_dummies(df_2019[col])
        df_2019_dumm = pd.concat([df_2019_dumm, just_dummies], axis=1) 
        df_2019_dumm.drop([col], inplace=True, axis=1)

In [120]:
df_2019_dumm.head()

Unnamed: 0,Year,LoanCharacteristicsID,FIPSStateNumericCode,FIPSCountyCode,CoreBasedStatisticalAreaCode,CensusTractIdentifier,CensusTractMinorityRatioPercent,CensusTractMedFamIncomeAmount,LocalAreaMedianIncomeAmount,TotalMonthlyIncomeAmount,...,6,7,8,6.1,7.1,8.1,1,2,3,1.1
0,2019,2407198,50,7,15540,34.0,4.577,132474,84290,7308,...,0,0,1,0,0,1,0,1,0,1
1,2019,2407199,50,7,15540,29.0,3.421,100795,84290,7416,...,0,1,0,0,1,0,0,1,0,1
2,2019,2407200,50,15,99999,9532.0,6.152,49625,67566,5143,...,0,1,0,0,1,0,0,1,0,1
3,2019,2407201,50,17,30100,9594.0,3.935,62643,67105,8866,...,0,1,0,0,1,0,0,1,0,1
4,2019,2407202,23,31,38860,254.0,6.031,87679,76445,15240,...,0,0,1,0,0,1,0,1,0,1


# Standardization


In [121]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(df_2019_dumm)
X_scaled = scaler.transform(df_2019_dumm)

In [123]:
X_scaled

array([[ 0.        , -1.68283228,  1.30966108, ...,  0.05683172,
        -0.05573623,  0.        ],
       [ 0.        , -1.68279518,  1.30966108, ...,  0.05683172,
        -0.05573623,  0.        ],
       [ 0.        , -1.68275807,  1.30966108, ...,  0.05683172,
        -0.05573623,  0.        ],
       ...,
       [ 0.        ,  1.46275451,  0.07326502, ...,  0.05683172,
        -0.05573623,  0.        ],
       [ 0.        ,  1.46279162,  0.07326502, ...,  0.05683172,
        -0.05573623,  0.        ],
       [ 0.        ,  1.46282873,  0.07326502, ...,  0.05683172,
        -0.05573623,  0.        ]])