<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [None]:
%%javascript
// Fetch and execute a table-of-contents helper script in the notebook page.
// NOTE(review): this runs remote code from an external host and relies on a
// global `$` (presumably jQuery) being available in the front end — confirm both.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

In [None]:
# Show every column and row when a DataFrame is displayed, for a smoother workflow.
# pandas is imported here explicitly because this cell runs before the notebook's
# import cell; without it `pd` is undefined on a fresh kernel.
import pandas as pd

pd.set_option('display.max_columns', None)
# With max_rows=None a bare DataFrame expression prints every row; prefer .head().
pd.set_option('display.max_rows', None)

# Libraries <a class="anchor" id="first-bullet"></a>

In [None]:
# data processing
from modules import *

# HMDA Data Loading <a class="anchor" id="second-bullet"></a>

In [None]:
# Load the initial HMDA dataset for California. Loan type (first-lien conventional)
# and loan purpose (purchase) were pre-selected when the file was downloaded.
ca_float_columns = ['applicant_ethnicity-1', 'applicant_race-1', 'census_tract',
                    'co-applicant_ethnicity-1', 'co-applicant_race-1', 'loan_amount']
y = pd.read_csv(
    "/content/drive/My Drive/state_CA_loan_types_1_loan_purposes_1.csv",
    # Force these columns to float64 instead of letting pandas infer their type.
    dtype=dict.fromkeys(ca_float_columns, 'float64'),
    low_memory=False,
)

In [None]:
# Load the matching HMDA extract for New York with the same column typing.
ny_float_columns = ['applicant_ethnicity-1', 'applicant_race-1', 'census_tract',
                    'co-applicant_ethnicity-1', 'co-applicant_race-1', 'loan_amount']
ny = pd.read_csv(
    "/content/drive/My Drive/state_NY_loan_types_1_loan_purposes_1.csv",
    dtype=dict.fromkeys(ny_float_columns, 'float64'),
    low_memory=False,
)

In [None]:
# Stack the California and New York frames into one table with a fresh 0..n-1 index.
state_frames = [y, ny]
cany = pd.concat(state_frames, ignore_index=True, sort=False)


In [None]:
# Preview the first rows of the combined CA + NY HMDA frame.
cany.head()

In [None]:
# Create a sequential HMDA_ID (1..n) for each individual loan, as the first column.
# DataFrame.insert raises if the column already exists, so guard it: this keeps the
# cell safe to re-run without restarting the kernel.
if 'HMDA_ID' not in cany.columns:
    cany.insert(0, 'HMDA_ID', range(1, 1 + len(cany)))

In [None]:
# Read the census-tract to ZIP-code crosswalk file.
crosswalk_path = '/content/drive/My Drive/TRACT_ZIP_122018.xlsx'
equiv = pd.read_excel(crosswalk_path)

In [None]:
# Preview the first rows of the tract-to-ZIP crosswalk.
equiv.head()

In [None]:
# Attach crosswalk rows to each loan by census tract. The default inner join keeps
# only loans whose tract appears in the crosswalk.
HMDA_CANY = cany.merge(equiv, left_on='census_tract', right_on='tract')

In [None]:
# Write the merged HMDA table to a CSV in the working directory, then hand the
# file to `files.download` (presumably the Colab download helper — confirm).
hmda_export_name = 'HMDA_CA+NY_2018.csv'
HMDA_CANY.to_csv(hmda_export_name, index=False)
files.download(hmda_export_name)

In [None]:
# Number of distinct loans that survived the crosswalk merge.
HMDA_CANY['HMDA_ID'].nunique(dropna=False)

After the merge, the number of unique `HMDA_ID` values drops from roughly 1 million to 690,331, because the inner join keeps only loans whose census tract appears in the crosswalk.
The crosswalk also creates many one-to-many cases, since a single census tract can translate to several ZIP codes.


In [None]:
# get the translation dictionary 
#equiv_dic = equiv.groupby('tract')['zip'].apply(list).to_dict()

# CoreLogic Dataset Loading

In [None]:
# Load the pipe-delimited CoreLogic origination extract (January 2019 update) lazily with dask.
# Column types are pinned explicitly rather than inferred.
logic_float_cols = ['cbsa', 'maturity_date', 'msa', 'original_term', 'origination_date',
                    'first_payment_date', 'number_of_units']
logic_object_cols = ['loan_purpose', 'loan_type', 'payment_frequency', 'product_type',
                     'property_zip', 'documentation_type', 'gse_eligible_flag']
logic_dtypes = {**dict.fromkeys(logic_float_cols, 'float64'),
                **dict.fromkeys(logic_object_cols, 'object')}
Logic = dd.read_csv(
    "/content/drive/My Drive/Recent_Origination_Firsts_201901_361.txt",
    sep='|',
    dtype=logic_dtypes,
    low_memory=False,
)

In [None]:
# Select the California subset and materialise it as an in-memory frame.
is_california = Logic['state'] == 'CA'
Logic_CA = Logic[is_california].compute()

In [None]:
# Select the New York subset and materialise it as an in-memory frame.
is_new_york = Logic['state'] == 'NY'
Logic_NY = Logic[is_new_york].compute()

In [None]:
# Report how many rows the California subset contains (thousands-separated).
# The author's note: it is still a very large table at this point.
print(f'Number of rows: {len(Logic_CA):,}.')

In [None]:
# Restrict to loans originated in 2018: 201801 <= origination_date < 201901.
# NOTE(review): origination_date looks like a YYYYMM number — confirm the encoding.
ca_originated_2018 = (Logic_CA['origination_date'] >= 201801) & (Logic_CA['origination_date'] < 201901)
Logic_CA_2018 = Logic_CA[ca_originated_2018]

In [None]:
# Save the 2018 California subset as CSV for later reuse and trigger a download.
ca_export_name = 'CA_2018_complete.csv'
Logic_CA_2018.to_csv(ca_export_name, index=False)
files.download(ca_export_name)

In [None]:
# Restrict the New York subset to 2018 originations: 201801 <= origination_date < 201901.
ny_originated_2018 = (Logic_NY['origination_date'] >= 201801) & (Logic_NY['origination_date'] < 201901)
Logic_NY_2018 = Logic_NY[ny_originated_2018]

In [None]:
# Save the 2018 New York subset as CSV and trigger a download.
ny_export_name = 'Logic_NY_2018.csv'
Logic_NY_2018.to_csv(ny_export_name, index=False)
files.download(ny_export_name)

In [None]:
# Reload the 2018 New York CoreLogic subset from Drive with explicit column types.
# NOTE(review): the CSV is written to the working directory earlier in the notebook
# but read from Drive here — presumably it was uploaded manually in between.
ny_float_cols = ['cbsa', 'maturity_date', 'msa', 'original_term', 'origination_date',
                 'first_payment_date', 'number_of_units']
ny_object_cols = ['loan_purpose', 'loan_type', 'payment_frequency', 'product_type',
                  'property_zip', 'documentation_type', 'gse_eligible_flag']
ny_dtypes = {**dict.fromkeys(ny_float_cols, 'float64'),
             **dict.fromkeys(ny_object_cols, 'object')}
core_logic_ny_2018 = pd.read_csv(
    "/content/drive/My Drive/Logic_NY_2018.csv",
    dtype=ny_dtypes,
    low_memory=False,
)

In [None]:
# Reload the 2018 California CoreLogic subset from Drive with explicit column types.
# NOTE(review): the file name carries a " (1)" suffix, unlike the name used when the
# CSV was exported — presumably a renamed duplicate upload; confirm it is the right file.
ca_float_cols = ['cbsa', 'maturity_date', 'msa', 'original_term', 'origination_date',
                 'first_payment_date', 'number_of_units']
ca_object_cols = ['loan_purpose', 'loan_type', 'payment_frequency', 'product_type',
                  'property_zip', 'documentation_type', 'gse_eligible_flag']
ca_dtypes = {**dict.fromkeys(ca_float_cols, 'float64'),
             **dict.fromkeys(ca_object_cols, 'object')}
core_logic_ca_2018 = pd.read_csv(
    "/content/drive/My Drive/CA_2018_complete (1).csv",
    dtype=ca_dtypes,
    low_memory=False,
)

In [None]:
# Combine the New York and California CoreLogic subsets under a fresh 0..n-1 index.
corelogic_state_frames = [core_logic_ny_2018, core_logic_ca_2018]
core_logic_cany_2018 = pd.concat(corelogic_state_frames, ignore_index=True, sort=False)

In [None]:
# Keep only the CoreLogic columns used for matching and modelling.
corelogic_useful_columns = [
    'loan_id', 'property_zip', 'state', 'property_type', 'number_of_units', 'occupancy_type',
    'origination_date', 'sale_price', 'appraised_value', 'original_balance', 'original_term',
    'loan_purpose', 'loan_type', 'original_ltv', 'fico_score_at_origination', 'lien',
    'msa', 'loan_purpose_category', 'initial_interest_rate',
]
core_logic_cany_2018 = core_logic_cany_2018[corelogic_useful_columns]

# Data manipulation

## HMDA

I further restrict the dataset to specific loan types to keep the analysis simple (detailed discussion in the draft). I may explore and include more loan types in the next iteration.

In [None]:
# HMDA applications that were actually originated (action_taken == 1).
was_originated = HMDA_CANY['action_taken'] == 1
HMDA_originated = HMDA_CANY.loc[was_originated]

In [None]:
# Every application whose action_taken is anything other than 1.
not_originated = HMDA_CANY['action_taken'] != 1
rejected = HMDA_CANY.loc[not_originated]

In [None]:
# Share of applications per derived race whose action_taken is not 1.
# NOTE(review): `rejected` holds every action_taken != 1, which presumably covers
# outcomes other than an outright denial — confirm against the HMDA code list.
total = HMDA_CANY['derived_race'].value_counts()
n_rejected = rejected['derived_race'].value_counts()
# Both counts are indexed by race label, so the division aligns group by group.
# The result is a fraction in [0, 1], even though the plot labels say "Percentage".
reject_p = np.divide(n_rejected, total)
# Print only after reject_p exists; printing first raised NameError on a fresh kernel.
print(reject_p)
reject_p.plot(kind='bar')
plt.title('Percentage of Rejection for Different Racial Groups')
plt.xlabel('Racial Groups')
plt.ylabel('Percentage of Rejection')
plt.show()

In [None]:
# Keep only first-lien conventional loans among the originated HMDA records.
is_conventional_first_lien = HMDA_originated['derived_loan_product_type'] == 'Conventional:First Lien'
HMDA_originated_1st = HMDA_originated.loc[is_conventional_first_lien]

In [None]:
# Keep only site-built single-family (1-4 unit) dwellings.
is_single_family_site_built = (
    HMDA_originated_1st['derived_dwelling_category'] == 'Single Family (1-4 Units):Site-Built'
)
HMDA_originated_1st = HMDA_originated_1st.loc[is_single_family_site_built]

In [None]:
# Keep non-business loans only (business_or_commercial_purpose == 2).
is_non_business = HMDA_originated_1st['business_or_commercial_purpose'] == 2
HMDA_originated_1st = HMDA_originated_1st.loc[is_non_business]

## Logic

In [None]:
# Keep CoreLogic loans whose loan_type is '1'
# (the author's label: conventional, first-lien mortgage loans).
is_loan_type_one = core_logic_cany_2018['loan_type'] == '1'
core_logic_cany_20181st = core_logic_cany_2018.loc[is_loan_type_one]

In [None]:
# Restrict to purchase loans (loan_purpose_category == 'P').
is_purchase = core_logic_cany_20181st['loan_purpose_category'] == 'P'
core_logic_cany_20181st = core_logic_cany_20181st.loc[is_purchase]

In [None]:
# Remove the non-numeric placeholder ZIP values, then convert property_zip to a
# numeric column so it can be compared with the crosswalk's ZIP codes.
placeholder_zips = ['CA000', 'NY000']
has_real_zip = ~core_logic_cany_20181st['property_zip'].isin(placeholder_zips)
core_logic_cany_20181st = core_logic_cany_20181st.loc[has_real_zip].assign(
    property_zip=lambda frame: pd.to_numeric(frame['property_zip'])
)

In [None]:
# Round the CoreLogic balance to the nearest 1,000 to build the loan-amount merge key.
# NOTE(review): confirm that HMDA loan_amount uses the same rounding; if HMDA
# discloses amounts on a coarser grid, exact-equality matching will miss loans.
rounded_balance = core_logic_cany_20181st['original_balance'].round(-3)
core_logic_cany_20181st = core_logic_cany_20181st.assign(loan_rounded=rounded_balance)

In [None]:
# Drop rows without a rounded loan amount (it is a merge key), and keep only
# property_type '1' (the author's label: single-family house).
has_rounded_amount = core_logic_cany_20181st['loan_rounded'].notna()
is_property_type_one = core_logic_cany_20181st['property_type'] == '1'
core_logic_cany_20181st = core_logic_cany_20181st[has_rounded_amount & is_property_type_one]

### Merge

In [None]:
# Candidate matches: pair every HMDA loan with every CoreLogic loan that has the
# same loan amount and the same ZIP code (inner join, so unmatched rows vanish).
merged = HMDA_originated_1st.merge(
    core_logic_cany_20181st,
    left_on=['loan_amount', 'zip'],
    right_on=['loan_rounded', 'property_zip'],
)

In [None]:
# Summarise the candidate-match table (row count, columns, dtypes).
# Author's observation: we are left with almost 400,000 potential matches.
merged.info()

In [None]:
# Number of distinct HMDA loans that have at least one candidate match.
merged['HMDA_ID'].nunique(dropna=False)

In [None]:
# Number of distinct CoreLogic loans that have at least one candidate match.
merged['loan_id'].nunique(dropna=False)

In [None]:
# Reorder the merged table so that comparable HMDA / CoreLogic fields sit side by
# side, which makes visual comparison easier.
comparison_columns = [
    'HMDA_ID', 'loan_id', 'loan_to_value_ratio', 'original_ltv', 'loan_amount', 'original_balance', 'loan_rounded',
    'zip', 'property_zip', 'property_value', 'appraised_value', 'sale_price', 'initial_interest_rate', 'interest_rate',
    'state', 'property_type', 'number_of_units', 'occupancy_type_x', 'origination_date', 'original_term', 'loan_purpose_x', 'loan_type_x',
    'fico_score_at_origination', 'lien', 'msa', 'loan_purpose_category', 'derived_msa-md', 'census_tract', 'derived_loan_product_type',
    'derived_dwelling_category', 'derived_ethnicity', 'derived_race', 'action_taken', 'loan_type_y', 'loan_purpose_y', 'lien_status',
    'business_or_commercial_purpose', 'loan_term', 'occupancy_type_y', 'debt_to_income_ratio', 'denial_reason-1', 'income',
]
merged_organized = merged[comparison_columns]

### Filtering based on loan to value ratio

In [None]:
# Convert loan_to_value_ratio from object to float. Rows holding the literal
# 'Exempt' cannot be converted, so they are removed first.
ltv_not_exempt = merged_organized['loan_to_value_ratio'] != 'Exempt'
merged_organized = merged_organized.loc[ltv_not_exempt].assign(
    loan_to_value_ratio=lambda frame: frame['loan_to_value_ratio'].astype(float)
)

In [None]:
# Absolute gap between the HMDA and the CoreLogic loan-to-value ratio.
ltv_gap = (merged_organized['loan_to_value_ratio'] - merged_organized['original_ltv']).abs()
merged_organized = merged_organized.assign(difference_ltv=ltv_gap)

In [None]:
# Keep candidate pairs whose LTV gap is below 1; pairs with a gap of 1 or more,
# or with a missing gap, are discarded.
ltv_gap_small = merged_organized['difference_ltv'].lt(1)
merged_organized_drop = merged_organized[ltv_gap_small]

In [None]:
# Preview the candidate pairs that passed the LTV filter.
merged_organized_drop.head()

In [None]:
# Convert loan_term from object to float, after removing rows holding the
# literal 'Exempt', which cannot be converted.
term_not_exempt = merged_organized_drop['loan_term'] != 'Exempt'
merged_organized_drop = merged_organized_drop.loc[term_not_exempt].assign(
    loan_term=lambda frame: frame['loan_term'].astype(float)
)

In [None]:
# Keep only candidate pairs whose loan term is identical in both datasets.
same_term = merged_organized_drop['loan_term'].eq(merged_organized_drop['original_term'])
merged_organized_drop_equal = merged_organized_drop.loc[same_term]

As we can see, these matched loans have the same loan amount, ZIP code, and loan term, as well as very close LTV ratios.

### Find Non-duplicates

In [None]:
# All rows whose CoreLogic loan_id appears more than once (every copy is kept).
loan_id_repeated = merged_organized_drop_equal.duplicated(subset=['loan_id'], keep=False)
duplicateRows = merged_organized_drop_equal[loan_id_repeated]

In [None]:
# Preview the candidate pairs that share a loan_id with at least one other pair.
duplicateRows.head()

In [None]:
# Rows whose CoreLogic loan_id occurs exactly once.
loan_id_is_repeated = merged_organized_drop_equal.duplicated(subset=['loan_id'], keep=False)
nonduplicate_loan_id = merged_organized_drop_equal[~loan_id_is_repeated]

In [None]:
# Preview the pairs whose loan_id is unique.
nonduplicate_loan_id.head()

In [None]:
# Among the pairs with a unique loan_id, keep those whose HMDA_ID is unique too.
hmda_id_is_repeated = nonduplicate_loan_id.duplicated(subset=['HMDA_ID'], keep=False)
nonduplicate_final = nonduplicate_loan_id[~hmda_id_is_repeated]

In [None]:
# Number of distinct HMDA_ID values in the non-duplicate dataset.
nonduplicate_final['HMDA_ID'].nunique(dropna=False)

In [None]:
# Number of distinct CoreLogic loan_id values in the non-duplicate dataset.
nonduplicate_final['loan_id'].nunique(dropna=False)

We are confident that `nonduplicate_final` is a uniquely matched dataset.

### Handle Duplicates

In [None]:
# Build the interest-rate gap used to rank competing candidate matches.
# duplicateRows is a filtered slice of another frame, so new columns are attached
# with .assign (which returns a new frame) instead of assigning into the slice or
# using a chained fillna(inplace=True), which may silently not write back.
hmda_rate = duplicateRows['interest_rate'].astype(float)
# Missing HMDA rates are replaced with the mean rate of the duplicated rows.
hmda_rate = hmda_rate.fillna(hmda_rate.mean())
duplicateRows = duplicateRows.assign(
    interest_rate=hmda_rate,
    # Absolute gap between the CoreLogic and the HMDA interest rate.
    interest_diff=(duplicateRows['initial_interest_rate'] - hmda_rate).abs(),
)

In [None]:
# Rank the duplicated pairs by interest-rate gap and, for each loan_id, keep only
# the pair with the smallest gap.
ranked_by_rate_gap = duplicateRows.sort_values('interest_diff')
deplicateRows = ranked_by_rate_gap.drop_duplicates(subset=['loan_id'], keep='first')

In [None]:
# After the previous step every CoreLogic loan_id occurs once in this frame;
# summarise its size, columns and dtypes.
deplicateRows.info()

In [None]:
# Number of distinct loan_id values left after de-duplication by loan_id.
deplicateRows['loan_id'].nunique(dropna=False)

In [None]:
# From the loan_id-unique pairs, keep those whose HMDA_ID also occurs only once.
hmda_id_repeats = deplicateRows.duplicated(subset=['HMDA_ID'], keep=False)
nonduplicate_hmda = deplicateRows[~hmda_id_repeats]

In [None]:
# Number of distinct HMDA_ID values among those pairs.
nonduplicate_hmda['HMDA_ID'].nunique(dropna=False)

In [None]:
# Number of distinct loan_id values among those pairs.
nonduplicate_hmda['loan_id'].nunique(dropna=False)

### Reduce duplicates HMDA

In [None]:
# Pairs whose HMDA_ID still appears more than once (every copy is kept).
hmda_id_shared = deplicateRows.duplicated(subset=['HMDA_ID'], keep=False)
HMDA_dupli = deplicateRows[hmda_id_shared]

In [None]:
# Same approach as for loan_id: within each group of pairs sharing an HMDA_ID,
# keep only the pair with the smallest interest-rate gap.
hmda_ranked_by_rate_gap = HMDA_dupli.sort_values('interest_diff')
hmda_sorted = hmda_ranked_by_rate_gap.drop_duplicates(subset=['HMDA_ID'], keep='first')

In [None]:
# Pool the three sets of resolved pairs into one table with a fresh index.
resolved_sets = [nonduplicate_final, nonduplicate_hmda, hmda_sorted]
nondup = pd.concat(resolved_sets, ignore_index=True)

In [None]:
# Final pass: if an HMDA_ID still occurs more than once in the pooled table, keep
# the row that sorts first by interest-rate gap. Rows without an interest_diff
# value are sorted to the end and therefore lose against rows that have one.
pooled_ranked = nondup.sort_values('interest_diff')
final_matched = pooled_ranked.drop_duplicates(subset=['HMDA_ID'], keep='first')

A uniquely matched dataset is finally here!

### Merge with CoreLogic Performance Dataset 

In [None]:
# Load the pipe-delimited CoreLogic loan performance extract lazily with dask,
# pinning the listed columns to float64.
performance_float_cols = ['last_paid_interest_date', 'loan_age', 'mba_days_delinquent',
                          'mba_worst_ever', 'ots_days_delinquent', 'ots_worst_ever']
Performance = dd.read_csv(
    "/content/drive/My Drive/Performance_Firsts_201904_364.txt",
    sep='|',
    dtype=dict.fromkeys(performance_float_cols, 'float64'),
    low_memory=False,
)

In [None]:
# Keep performance records with loan_age <= 20 (the author's intent: loans that
# roughly date from 2018) to shrink the table before the merge, then materialise
# the result in memory.
# NOTE(review): this rebinds `Performance` from a dask frame to a pandas frame,
# so the cell cannot be re-run without reloading the file first.
is_recent_loan = Performance['loan_age'] <= 20
Performance = Performance[is_recent_loan].compute()

In [None]:
# Final merge: attach the CoreLogic performance records to the matched
# HMDA-CoreLogic origination pairs via loan_id (inner join).
Merge_op = Performance.merge(final_matched, on=['loan_id'])

In [None]:
# Frequency of each delinquency status code in the merged table.
Merge_op['mba_delinquency_status'].value_counts()

### Comparison of matched dataset and the HMDA dataset before matching 

In [None]:
# Share of each derived race in the matched dataset (fractions summing to 1).
Merge_op['derived_race'].value_counts(normalize = True)

In [None]:
# Share of each derived race in the filtered HMDA sample before matching.
HMDA_originated_1st['derived_race'].value_counts(normalize = True)

In [None]:
# Income summary for the matched dataset, restricted to values between the 5th
# and 95th percentile so that extreme values do not dominate.
income_merged = Merge_op['income']
merged_income_p05 = income_merged.quantile(.05)
merged_income_p95 = income_merged.quantile(.95)
removed_outliers = income_merged.between(merged_income_p05, merged_income_p95)
income_merged[removed_outliers].describe()

In [None]:
# Income summary for the HMDA sample before matching, trimmed to the 5th-95th
# percentile range in the same way as the matched dataset.
# The baseline is HMDA_originated_1st (first-lien conventional, single-family,
# non-business), i.e. the frame that was actually merged with CoreLogic and the one
# used by the other comparisons in this section, not the unfiltered HMDA_originated.
HMDA_income = HMDA_originated_1st['income']
HMDA_outlier_removed = HMDA_income.between(HMDA_income.quantile(.05), HMDA_income.quantile(.95))
HMDA_income[HMDA_outlier_removed].describe()

In [None]:
# Histogram comparing the trimmed income distribution of the matched dataset
# with that of the HMDA sample.
income_series = [income_merged[removed_outliers], HMDA_income[HMDA_outlier_removed]]
fig, ax = plt.subplots()
ax.hist(income_series, density=True)
ax.set_title('Income Distribution in Merged and Original HMDA Datasets')
ax.set_xlabel('Income in Thousands')
ax.set_ylabel('Density')
plt.show()

In [None]:
# Black or African American borrowers in the matched and in the pre-match data.
black_label = 'Black or African American'
black_matched = Merge_op[Merge_op['derived_race'] == black_label]
black_HMDA = HMDA_originated_1st[HMDA_originated_1st['derived_race'] == black_label]

In [None]:
# Income series for both groups plus boolean masks selecting the values that lie
# between each series' own 5th and 95th percentile.
black_matched_income = black_matched['income']
black_matched_95 = black_matched_income.between(
    black_matched_income.quantile(.05), black_matched_income.quantile(.95)
)
black_HMDA_income = black_HMDA['income']
black_HMDA_income_95 = black_HMDA_income.between(
    black_HMDA_income.quantile(.05), black_HMDA_income.quantile(.95)
)

In [None]:
# Trimmed income summary for Black or African American borrowers, matched data.
black_matched_income[black_matched_95].describe()

In [None]:
# Trimmed income summary for Black or African American borrowers, pre-match HMDA data.
black_HMDA_income[black_HMDA_income_95].describe()

In [None]:
# Histogram of the trimmed income distributions for Black or African American
# borrowers: matched dataset versus pre-match HMDA sample.
black_income_series = [black_matched_income[black_matched_95],
                       black_HMDA_income[black_HMDA_income_95]]
fig, ax = plt.subplots()
ax.hist(black_income_series, density=True)
ax.set_title('Income Distribution in Merged and Original HMDA Datasets for African Americans')
ax.set_xlabel('Income in Thousands')
ax.set_ylabel('Density')
plt.show()

In [None]:
# White borrowers in the matched and in the pre-match data, their income series,
# and masks selecting values between each series' own 5th and 95th percentile.
white_label = 'White'
white_matched = Merge_op[Merge_op['derived_race'] == white_label]
white_HMDA = HMDA_originated_1st[HMDA_originated_1st['derived_race'] == white_label]

white_matched_income = white_matched['income']
white_matched_95 = white_matched_income.between(
    white_matched_income.quantile(.05), white_matched_income.quantile(.95)
)
white_HMDA_income = white_HMDA['income']
white_HMDA_income_95 = white_HMDA_income.between(
    white_HMDA_income.quantile(.05), white_HMDA_income.quantile(.95)
)

In [None]:
# Histogram of the trimmed income distributions for White borrowers: matched
# dataset versus pre-match HMDA sample.
white_income_series = [white_matched_income[white_matched_95],
                       white_HMDA_income[white_HMDA_income_95]]
fig, ax = plt.subplots()
ax.hist(white_income_series, density=True)
ax.set_title('Income Distribution in Merged and Original HMDA Datasets for White')
ax.set_xlabel('Income in Thousands')
ax.set_ylabel('Density')
plt.show()

In [None]:
#scatter_matrix(HMDA, figsize=(10, 10))

In [None]:
# Lower-triangle correlation heatmap between the minority indicator and
# selected covariates.
# NOTE(review): `trainingsample` is not created anywhere in the visible notebook;
# this cell fails on a fresh kernel unless the star import supplies it — confirm.
trainingsample_corr = trainingsample[['minority','rtdum','propertycounty','mi','paymentcredithistory','nounitsinproperty','dprop']]
corr_matrix = trainingsample_corr.corr()
# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
heatmap = sns.heatmap(corr_matrix, vmin=-1, vmax=1, cmap='BrBG', mask=mask)
heatmap.set_title('Triangle Correlation Heatmap Between Being Minority and Selected Covariates', fontdict={'fontsize':12}, pad=16)
plt.savefig('heatmap_race.png', dpi=1000, bbox_inches='tight')
plt.show()

## Predictive Algorithms

In [None]:
# Preview the first rows of the merged dataset used for modelling.
Merge_op.head()

In [None]:
# Cap the number of displayed columns at 70 before printing the missing-value table.
pd.options.display.max_columns = 70

In [None]:
# Per-column missing-value count and missing share for the merged dataset.
# The "%" column holds a fraction between 0 and 1, not a value out of 100.
missing_counts = Merge_op.isnull().sum()
percentage = pd.DataFrame({
    "Approved Missing No.": missing_counts,
    "Approved Missing %": missing_counts / len(Merge_op),
})

In [None]:
# Show the missing-value table as plain text.
print(percentage)

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(ratio='minority')
X_sm,y_sm = smote.fit_sample(x, y)
print(X_sm.shape[0] - x.shape[0], 'new random picked points')

### Without Race Prediction on Random Forest

In [None]:
# features we want to include in predicting uses
features = ['income','fico_score_at_origination','original_ltv','loan_amount','loan_type_x',
             'loan_purpose_x','mba_delinquency_status','loan_term','property_value']
# dataset with selected features
selected = Merge_op[features]

In [None]:
# have a quic look at the head
selected.head()

In [None]:
# drop null values
nonna = selected.dropna()
nonna.info()

In [None]:
# into two groups: delinquent nor not
nonna['mba_delinquency_status'].replace({'3':1,'6':1,'9':1,'F':1,'C':0,'S':0,'T':1,'0':0},inplace = True)
# features
x = nonna[['income','fico_score_at_origination','original_ltv','loan_amount','loan_term','property_value']]
#labels
y = nonna['mba_delinquency_status']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3,random_state = 1)

In [None]:
# The classifier needs integer class labels.
y_train = y_train.astype(int)
# Random forest model. random_state is fixed so that the cross-validated AUC is
# reproducible from run to run.
rf_clf = RandomForestClassifier(n_estimators=400,max_features=0.25,criterion="entropy",random_state=1)
# Scale inside the pipeline so the scaler is re-fitted within each CV fold.
pip_baseline = make_pipeline(RobustScaler(), rf_clf)
# Baseline performance: 10-fold cross-validated ROC AUC on the training split.
scores = cross_val_score(pip_baseline,x_train, y_train,scoring="roc_auc", cv=10)
print(f"Model's average AUC: {scores.mean():.3f}")

In [None]:
# Fit the forest on the full training split to obtain feature importances.
rf_clf.fit(x_train,y_train)

# Plot the feature importances, largest first. Bar positions are derived from the
# number of importances instead of a hard-coded count, so the plot keeps working
# if the feature list changes.
importances = rf_clf.feature_importances_
indices = np.argsort(importances)[::-1]
bar_positions = range(1, len(importances) + 1)
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, x.columns[indices], rotation=90)
plt.title("Feature Importance", {"fontsize": 16})
plt.show()

### Random Forest with Race

In [None]:
# Same preparation as for the race-blind model, except that ZIP code and a binary
# race indicator are now included as predictors (independent variables).
Merge_op['derived_race'] = Merge_op['derived_race'].astype('category')
features_2 = ['income','fico_score_at_origination','original_ltv','loan_amount','loan_type_x',
             'loan_purpose_x','mba_delinquency_status','loan_term','property_value','zip','derived_race']
selected_2 = Merge_op[features_2]

# Status codes collapsed to delinquent (1) / not delinquent (0).
delinquency_codes_2 = {'3':1,'6':1,'9':1,'F':1,'C':0,'S':0,'T':1,'0':0}
# Race groups kept for the analysis: White is coded 0, the other listed groups 1.
# Rows with any other derived_race value are dropped by the isin filter below.
minority_codes = {'White':0,'American Indian or Alaska Native':1,
                  'Black or African American':1,'Native Hawaiian or Other Pacific Islander':1}
array = ['White','American Indian or Alaska Native','Black or African American','Native Hawaiian or Other Pacific Islander']

nonna_2 = selected_2.dropna()
nonna_2 = nonna_2.loc[nonna_2['derived_race'].isin(array)]
# Recode via assignment rather than a chained replace(..., inplace=True), which may
# not write back to the frame. The race column is converted from categorical to
# plain objects before mapping so that the result is an ordinary integer column.
nonna_2 = nonna_2.assign(
    mba_delinquency_status=nonna_2['mba_delinquency_status'].replace(delinquency_codes_2).astype(int),
    derived_race=nonna_2['derived_race'].astype(object).map(minority_codes).astype(int),
)

feature_2 = ['income','fico_score_at_origination','original_ltv','loan_amount','loan_term','property_value','derived_race','zip']
x_2= nonna_2[feature_2]
y_2 = nonna_2['mba_delinquency_status']

In [None]:
# Oversample the minority class of the race-inclusive data with the SMOTE object
# configured earlier in the notebook. `fit_resample` is the current
# imbalanced-learn method; `fit_sample` was deprecated and later removed.
# (RandomOverSampler().fit_resample(x_2, y_2) is a drop-in alternative.)
x_2_sm,y_2_sm = smote.fit_resample(x_2, y_2)

In [None]:
# Split the resampled race-inclusive data 70/30 with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x_2_sm, y_2_sm, test_size=0.3,random_state = 1)
# The classifier needs integer class labels.
y_train = y_train.astype(int)
# Random forest with a fixed random_state so that scores and importances are
# reproducible. (class_weight="balanced" is an alternative to oversampling.)
rf_clf = RandomForestClassifier(n_estimators=400,max_features=0.25, criterion="entropy",random_state=1)

In [None]:
# Wrap scaling and the forest in one pipeline, then score it with
# cross-validated ROC AUC (default number of folds) on the training split.
pip_baseline = make_pipeline(RobustScaler(), rf_clf)
# Labels are cast to int because the model expects integer classes.
y_train = y_train.astype(int)
scores = cross_val_score(pip_baseline, x_train, y_train, scoring="roc_auc")
print(f"Model's average AUC: {scores.mean():.3f}")

In [None]:
# Fit the forest on the training split; the fitted model is kept as `my_model`.
my_model = rf_clf.fit(x_train, y_train)

# Bar chart of the feature importances, ordered from largest to smallest.
importances = rf_clf.feature_importances_
indices = np.argsort(rf_clf.feature_importances_)[::-1]
bar_positions = range(1, 9)
ordered_labels = x_2.columns[indices]
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, ordered_labels, rotation=90)
plt.title("Feature Importance", {"fontsize": 16})
plt.show()

In [None]:
# Explain the fitted forest with SHAP values computed on the test split and draw
# the summary plot for the entry at index 1.
# NOTE(review): index 1 is presumably the delinquent class — confirm for the
# installed shap version, whose return layout may differ.
explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(x_test)
class_one_shap = shap_values[1]
shap.summary_plot(class_one_shap, x_test, feature_names=feature_2)

### Logit Regression with and without race

In [None]:
# Standardise the training features. The scaler learns its statistics from the
# training data only, so that no information leaks in from the test data.
sds = StandardScaler()
x_train = sds.fit(x_train).transform(x_train)

In [None]:
# Logistic regression with balanced class weights. With scikit-learn's default
# penalty this is L2-regularised, not LASSO; pass penalty='l1' if LASSO is intended.
glm = LogisticRegression(C=1,solver='liblinear', class_weight='balanced')
glm.fit(x_train, y_train)
# Scale the test set with the statistics learned from the training set. Calling
# fit_transform here would re-fit the scaler on test data, leaking test-set
# information and putting train and test on different scales.
# NOTE: re-running this cell scales x_test a second time.
x_test =sds.transform(x_test)
pred = glm.predict(x_test)

In [None]:
# Accuracy and per-class precision / recall / F1 on the test split.
test_accuracy = accuracy_score(y_test, pred)
test_report = classification_report(y_test, pred)
print ("The accuracy score is {:.3f}".format(test_accuracy))
print ("\n")
print ("Classification report:")
print(test_report)

In [None]:
# Build the confusion matrix of true test labels versus predictions and draw it
# as an annotated heatmap.
matrix = confusion_matrix(y_test, pred)
class_names=[0,1] # the two class labels
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
# NOTE(review): these ticks are set before the heatmap is drawn, so they are
# presumably replaced by the ticks seaborn sets itself — confirm.
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# The matrix is wrapped in a DataFrame before being passed to the heatmap;
# fmt="g" prints the counts without scientific notation.
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu", fmt="g")

# Configure the heatmap: x-axis label on top, title nudged above the axes.
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Logistic regression on the race-inclusive feature set, with named columns.
col_trans_race = ['income','fico_score_at_origination','original_ltv','loan_amount','property_value','loan_term','zip','derived_race']
# Rebuild the unscaled split as DataFrames: earlier cells overwrite x_train and
# x_test with scaled NumPy arrays, which cannot be indexed by column name. The
# same test_size and random_state reproduce the same partition.
x_2_frame = pd.DataFrame(x_2_sm, columns=feature_2)
x_train, x_test, y_train, y_test = train_test_split(x_2_frame, y_2_sm, test_size=0.3, random_state=1)
y_train = y_train.astype(int)
# Fit the scaler on the training rows only, then apply the same transformation
# to the test rows (no re-fitting on test data).
x_train = pd.DataFrame(sds.fit_transform(x_train[col_trans_race]),
                       columns=col_trans_race, index=x_train.index)
x_test = pd.DataFrame(sds.transform(x_test[col_trans_race]),
                      columns=col_trans_race, index=x_test.index)
glm.fit(x_train, y_train)
pred = glm.predict(x_test)

In [None]:
# Accuracy and classification report for the race-inclusive logistic regression,
# followed by a heatmap of the correlations between the scaled training features.
race_accuracy = accuracy_score(y_test, pred)
race_report = classification_report(y_test, pred)
print ("The accuracy score is {:.3f}".format(race_accuracy))
print ("\n")
print ("Classification report:")
print(race_report)
feature_correlations = x_train[col_trans_race].corr()
sns.heatmap(feature_correlations)
plt.show()