In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression, LogisticRegression
import warnings
warnings.filterwarnings('ignore')
import scorecardpy as sc
import pprint 

pd.set_option('display.max_columns', None)

In [None]:
# Load the raw assignment dataset. The path is relative, so it is resolved
# against the kernel's working directory.
full_data = pd.read_csv('IS453 Group Assignment - Data.csv')

# Column names, dtypes and non-null counts of the raw data.
full_data.info()

In [None]:
# Frequency of each OCCUPATION_TYPE value in the raw data.
full_data['OCCUPATION_TYPE'].value_counts()

In [None]:
# Two independent copies of the raw data: working_set_orig stays untouched so
# later cells can report how many rows were dropped, working_set is the frame
# that gets transformed.
working_set_orig = full_data.copy()
working_set = full_data.copy()

working_set.head()

In [None]:
# Peek at the first rows of one document-flag column.
working_set['FLAG_DOCUMENT_2'].head()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
working_set.describe(include = 'all')

# 1. Data Preparation 


In [None]:
# Columns treated as continuous variables. STATUS (the model target used later
# in this notebook) is included so its correlation with each feature is shown.
cont_variables = working_set[[
    "STATUS",
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "REGION_POPULATION_RELATIVE",
    "EXT_SOURCE_1",
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
    "APARTMENTS_AVG",
]]

# Pairwise correlations between the continuous variables only, drawn as an
# annotated heatmap to spot highly correlated pairs.
cor = cont_variables.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cor,
    annot=True,
    xticklabels=cor.columns,
    yticklabels=cor.columns,
    ax=ax,
)
plt.show()


In [None]:
# Remove variables that would introduce bias into the model and variables that
# are highly correlated with others.
working_set = working_set.drop(
    columns=[
        'CODE_GENDER',
        'AMT_ANNUITY',
        'AMT_GOODS_PRICE',
    ]
)

working_set.info()

In [None]:
# Count the rows that are missing more than 3 values, to make sure dropping
# them would not reduce the sample size too much.
na_per_row = working_set.isnull().sum(axis=1)
rows_w_gt_3_na = int((na_per_row > 3).sum())
share_gt_3_na = rows_w_gt_3_na / working_set.shape[0]
print(f'Percent of total rows missing more than 3 values: {share_gt_3_na:.1%}')
print(rows_w_gt_3_na)

In [None]:
# Keep only rows with at least (number of columns - 3) non-missing values,
# i.e. rows missing at most 3 values.
min_non_na = working_set.shape[1] - 3
working_set = working_set.dropna(thresh=min_non_na)

# Report how many rows were lost relative to the untouched copy.
rows_before = working_set_orig.shape[0]
rows_after = working_set.shape[0]
print(f"Starting row count: {rows_before}")
print(f"Ending row count: {rows_after}")
print(f"Percent dropped: {1 - rows_after / rows_before:.2%}")

In [None]:
# Share of missing values per column in the revised dataset, highest first.
na_share = working_set.isnull().sum().sort_values(ascending=False) / working_set.shape[0]

# Format as whole percentages with Series.map before wrapping in a DataFrame;
# this avoids DataFrame.applymap, which is deprecated in recent pandas.
col_w_na = pd.DataFrame(na_share.map("{0:.0%}".format))

col_w_na.head(10)

In [None]:
# Remove the two variables with too much missing data.
# NOTE(review): the original note said 3 variables with more than 50% missing,
# but only these two are dropped; confirm against the missing-share table.
working_set = working_set.drop(
    columns=[
        'OWN_CAR_AGE',
        'EXT_SOURCE_1',
    ]
)

working_set.info()

In [None]:
# Peek at the first rows of one document-flag column in the reduced dataset.
working_set['FLAG_DOCUMENT_2'].head()

In [None]:
# Feature extraction: combine the FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21 columns
# into a single TOTAL_FLAG_DOCUMENTS column.
flag_document_cols = [f'FLAG_DOCUMENT_{i}' for i in range(2, 22)]

# skipna=False keeps the semantics of adding the columns with `+`: a missing
# flag in any column makes the row's total missing as well.
working_set['TOTAL_FLAG_DOCUMENTS'] = working_set[flag_document_cols].sum(axis=1, skipna=False)

# The individual flags are no longer needed once the total exists.
working_set = working_set.drop(columns=flag_document_cols)

working_set.head(3)

In [None]:
# Drop every row that still contains a missing value, then show the per-column
# count of missing values as a check.
working_set = working_set.dropna()
working_set.isna().sum()

In [None]:
# Column dtypes and non-null counts after cleaning.
working_set.info()

In [None]:
# Generate WOE bins: bin ranges are calculated automatically for each variable
# against the STATUS target.
# NOTE(review): the bins are fitted on the whole working set, before the
# train/test split in the next cell, so test rows influence the binning.
# Confirm this is intended.
bins = sc.woebin(working_set, y='STATUS')

# Print each variable name followed by its bin details so the bins are easy to read.
for var_name, bin_details in bins.items():
    print(f"{var_name}  : ")
    display(bin_details)
    print("-" * 100)

In [None]:
# Train-test split on STATUS with ratio=0.7 (presumably the training share;
# confirm against the scorecardpy documentation).
train, test = sc.split_df(working_set, 'STATUS', ratio=0.7).values()

# Shapes of the two splits, train first.
for split_frame in (train, test):
    print(split_frame.shape)

In [None]:
# Prepare the WOE-valued datasets for logistic regression training:
# woebin_ply() converts the original input values into their WOE values.
train_woe, test_woe = (
    sc.woebin_ply(split_frame, bins) for split_frame in (train, test)
)
train_woe

In [None]:
# Create the X (features) and y (STATUS target) parts of the train and test data.
y_train = train_woe.loc[:, 'STATUS']
X_train = train_woe.loc[:, train_woe.columns != 'STATUS']
y_test = test_woe.loc[:, 'STATUS']
# Select the test features by the training feature names. The previous boolean
# mask was built from train_woe's columns and applied positionally to test_woe,
# which silently picks the wrong columns if the two frames are ordered
# differently. Label-based selection guarantees the same features in the same
# order as the model was trained on, and raises KeyError if one is missing.
X_test = test_woe.loc[:, X_train.columns]

# Fit a logistic regression model on the WOE features and print its parameters.
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("beta coefficients:")
print(lr.coef_)
print("alpha")
print(lr.intercept_)

In [None]:
# Generate the scorecard from the WOE bins and the fitted model, using the
# training feature columns and the scaling parameters given below.
card = sc.scorecard(
    bins,
    lr,
    X_train.columns,
    points0=600,
    odds0=1 / 20,
    pdo=20,
    basepoints_eq0=True,
)

pprint.pprint(card)

In [None]:
# TODO: Check the scorecard entries for OCCUPATION_TYPE
# to see whether drivers are discriminated against.

In [None]:
# Credit scores for the train and test samples, computed from the scorecard.
train_score = sc.scorecard_ply(train, card)
test_score = sc.scorecard_ply(test, card)

# Distribution of scores for both splits, one histogram each.
# plt.title() acts on the current axes, so each call must directly follow the
# hist() call whose figure it labels.
train_score.hist(figsize=(7,5),bins=60)
plt.title('train data scores')
test_score.hist(figsize=(7,5),bins=60)
plt.title('test data scores')

In [None]:
# Print evaluation metrics of the model on the test features.
y_pred = lr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first transposes the confusion matrix and swaps precision with
# recall in the classification report.
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nPCC measures:")
print(classification_report(y_test, y_pred))
