# Importing Required Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import scipy.stats as stat
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from scipy.stats import ks_2samp

# Apply seaborn's default theme globally so any matplotlib figure drawn later uses it.
sns.set()

# Reading Data

In [2]:
# Feature matrices: the CSVs carry their own header row.
X_train_backup, X_test_backup = (
    pd.read_csv(csv_path) for csv_path in ('X_train.csv', 'X_test.csv')
)

# Target files are read with header=None, so column names are supplied explicitly.
y_train_backup, y_test_backup = (
    pd.read_csv(csv_path, header=None, names=['Index', 'loan_status:Not_Default'])
    for csv_path in ('y_train.csv', 'y_test.csv')
)

In [3]:
# Work on indexed copies; the *_backup frames stay untouched as loaded.
# The feature CSVs expose their row labels as 'Unnamed: 0', the targets as 'Index'.
X_train, X_test = (
    raw_frame.set_index('Unnamed: 0') for raw_frame in (X_train_backup, X_test_backup)
)

y_train, y_test = (
    raw_frame.set_index('Index') for raw_frame in (y_train_backup, y_test_backup)
)

# Loading Pre-trained Model

In [4]:
# Load the previously fitted logistic regression model from disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
filename = 'init_logistic_regression_model.sav'
# Use a context manager so the file handle is closed even if unpickling fails.
with open(filename, 'rb') as model_file:
    log_reg_model = pickle.load(model_file)



# Building Scorecard

In [6]:
# One scorecard row per model term: the intercept first, then every dummy feature
# in the column order of X_train.
feature_list = ['Intercept', *X_train.columns]

# Coefficients in the same order: intercept, then the flattened coefficient array.
coef_list = [*log_reg_model.intercept_, *log_reg_model.coef_.ravel()]

scorecard_df = pd.DataFrame({'Feature': feature_list, 'Coef': coef_list})

# The original feature name is the text before the first ':' of the dummy name
# (e.g. 'grade:A' -> 'grade'); 'Intercept' has no ':' and maps to itself.
scorecard_df['Og_Feature'] = scorecard_df['Feature'].str.split(':').str[0]

# Append the rows stored in ref_cat_scorecard_df.csv (the reference categories)
# after the model terms, renumbering the index.
ref_cat_scorecard_df = pd.read_csv('ref_cat_scorecard_df.csv')
scorecard_df = pd.concat(
    [scorecard_df, ref_cat_scorecard_df],
    axis=0,
    ignore_index=True,
)

In [7]:
# Preview the first five scorecard rows.
scorecard_df.head(5)

Unnamed: 0,Feature,Coef,Og_Feature
0,Intercept,-0.522228,Intercept
1,grade:A,1.435635,grade
2,grade:B,0.994351,grade
3,grade:C,0.6971,grade
4,grade:D,0.49355,grade


### Defining range

In [8]:
# Sum, over the original features, of the smallest (resp. largest) coefficient any
# of its categories carries: the extremes of the summed coefficients.
min_sum_coef, max_sum_coef = (
    scorecard_df.groupby('Og_Feature')['Coef'].agg(['min', 'max']).sum()
)

# Target score range the coefficients are rescaled to.
min_score, max_score = 300, 850

### Scaling Current Scores to this range

In [9]:
# Widths of the source (coefficient) and target (score) ranges.
coef_range = max_sum_coef - min_sum_coef
score_range = max_score - min_score

# Every coefficient is stretched by the ratio of the two range widths.
scorecard_df['Scores'] = scorecard_df['Coef'] * (score_range / coef_range)

# Row 0 is the intercept (the feature list starts with 'Intercept'); it is min-max
# mapped instead, so it also carries the min_score offset.
scorecard_df.at[0, 'Scores'] = (
    ((scorecard_df.at[0, 'Coef'] - min_sum_coef) / coef_range) * score_range
) + min_score

# Integer points for the published scorecard.
scorecard_df['Scores_int'] = scorecard_df['Scores'].round().astype('int64')

### Min score

In [10]:
# Lowest attainable total: the smallest integer score of each original feature, summed.
scorecard_df['Scores_int'].groupby(scorecard_df['Og_Feature']).min().sum()

301

### Max Score

In [11]:
# Highest attainable total: the largest integer score of each original feature, summed.
scorecard_df['Scores_int'].groupby(scorecard_df['Og_Feature']).max().sum()

850

### Correcting Min Score

In [12]:
# Per-feature minimum of the rounded and unrounded scores, side by side, to spot where
# rounding moved a minimum away from its raw value.
# Several columns must be selected with a list: tuple-style selection
# (['Scores_int', 'Scores'] without the inner brackets) is deprecated since pandas 1.0
# and raises in pandas 2.0.
scorecard_df.groupby('Og_Feature')[['Scores_int', 'Scores']].min()

Unnamed: 0_level_0,Scores_int,Scores
Og_Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Intercept,344,343.962533
addr_state,-10,-10.03573
annual_inc,0,0.0
dti,-15,-15.477946
funded_amnt,0,0.0
grade,0,0.0
home_ownership,-13,-12.995513
initial_list_status,0,0.0
inq_last_6mths,0,0.0
installment,0,0.0


In [13]:
# Show every scorecard row that belongs to the 'dti' feature.
scorecard_df.loc[scorecard_df['Og_Feature'].eq('dti')]

Unnamed: 0,Feature,Coef,Og_Feature,Scores,Scores_int
67,dti:0_4,-0.098102,dti,-5.351194,-5
68,dti:4_8,-0.018628,dti,-1.016083,-1
69,dti:8_12,-0.055081,dti,-3.004526,-3
70,dti:12_16,-0.102227,dti,-5.5762,-6
71,dti:16_20,-0.14971,dti,-8.166225,-8
72,dti:20_24,-0.204624,dti,-11.161655,-11
73,dti:24_28,-0.266496,dti,-14.536583,-15
74,dti:28_32,-0.283754,dti,-15.477946,-15
75,dti:32_36,-0.277517,dti,-15.137767,-15
76,dti:36_40,-0.260991,dti,-14.236316,-14


In [14]:
# Independent rounding left the minimum attainable total at 301 instead of 300.
# 'dti:28_32' is the dti minimum and its raw score (-15.477946) was rounded up to -15,
# so it is lowered by one point to -16.
# The row is selected by its Feature label rather than by a hard-coded row number, so
# the correction still hits the intended row if the scorecard rows are reordered.
dti_min_row = scorecard_df.index[scorecard_df['Feature'] == 'dti:28_32']
assert len(dti_min_row) == 1, "expected exactly one 'dti:28_32' row in the scorecard"
scorecard_df.loc[dti_min_row, 'Scores_int'] = -16

### Corrected Min Score

In [15]:
# Re-check the lowest attainable total after the manual correction.
scorecard_df.groupby('Og_Feature')['Scores_int'].agg('min').sum()

300

### Max Score

In [16]:
# Re-check the highest attainable total after the manual correction.
scorecard_df.groupby('Og_Feature')['Scores_int'].agg('max').sum()

850

In [17]:
# Preview the first five rows, now including the Scores and Scores_int columns.
scorecard_df.head(5)

Unnamed: 0,Feature,Coef,Og_Feature,Scores,Scores_int
0,Intercept,-0.522228,Intercept,343.962533,344
1,grade:A,1.435635,grade,78.309736,78
2,grade:B,0.994351,grade,54.238984,54
3,grade:C,0.6971,grade,38.024761,38
4,grade:D,0.49355,grade,26.921703,27


# Calculating Credit Score for all applicants in Test Set

In [18]:
# Reference-category columns are added as all-zero columns so the frame has one
# column per scorecard row.
ref_cat_list = list(ref_cat_scorecard_df['Feature'].values)
ref_cat_list_df = pd.DataFrame(0, index=X_test.index, columns=ref_cat_list)

# Column layout: constant-1 intercept, then the X_test dummies, then the
# reference-category columns.
test_score_df = pd.concat(
    [
        pd.DataFrame({'Intercept': 1}, index=X_test.index),
        X_test,
        ref_cat_list_df,
    ],
    axis=1,
)

test_score_df.head()

Unnamed: 0_level_0,Intercept,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,home_ownership:RENT_NONE_OWN,home_ownership:MORTGAGE,home_ownership:ANY,...,purpose:educational_small_business,initial_list_status:f,term:60,mths_issue_d:93+,int_rate:23+,funded_amnt:16300_36050,annual_inc:18500-,installment:1300+,inq_last_6mths:2+,dti:40+
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1758049,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
686533,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
900721,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1727912,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
539691,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# (rows, columns) of the scoring frame; the column count must equal the scorecard length.
test_score_df.shape

(452134, 91)

In [20]:
# Number of integer scores, one per scorecard row.
scorecard_df['Scores_int'].shape

(91,)

In [21]:
# Credit score = sum of the integer points of every feature flagged 1 for the applicant
# (the intercept column is always 1).
# Columns are selected by the scorecard's Feature labels, in scorecard row order, so that
#   * each column is multiplied by its own score instead of relying on the frame and the
#     scorecard happening to share the same order, and
#   * the cell can be re-run: the 'credit_scores' column added here is not part of the
#     selection, so it no longer causes a shape mismatch on a second execution.
test_score_df['credit_scores'] = (
    test_score_df[scorecard_df['Feature'].tolist()].values
    @ scorecard_df['Scores_int'].values
)

In [22]:
# Preview the first five applicants, now including their credit_scores column.
test_score_df.head(5)

Unnamed: 0_level_0,Intercept,grade:A,grade:B,grade:C,grade:D,grade:E,grade:F,home_ownership:RENT_NONE_OWN,home_ownership:MORTGAGE,home_ownership:ANY,...,initial_list_status:f,term:60,mths_issue_d:93+,int_rate:23+,funded_amnt:16300_36050,annual_inc:18500-,installment:1300+,inq_last_6mths:2+,dti:40+,credit_scores
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1758049,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,477
686533,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,456
900721,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,517
1727912,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,414
539691,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,529


# Calculating Model Probability of Default for the Test Set (alongside Credit Scores)

In [23]:
# Per-applicant summary: actual outcome, credit score, model probabilities and the
# predicted class at a 0.5 cut-off.
pd_df = pd.DataFrame(index=y_test.index, columns=['loan_status:Not_Default', 'credit_score', 'Prob_Not_Default', 'Prob_Default'])

pd_df['loan_status:Not_Default'] = y_test['loan_status:Not_Default']
pd_df['credit_score'] = test_score_df['credit_scores']

# Score the test set once and reuse the result for both probability columns
# (previously predict_proba ran twice over the full test set).
# NOTE(review): column 1 is taken as P(Not_Default) and column 0 as P(Default); this
# assumes log_reg_model.classes_ is [0, 1] - confirm.
# NOTE(review): the probabilities are assigned positionally, which assumes X_test and
# y_test rows are in the same order - confirm.
test_proba = log_reg_model.predict_proba(X_test)
pd_df['Prob_Not_Default'] = test_proba[:, 1]
pd_df['Prob_Default'] = test_proba[:, 0]

pd_df['Pred_Not_Default'] = np.where(pd_df['Prob_Not_Default']>=0.5, 1, 0)

In [24]:
# Preview the first five rows of the probability-of-default table.
pd_df.head(5)

Unnamed: 0_level_0,loan_status:Not_Default,credit_score,Prob_Not_Default,Prob_Default,Pred_Not_Default
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1758049,1,477,0.870604,0.129396,1
686533,1,456,0.822034,0.177966,1
900721,1,517,0.933809,0.066191,1
1727912,1,414,0.681009,0.318991,1
539691,1,529,0.945514,0.054486,1


# Exporting Scorecard and Probability-of-Default Table

In [25]:
# Published scorecard: feature label and integer points only, without the row index.
scorecard_df.loc[:, ['Feature', 'Scores_int']].to_csv('scorecard.csv', index=False)

# Per-applicant table; the index is written out (to_csv's default) so rows stay identifiable.
pd_df.to_csv('pd_df.csv')