## Import and Data preprocessing

In [1]:
#dependancies 
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

In [2]:
#loadind data
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,57107,57107,13375.0,0.1797,483.34,MORTGAGE,223000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,577150.0,122018.0,32000.0,170200.0,N,N
1,141451,141451,21000.0,0.1308,478.68,MORTGAGE,123000.0,Source Verified,low_risk,n,...,85.0,33.3,0.0,0.0,132750.0,27896.0,15900.0,35398.0,N,N
2,321143,321143,20000.0,0.124,448.95,MORTGAGE,197000.0,Source Verified,low_risk,n,...,85.7,33.3,0.0,0.0,628160.0,114043.0,22600.0,90340.0,N,N


In [3]:
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))
test_df.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [4]:
# drop Unamed columns
train_df.drop(['Unnamed: 0'], axis= 1, inplace = True)
del train_df['index']

##

test_df.drop(['Unnamed: 0'], axis= 1, inplace = True)
del test_df['index']

In [5]:
# identify the missing data
train_df.isna().sum()

loan_amnt                     0
int_rate                      0
installment                   0
home_ownership                0
annual_inc                    0
                             ..
total_bal_ex_mort             0
total_bc_limit                0
total_il_high_credit_limit    0
hardship_flag                 0
debt_settlement_flag          0
Length: 84, dtype: int64

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   12180 non-null  float64
 1   int_rate                    12180 non-null  float64
 2   installment                 12180 non-null  float64
 3   home_ownership              12180 non-null  object 
 4   annual_inc                  12180 non-null  float64
 5   verification_status         12180 non-null  object 
 6   loan_status                 12180 non-null  object 
 7   pymnt_plan                  12180 non-null  object 
 8   dti                         12180 non-null  float64
 9   delinq_2yrs                 12180 non-null  float64
 10  inq_last_6mths              12180 non-null  float64
 11  open_acc                    12180 non-null  float64
 12  pub_rec                     12180 non-null  float64
 13  revol_bal                   121

In [7]:
# identify the missing data
test_df.isna().sum()

loan_amnt                     0
int_rate                      0
installment                   0
home_ownership                0
annual_inc                    0
                             ..
total_bal_ex_mort             0
total_bc_limit                0
total_il_high_credit_limit    0
hardship_flag                 0
debt_settlement_flag          0
Length: 84, dtype: int64

In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   4702 non-null   float64
 1   int_rate                    4702 non-null   float64
 2   installment                 4702 non-null   float64
 3   home_ownership              4702 non-null   object 
 4   annual_inc                  4702 non-null   float64
 5   verification_status         4702 non-null   object 
 6   loan_status                 4702 non-null   object 
 7   pymnt_plan                  4702 non-null   object 
 8   dti                         4702 non-null   float64
 9   delinq_2yrs                 4702 non-null   float64
 10  inq_last_6mths              4702 non-null   float64
 11  open_acc                    4702 non-null   float64
 12  pub_rec                     4702 non-null   float64
 13  revol_bal                   4702 

In [9]:
train_df["loan_status"].unique()

array(['low_risk', 'high_risk'], dtype=object)

In [10]:
# 
train_df["loan_status"].value_counts()

low_risk     6090
high_risk    6090
Name: loan_status, dtype: int64

In [11]:
# identify the variable to convert to numerical variable
obj_df = test_df.select_dtypes(include=['object']).copy()
obj_df.head()


Unnamed: 0,home_ownership,verification_status,loan_status,pymnt_plan,initial_list_status,application_type,hardship_flag,debt_settlement_flag
0,MORTGAGE,Not Verified,low_risk,n,w,Individual,N,N
1,RENT,Not Verified,low_risk,n,w,Individual,N,N
2,RENT,Not Verified,low_risk,n,w,Individual,N,N
3,RENT,Not Verified,low_risk,n,w,Individual,N,N
4,RENT,Not Verified,low_risk,n,w,Individual,N,N


In [12]:
# Convert categorical data to numeric and separate target feature for training data

# defining our features and label
X = train_df.drop(['loan_status'], axis =1)
y = LabelEncoder().fit_transform(train_df['loan_status'])

# convert the string to numeric variable
# X_train = pd.get_dummies(X)
y_train = y
# X_train

X_train= pd.get_dummies(X, columns=['initial_list_status',
                                   'home_ownership',
                                   'application_type',
                                   'verification_status',
                                   'pymnt_plan', 
                                   'hardship_flag', 
                                   'debt_settlement_flag'], drop_first=True)

In [13]:
# Convert categorical data to numeric and separate target feature for testing data
# defining our features and label
X = test_df.drop(['loan_status'], axis =1)
y = LabelEncoder().fit_transform(test_df['loan_status'])

# convert the string to numeric variable

y_test = y

X_test= pd.get_dummies(X, columns=['initial_list_status',
                                   'home_ownership',
                                   'application_type',
                                   'verification_status',
                                   'pymnt_plan', 
                                   'hardship_flag', 
                                   'debt_settlement_flag'], drop_first=True)



In [14]:
X_test.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,total_bc_limit,total_il_high_credit_limit,initial_list_status_w,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,application_type_Joint App,verification_status_Source Verified,verification_status_Verified,hardship_flag_Y
count,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,...,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0
mean,16983.889834,0.14749,503.512114,86772.17,22.984243,0.205444,0.701616,12.709911,0.10336,18558.165036,...,30585.559336,55616.824543,0.93088,0.453211,0.113356,0.432369,0.114207,0.364738,0.113356,0.017227
std,10122.727799,0.057906,293.240156,58022.77,21.162972,0.634326,0.901247,6.046306,0.309313,22488.773061,...,27664.330803,49447.248867,0.253684,0.497859,0.317061,0.495458,0.318096,0.481408,0.317061,0.130129
min,1000.0,0.0646,31.14,100.0,0.71,0.0,0.0,2.0,0.0,0.0,...,500.0,526.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9512.5,0.1033,280.84,52000.0,14.92,0.0,0.0,8.0,0.0,6592.75,...,11900.0,23372.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15000.0,0.143,443.96,73166.5,21.1,0.0,0.0,12.0,0.0,12802.0,...,22600.0,42801.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,24000.0,0.1862,684.72,105000.0,27.98,0.0,1.0,16.0,0.0,22429.5,...,40400.0,73172.5,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
max,40000.0,0.288,1604.18,1092000.0,999.0,8.0,5.0,53.0,2.0,512728.0,...,280400.0,838661.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# add missing dummy variables to testing set
feature_difference = set(X_train) - set(X_test)

feature_difference_df = pd.DataFrame(data=np.zeros((X_test.shape[0], len(feature_difference))),
                                     columns=list(feature_difference))
X_test = X_test.join(feature_difference_df)


X_test

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,total_il_high_credit_limit,initial_list_status_w,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,application_type_Joint App,verification_status_Source Verified,verification_status_Verified,hardship_flag_Y,debt_settlement_flag_Y
0,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,99475.0,1,1,0,0,0,0,0,0,0.0
1,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,23628.0,1,0,0,1,0,0,0,0,0.0
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,15000.0,1,0,0,1,0,0,0,0,0.0
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,35981.0,1,0,0,1,0,0,0,0,0.0
4,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,24977.0,1,0,0,1,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,23215.0,...,107388.0,0,0,0,1,0,1,0,0,0.0
4698,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,459.0,...,30775.0,1,0,0,1,0,0,0,0,0.0
4699,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,6342.0,...,29550.0,0,0,0,1,0,0,1,0,0.0
4700,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,11636.0,...,9657.0,1,0,0,1,0,1,0,0,0.0


In [16]:
X_test.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,total_il_high_credit_limit,initial_list_status_w,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,application_type_Joint App,verification_status_Source Verified,verification_status_Verified,hardship_flag_Y,debt_settlement_flag_Y
count,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,...,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0
mean,16983.889834,0.14749,503.512114,86772.17,22.984243,0.205444,0.701616,12.709911,0.10336,18558.165036,...,55616.824543,0.93088,0.453211,0.113356,0.432369,0.114207,0.364738,0.113356,0.017227,0.0
std,10122.727799,0.057906,293.240156,58022.77,21.162972,0.634326,0.901247,6.046306,0.309313,22488.773061,...,49447.248867,0.253684,0.497859,0.317061,0.495458,0.318096,0.481408,0.317061,0.130129,0.0
min,1000.0,0.0646,31.14,100.0,0.71,0.0,0.0,2.0,0.0,0.0,...,526.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9512.5,0.1033,280.84,52000.0,14.92,0.0,0.0,8.0,0.0,6592.75,...,23372.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15000.0,0.143,443.96,73166.5,21.1,0.0,0.0,12.0,0.0,12802.0,...,42801.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,24000.0,0.1862,684.72,105000.0,27.98,0.0,1.0,16.0,0.0,22429.5,...,73172.5,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
max,40000.0,0.288,1604.18,1092000.0,999.0,8.0,5.0,53.0,2.0,512728.0,...,838661.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


## My prediction
In my opinion the model that will predict best will be the logistic regression, becasue the outputs we are going to predict is a categorical variable.

## Data Modeling

In [17]:
# Train the Logistic Regression model on the unscaled data and print the model score
model = LogisticRegression(random_state =1, solver = 'lbfgs', max_iter=100)
classifier = model.fit(X_train, y_train)
print(f'Unscaled LogisticRegression Training Score:{classifier.score(X_train, y_train)}')
print(f'Unscaled LogisticRegression Testing Score:{classifier.score(X_test, y_test)}')

Unscaled LogisticRegression Training Score:0.6497536945812807
Unscaled LogisticRegression Testing Score:0.5157379838366652


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
#predit the 2020 data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
4697,1,0
4698,1,0
4699,1,0
4700,1,0


In [19]:
# confusion matrix
y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[ 702, 1649],
       [ 628, 1723]], dtype=int64)

In [20]:
# accuracy of the model
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.30      0.38      2351
           1       0.51      0.73      0.60      2351

    accuracy                           0.52      4702
   macro avg       0.52      0.52      0.49      4702
weighted avg       0.52      0.52      0.49      4702



In [21]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_train, y_train)
print(f'Unscaled RandomForestClassifier Training Score: {clf.score(X_train, y_train)}')
print(f'Unscaled RandomForestClassifier Testing Score: {clf.score(X_test, y_test)}')

Unscaled RandomForestClassifier Training Score: 1.0
Unscaled RandomForestClassifier Testing Score: 0.6307954062101233


In [22]:
predictions = clf.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
4697,0,0
4698,1,0
4699,0,0
4700,0,0


In [23]:
# confusion matrix
y_true = y_test
y_pred = predictions
confusion_matrix(y_true, y_pred)

array([[1866,  485],
       [1251, 1100]], dtype=int64)

In [24]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.79      0.68      2351
           1       0.69      0.47      0.56      2351

    accuracy                           0.63      4702
   macro avg       0.65      0.63      0.62      4702
weighted avg       0.65      0.63      0.62      4702



In [25]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
# Train the Logistic Regression model on the scaled data and print the model score
model = LogisticRegression(random_state =1, solver = 'lbfgs', max_iter=100)
classifier = model.fit(X_train_scaled, y_train)
print(f'LogisticRegression Training Score:{classifier.score(X_train_scaled, y_train)}')
print(f'LogisticRegression Testing Score:{classifier.score(X_test_scaled, y_test)}')

LogisticRegression Training Score:0.7083743842364532
LogisticRegression Testing Score:0.7681837515950659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# predict the 2020 wit scaling data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})



Unnamed: 0,Prediction,Actual
0,0,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
4697,1,0
4698,1,0
4699,0,0
4700,0,0


In [28]:
# confusion matrix
y_true = y_test
y_pred = predictions
confusion_matrix(y_true, y_pred)

array([[1606,  745],
       [1328, 1023]], dtype=int64)

In [29]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.68      0.61      2351
           1       0.58      0.44      0.50      2351

    accuracy                           0.56      4702
   macro avg       0.56      0.56      0.55      4702
weighted avg       0.56      0.56      0.55      4702



In [30]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=50).fit(X_train_scaled, y_train)
print(f'RandomForestClassifier Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'RandomForestClassifier Testing Score: {clf.score(X_test_scaled, y_test)}')

RandomForestClassifier Training Score: 1.0
RandomForestClassifier Testing Score: 0.6312207571246278


In [31]:
# predict 2020 
predictions = clf.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})



Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
4697,1,0
4698,1,0
4699,1,0
4700,1,0


In [32]:
# confusion matrix
y_true = y_test
y_pred = predictions
confusion_matrix(y_true, y_pred)

array([[ 720, 1631],
       [ 852, 1499]], dtype=int64)

In [33]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.31      0.37      2351
           1       0.48      0.64      0.55      2351

    accuracy                           0.47      4702
   macro avg       0.47      0.47      0.46      4702
weighted avg       0.47      0.47      0.46      4702



## conclusion

Unscaled LogisticRegression Testing Score:0.5161633347511697


Unscaled RandomForestClassifier Testing Score: 0.6307954062101233

Scaled LogisticRegression Testing Score: 0.7681837515950659


Scaled RandomForestClassifier Testing Score: 0.6312207571246278


I was suprised to see the RandomForestClassifier performed better than LogisticRegression for the first time. With the scaled data the LogisticRegression performed better than RandomForestClassifier. The RandomForestClassifier score is stable for the two kind of data (unscaled and scaled). 


But because of the the accuracy of the Unscaled RandomForestClassifier(0.63) is higher than the LogisticRegression one, so I Think the RandomForestClassifier with the unscaled data is one I should keep. 


In [36]:
# Saving model
pickle.dump(clf, open('Model/model.pkl','wb'))

In [None]:
credit_df = pd.DataFrame({"rooms": [2],
                        "distance" : [3.5],
                        "bathroom" : [1],	
                        "car" : [1],
                        "type" : [0],
                        "region" : [0]}) 
                       	
# Create the index
credit_df= ['0']
  
# Set the index
credit_df.index = index_