In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import numpy as np

In [2]:
# Read the two loan CSV files: the 2019 file becomes the training frame and
# the 2020 Q1 file becomes the test frame.
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [3]:
train_df.shape 
# The result is (number of rows, number of columns):
# - 12180 rows/samples (in this dataset, each row corresponds to one person)
# - 86 columns (characteristics/features/attributes)

(12180, 86)

### Preprocessing

#### Feature Selection

In [6]:
## Look for columns that carry no information for the classification:
# - constant columns, where every entry has the same value;
# - columns with a distinct value in every sample (row), e.g. Name, ID, Index.
n_train_rows = len(train_df)
# Count distinct values once per column, then report the two uninformative cases.
for col_name, n_distinct in train_df.nunique().items():
    if n_distinct in (1, n_train_rows):
        print(col_name, n_distinct)

Unnamed: 0 12180
index 12180
pymnt_plan 1
recoveries 1
collection_recovery_fee 1
policy_code 1
num_tl_120dpd_2m 1
tax_liens 1


In [7]:
# Same check on the test set: report columns that are constant or that have a
# distinct value in every row.
n_test_rows = len(test_df)
for col_name, n_distinct in test_df.nunique().items():
    if n_distinct in (1, n_test_rows):
        print(col_name, n_distinct)

Unnamed: 0 4702
index 4702
pymnt_plan 1
policy_code 1
tax_liens 1
debt_settlement_flag 1


In [9]:
## Build the feature matrix and the target variable for training.
# Dropped from the features: the two per-row identifier columns, the columns
# reported as constant in both datasets, and the target itself.
y_train = train_df['loan_status']  # target variable (what we want to predict)
X_train = train_df.drop(
    columns=['Unnamed: 0', 'index', 'pymnt_plan', 'policy_code', 'tax_liens', 'loan_status']
)  # feature matrix

In [10]:
# Apply the same split to the test data, dropping exactly the same columns as
# for training so both feature matrices start from the same set of columns.
y_test = test_df['loan_status']  # target variable
X_test = test_df.drop(
    columns=['Unnamed: 0', 'index', 'pymnt_plan', 'policy_code', 'tax_liens', 'loan_status']
)  # feature matrix

In [11]:
# Preview the first rows of the test feature matrix; text columns such as
# home_ownership and verification_status are still present at this stage.
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,dti,delinq_2yrs,inq_last_6mths,open_acc,...,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,19.75,0.0,1.0,18.0,...,2.0,97.7,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,11.52,2.0,0.0,8.0,...,1.0,66.7,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,6.74,0.0,0.0,6.0,...,1.0,100.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,12.13,0.0,2.0,7.0,...,4.0,100.0,50.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,3600.0,0.124,120.27,RENT,50000.0,Not Verified,16.08,0.0,3.0,6.0,...,3.0,100.0,25.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


#### Transform categorical to numerical

In [13]:
# One-hot encode the categorical columns of each feature matrix. The two sets
# are encoded independently, so the resulting columns may differ between them.
X_train_dummies, X_test_dummies = pd.get_dummies(X_train), pd.get_dummies(X_test)

In [14]:
# Compare the shapes after encoding: the column counts must match before the
# same model can be applied to both sets.
X_train_dummies.shape,X_test_dummies.shape

((12180, 89), (4702, 88))

In [15]:
## fix number of variables for both Train and Test

In [16]:
# Dummy columns that exist in the training set but are absent from the test
# set (a category that never occurs in the test data yields no dummy column).
missing_cols = set(X_train_dummies.columns) - set(X_test_dummies.columns)
# Align the test set to the training columns in a single step:
# - columns missing from the test set are created and filled with 0;
# - columns not present in the training set are discarded;
# - the column order follows the training set.
X_test_dummies = X_test_dummies.reindex(columns=X_train_dummies.columns, fill_value=0)

In [17]:
# Re-check the shapes: after alignment both sets should have the same number of columns.
X_train_dummies.shape,X_test_dummies.shape

((12180, 89), (4702, 89))

In [18]:
# Preview the encoded training features; the categorical columns now appear
# as 0/1 indicator columns.
X_train_dummies.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,39728.0,...,0,0,0,1,1,0,1,0,1,0
1,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,9585.0,...,1,0,0,1,1,0,1,0,1,0
2,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,16708.0,...,1,0,0,1,1,0,1,0,1,0
3,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,8809.0,...,0,0,0,1,1,0,1,0,1,0
4,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,65420.0,...,1,0,0,1,1,0,1,0,1,0


#### Feature Scaling: 


In [19]:
# Transform all the columns to the same scale


In [20]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [21]:
# Fit the scaler on the training features only, so no information from the
# test set leaks into the scaling parameters.
scaler = StandardScaler()
# Scale the training features, keeping the original column names.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_dummies),
    columns=X_train_dummies.columns,
)
# Scale the test features with the scaler fitted on the training data.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_dummies),
    columns=X_test_dummies.columns,
)

In [22]:
# X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label, random_state=1)

In [23]:
# Scaling must not change the shapes of the feature matrices.
X_train_scaled.shape,X_test_scaled.shape

((12180, 89), (4702, 89))

### Logistic Regression on unscaled data

In [24]:
# Train the Logistic Regression model on the unscaled data and print the model score

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
# Baseline: logistic regression with default settings, trained on the
# unscaled features.
# NOTE: the recorded output shows the solver stopping at its iteration limit
# on this input.
model1 = LogisticRegression().fit(X_train_dummies, y_train)
model1  # display the fitted estimator

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [27]:
# Score the unscaled logistic regression on the held-out test set.
model1_test_score = model1.score(X_test_dummies, y_test)
print(f"Testing Data Score: {model1_test_score}")

Testing Data Score: 0.5163760102084219


### Random Forest on unscaled

In [28]:
# Train a Random Forest Classifier model and print the model score

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
# Random forest trained on the unscaled features; the seed is fixed so that
# repeated runs give the same model.
model2 = RandomForestClassifier(random_state=0).fit(X_train_dummies, y_train)
model2  # display the fitted estimator

RandomForestClassifier(random_state=0)

In [31]:
# Score the unscaled random forest on the held-out test set.
model2_test_score = model2.score(X_test_dummies, y_test)
print(f"Testing Data Score: {model2_test_score}")

Testing Data Score: 0.6380263717566993


Which model performed better? How does that compare to your prediction? Write down your results and thoughts.

- On the unscaled data, Random Forest performed better than Logistic Regression.
- Test accuracy: Logistic Regression 0.516, Random Forest 0.638.


### Logistic Regression on scaled data

In [32]:
# Train the Logistic Regression model on the scaled data and print the model score

In [33]:
# Logistic regression with default settings, trained on the scaled features.
# NOTE: the recorded output shows the solver stopping at its iteration limit
# here as well.
model3 = LogisticRegression().fit(X_train_scaled, y_train)
model3  # display the fitted estimator

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [34]:
# Score the scaled logistic regression on the scaled test set.
model3_test_score = model3.score(X_test_scaled, y_test)
print(f"Testing Data Score: {model3_test_score}")

Testing Data Score: 0.767333049766057


### Random Forest on scaled

In [35]:
# Train a Random Forest Classifier model on the scaled data and print the model score

In [36]:
# Random forest trained on the scaled features, using the same seed as the
# unscaled run so the two results are comparable.
model4 = RandomForestClassifier(random_state=0).fit(X_train_scaled, y_train)
model4  # display the fitted estimator

RandomForestClassifier(random_state=0)

In [37]:
# Score the scaled random forest on the scaled test set.
model4_test_score = model4.score(X_test_scaled, y_test)
print(f"Testing Data Score: {model4_test_score}")

Testing Data Score: 0.6367503190131859


How do the model scores compare to each other, and to the previous results on unscaled data? How does this compare to your prediction? Write down your results and thoughts.

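- On the scaled data, Logistic Regression performed better than Random Forest.
- Test accuracy: Logistic Regression 0.767, Random Forest 0.637.
- Random Forest scores almost the same on scaled and unscaled data (0.637 vs. 0.638), which indicates that scaling has little effect on this model's performance. This is expected: decision trees split on feature thresholds, which do not depend on the scale of the features.
- Logistic Regression improves substantially, from 0.516 to 0.767, which shows that scaling the data helps this model. Note that the solver still reported reaching its iteration limit on the scaled data, so the score might change with a higher `max_iter`.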