In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
# Additional imports:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Read the 2019 loans file into train_df and the 2020 Q1 loans file into test_df
# (the generator is consumed in order, so the 2019 file is read first).
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [6]:
# Exploratory pass: one-hot encode every categorical column of the training data.
# Nothing is separated out here, so a categorical target column is dummy-encoded
# along with the features. info() then lists the resulting columns.
train_df = train_df.pipe(pd.get_dummies)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 94 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            12180 non-null  float64
 1   int_rate                             12180 non-null  float64
 2   installment                          12180 non-null  float64
 3   annual_inc                           12180 non-null  float64
 4   dti                                  12180 non-null  float64
 5   delinq_2yrs                          12180 non-null  float64
 6   inq_last_6mths                       12180 non-null  float64
 7   open_acc                             12180 non-null  float64
 8   pub_rec                              12180 non-null  float64
 9   revol_bal                            12180 non-null  float64
 10  total_acc                            12180 non-null  float64
 11  out_prncp                   

In [7]:
# Exploratory pass: one-hot encode every categorical column of the testing data.
# Nothing is separated out here, so the target is dummy-encoded along with the
# features. info() then lists the resulting columns for comparison with training.
test_df = test_df.pipe(pd.get_dummies)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 93 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            4702 non-null   float64
 1   int_rate                             4702 non-null   float64
 2   installment                          4702 non-null   float64
 3   annual_inc                           4702 non-null   float64
 4   dti                                  4702 non-null   float64
 5   delinq_2yrs                          4702 non-null   float64
 6   inq_last_6mths                       4702 non-null   float64
 7   open_acc                             4702 non-null   float64
 8   pub_rec                              4702 non-null   float64
 9   revol_bal                            4702 non-null   float64
 10  total_acc                            4702 non-null   float64
 11  out_prncp                     

In [8]:
# Preview the first five rows of the dummy-encoded testing set to inspect which
# dummy columns it contains (this cell only displays; it adds no column).
test_df.head(n=5)

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,1,0,0,1,1,0,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,1,0,1,1,0,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,1,0,1,1,0,0,1,1,0,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,1,0,1,1,0,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,1,0,1,0,1,0,1


In [9]:
# Look for testing rows whose debt_settlement_flag_N dummy is 0. An empty result
# means every testing row carries the "N" value, which is why get_dummies produced
# no debt_settlement_flag_Y column for the testing set.
test_df.loc[test_df['debt_settlement_flag_N'].eq(0)]

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,target_high_risk,target_low_risk


In [10]:
# Reload both raw CSVs: the exploratory cells above overwrote train_df/test_df
# with dummy-encoded frames, and the modelling cells need the original 'target'
# column back.
train_df, test_df = (
    pd.read_csv(Path(csv_file))
    for csv_file in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [11]:
# Training features: remove the label column, then one-hot encode whatever
# categorical columns remain.
X_train = pd.get_dummies(train_df.drop(columns=['target']))

# Training labels: encode the 'target' values as integers.
y_train = LabelEncoder().fit_transform(train_df['target'])
y_train

array([1, 1, 1, ..., 0, 0, 0])

In [12]:
# Testing features: remove the label column, then one-hot encode whatever
# categorical columns remain.
X_test = pd.get_dummies(test_df.drop(columns=['target']))

# Testing labels: encode with an encoder fitted on the TRAINING labels so both
# splits share one label -> integer mapping. A fresh encoder fitted on the testing
# labels only agrees with the training mapping when both splits happen to contain
# exactly the same classes. With this form, a label unseen in training raises
# ValueError instead of being silently numbered differently.
y_test = LabelEncoder().fit(train_df['target']).transform(test_df['target'])
y_test

array([1, 1, 1, ..., 0, 0, 0])

In [13]:
# Align the testing features with the training features instead of hard-coding a
# single column name. reindex adds every dummy column the testing set lacks
# (here: debt_settlement_flag_Y) filled with 0, and puts the columns in the same
# order as X_train, which the scaler and models fitted on X_train expect.
# Note: a column present only in X_test would be dropped by this step.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,0,1,0,1,1,0,0,1,1,0
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,1,0,1,1,0,1,0,1,0
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,1,0,1,1,0,0,1,1,0
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,1,1,0,1,1,0,1,0,1,0
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,0,1,0,1,1,0,1,0,1,0


In [14]:
# Logistic regression on the raw (unscaled) features; max_iter is raised to 15000
# for the solver. Report the model score on each split.
classifier = LogisticRegression(max_iter=15000).fit(X_train, y_train)

print(f"Training Data Score: {classifier.score(X_train, y_train):.3f}")
print(f"Testing Data Score: {classifier.score(X_test, y_test):.3f}")

Training Data Score: 0.706
Testing Data Score: 0.559


In [15]:
# Random forest (50 trees, fixed random_state so reruns match) on the raw
# (unscaled) features. Report the model score on each split.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train, y_train)

print(f'Training Score: {clf.score(X_train, y_train):.3f}')
print(f'Testing Score: {clf.score(X_test, y_test):.3f}')

Training Score: 1.000
Testing Score: 0.621


In [16]:
# Standardize the features. The scaler is fitted on the training features only,
# and that same fitted transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [17]:
# Logistic regression refitted on the standardized features, with the same
# max_iter as the unscaled run. Report the model score on each split.
classifier = LogisticRegression(max_iter=15000).fit(X_train_scaled, y_train)

print(f"Training Data Score: {classifier.score(X_train_scaled, y_train):.3f}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test):.3f}")

Training Data Score: 0.711
Testing Data Score: 0.760


In [None]:
# Random forest refitted on the standardized features, with the same settings as
# the unscaled run (50 trees, random_state=1). Report the model score on each split.
clf = RandomForestClassifier(n_estimators=50, random_state=1)
clf.fit(X_train_scaled, y_train)

print(f'Training Score: {clf.score(X_train_scaled, y_train):.3f}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test):.3f}')