# Lending Club Preprocessing / Training Part 1

In [16]:
import numpy as np
import pandas as pd
import pickle
import datetime

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings(action = 'ignore')

In [9]:
with open('lending_club_missingbals_pick.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [10]:
data.shape

(2260668, 109)

Setting up correct promotion types for StandardScaler. Not doing this will output an Invalid Promotion Type error for Date Time formats.

Considering dropping time columns, purpose of the model is to predict likely for loan default based on features, we might want this to be time invariant. Time maybe spot trends over years.

In [49]:
for col in data.dtypes[data.dtypes == '<M8[ns]'].index:
    data[col] = (data.loc[:, col] - pd.to_datetime('1970-01-01')).astype('int64') / 1e9 / 60 / 60 / 24

In [50]:
data.reset_index(drop = True, inplace = True)

In [77]:
def preprocess_inputs(df):
    df = df.copy()
    
    #label encode
    df = label_encode(df)
    
    #Binary Encode
    df = dummy_encoding(df)
    
    #split into x and y
    y = df['TARGET']
    X = df.drop('TARGET', axis = 1)
    
    #Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 1)
    
    #Scale X
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = pd.DataFrame(sc.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(sc.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    
    return X_train, X_test, y_train, y_test

# Examining Class Imbalance

In [78]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [79]:
y_train.value_counts()

0    1808505
1         29
Name: TARGET, dtype: int64

In [80]:
fig = px.pie(
    y_train.value_counts(),
    values = 'TARGET',
    names = ['non_default', 'default'],
    title = 'Class Distribution',
    width = 500
)
fig.show()

# Training a Model (Imbalanced Classes)

In [81]:
def evaluate_model(model, X_test, y_test):
    
    acc = model.score(X_test, y_test)
    print("Accuracy: {:.2f}%".format(acc * 100))
    
    y_pred = model.predict(X_test)
    
    cm = confusion_matrix(y_test, y_pred, labels=['non_default', 'default'])
    clr = classification_report(y_test, y_pred, labels=['non_default', 'default'])
    
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
    plt.xticks(ticks=[0.5, 1.5], labels=['non_default', 'default'])
    plt.yticks(ticks=[0.5, 1.5], labels=['non_default', 'default'])
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()
    
    print("Classification Report:\n----------------------\n", clr)

In [82]:
model = LogisticRegression()
model.fit(X_train, y_train)

evaluate_model(model, X_test, y_test)

Accuracy: 100.00%


ValueError: At least one label specified must be in y_true

# Label Encoding and One Hot Encoding:

Label Encoding will give categories an arbitrary ordering so it only makes sense with categories that have 2 unique values for example, yes and no. With categorical values that have 2 or more unique values one hot encoding is the better option. 

The question here is whether or not I am going to split into train/test, or just look at training accuracy. If I'm planning on splitting, I want to do so before imputing the data, to avoid leaking any info from the test set into the model. But since this data has already be imputed it might be interesting to compare the results.

In [40]:
#Label Encoding
def label_encode(df):
    count = 0

    for col in df:
        if df[col].dtype == 'object':
            if len(list(df[col].unique())) <= 2:     
                le = preprocessing.LabelEncoder()
                df[col] = le.fit_transform(df[col])
                count += 1

    return df

In [59]:
#One Hot Encoding
def dummy_encoding(df):
    df = pd.get_dummies(df)
    
    return df

# Conducting Train Test Split:

Here I am normalizing or standarizing the data so it has a mean of 0 and a standard deviation of 1

In [56]:
def scaling_data(df):
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = pd.DataFrame(sc.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(sc.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    return df

# Oversampling the Training Set using SMOTE:

In [15]:
from imblearn.over_sampling import SMOTE

In [16]:
sm = SMOTE(random_state=12, sampling_strategy = 1.0)
x_train_r, y_train_r = sm.fit_resample(X_train, y_train)

In [17]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(C = 0.0001,random_state=21)

log_reg.fit(x_train_r, y_train_r)

LogisticRegression(C=0.0001, random_state=21)

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix