# Fitting Logistic Regression

In [1]:
import os
# Switch the working directory to the project root before the project-local
# imports below run (presumably so that Data_Helper, lendingclub, prediction and
# relative files such as "config_data.ini" resolve — confirm).
# The location can be overridden with the LENDINGCLUB_PROJECT_DIR environment
# variable so the notebook is not tied to one machine; the original path stays
# as the default, so existing setups behave exactly as before.
os.chdir(os.environ.get("LENDINGCLUB_PROJECT_DIR", "/home/jacob/Project/LendingClub"))

In [2]:
import Data_Helper as DH
import lendingclub
import prediction

import numpy as np
import pandas as pd
import json
import time
import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' would also hide any solver convergence
# warnings raised while fitting — confirm that is intended.
warnings.filterwarnings('ignore')

# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'
%load_ext autoreload
%autoreload 2

## Instantiate Objects

In [3]:
# initialize config object from "config_data.ini" (path is relative to the
# current working directory)
config = lendingclub.ConfigData("config_data.ini")

# initialize LendingClub API object
lc = lendingclub.LendingClub(config)

# initialize data transformer
transformer = DH.Transformer_full()

# initialize DataHelper
# periodStart / periodEnd are (quarter, year) string pairs; presumably they
# bound the range of loan data used for training — verify in Data_Helper.
periodStart = ("Q1", "2016")
periodEnd = ("Q1", "2018")
DataHelper = DH.DataHelper(periodStart, periodEnd, transformer, lc)

In [4]:
# set training data; subsequent cells read the result from DataHelper.training
DataHelper.set_training_dataset()

In [5]:
# Take the training data held by DataHelper and display its shape.
# NOTE: `training` is rebound in a later cell to the model-specific training
# data returned by get_data_for_model().
training = DataHelper.training
training.shape

(680943, 100)

In [6]:
# set test data from the two listed LoanStats CSV files (2018 Q2 and 2018 Q3)
DataHelper.set_test_dataset(["LoanStats_2018Q2.csv", "LoanStats_2018Q3.csv"])

## Logistic Regression - choose penalization type / solver

Based on the test runs, l1/saga, l2/saga, and l2/sag seem to be appropriate choices. We will try to fine-tune the C parameter for these three pairs.

In [8]:
# logistic regression model (created with ModelLogistic's default settings)
logistic_model = prediction.ModelLogistic()

In [9]:
# get training/test for this model
# NOTE: this rebinds `training`, which previously referred to DataHelper.training.
training, test = logistic_model.get_data_for_model(DataHelper.training, DataHelper.test)

In [10]:
# Check time - l1/saga
# Measures the elapsed time of one fit + evaluation for this penalty/solver pair.
# time.perf_counter() is used rather than time.time() because it is monotonic,
# so the measured duration cannot be distorted by system clock adjustments
# during the (multi-minute) fit.
# NOTE(review): max_iter is not set in this cell, whereas the l2/lbfgs and
# l2/saga timing cells set it to 500 — the timings may not be directly comparable.
start = time.perf_counter()

logistic_model.solver = "saga"
logistic_model.penalty = "l1"

kwargs = {"C":1.0}
model = logistic_model.fit_model(training, **kwargs)
score = logistic_model.test_model(test, model)
print("Raw test score:",score)

end = time.perf_counter()
print(end - start)

Raw test score: {'score': 0.8058677065356291, 'AUC': 0.7061951970870266}
410.2265691757202


In [11]:
# Check time - l2/sag
# Measures the elapsed time of one fit + evaluation for this penalty/solver pair.
# time.perf_counter() is used rather than time.time() because it is monotonic,
# so the measured duration cannot be distorted by system clock adjustments
# during the (multi-minute) fit.
# NOTE(review): max_iter is not set in this cell, whereas the l2/lbfgs and
# l2/saga timing cells set it to 500 — the timings may not be directly comparable.
start = time.perf_counter()

logistic_model.solver = "sag"
logistic_model.penalty = "l2"

kwargs = {"C":1.0}
model = logistic_model.fit_model(training, **kwargs)
score = logistic_model.test_model(test, model)
print("Raw test score:",score)

end = time.perf_counter()
print(end - start)

Raw test score: {'score': 0.8058677065356291, 'AUC': 0.7062939588273264}
274.58112502098083


In [12]:
# Check time - l2/lbfgs
# Measures the elapsed time of one fit + evaluation for this penalty/solver pair.
# time.perf_counter() is used rather than time.time() because it is monotonic,
# so the measured duration cannot be distorted by system clock adjustments
# during the fit.
start = time.perf_counter()

logistic_model.solver = "lbfgs"
logistic_model.penalty = "l2"
# max_iter is set on the shared model object, so it stays in effect for later
# cells that reuse `logistic_model` until it is reassigned.
logistic_model.max_iter = 500

kwargs = {"C":1.0}
model = logistic_model.fit_model(training, **kwargs)
score = logistic_model.test_model(test, model)
print("Raw test score:",score)

end = time.perf_counter()
print(end - start)

Raw test score: {'score': 0.805828184488914, 'AUC': 0.7064164772644088}
76.13113212585449


In [13]:
# Check time - l2/saga
# Measures the elapsed time of one fit + evaluation for this penalty/solver pair.
# time.perf_counter() is used rather than time.time() because it is monotonic,
# so the measured duration cannot be distorted by system clock adjustments
# during the (multi-minute) fit.
start = time.perf_counter()

logistic_model.solver = "saga"
logistic_model.penalty = "l2"
logistic_model.max_iter = 500

kwargs = {"C":1.0}
model = logistic_model.fit_model(training, **kwargs)
score = logistic_model.test_model(test, model)
print("Raw test score:",score)

end = time.perf_counter()
print(end - start)

Raw test score: {'score': 0.8058677065356291, 'AUC': 0.7061991049854892}
322.20109605789185


## Logistic Regression - CV

In [14]:
# get CV index from DataHelper, requesting 5 folds; the result is reused by
# every tune_parameters() call below
CVs = DataHelper.get_cross_validation_data(fold=5)

We first try l1 penalization with the saga solver. `max_iter` is set to 500.

In [15]:
# Start from a fresh model object so no settings carry over from the timing
# cells, then configure l1 penalty with the saga solver and max_iter of 500.
logistic_model = prediction.ModelLogistic()
logistic_model.solver = "saga"
logistic_model.penalty = "l1"
logistic_model.max_iter = 500

In [16]:
# Parameter grid for tuning: one single-key dict per candidate value of C,
# listed from the largest candidate to the smallest.
grids = [
    dict(C=strength)
    for strength in (10000, 1000, 100, 10, 1, 0.1, 0.01, 0.001)
]

In [17]:
# Evaluate every entry of `grids` using the CV data in `CVs`.
# With verbose=True the recorded output shows each grid being fitted followed
# by one {'score', 'AUC'} dict per fold.
scores = logistic_model.tune_parameters(DataHelper.training, CVs, grids, verbose=True)

fitting for {'C': 10000}
{'score': 0.8505272855574721, 'AUC': 0.6997169466561135}
{'score': 0.848457136114225, 'AUC': 0.6986528688813792}
{'score': 0.8529067076353394, 'AUC': 0.6976936435353015}
{'score': 0.8550506392653615, 'AUC': 0.6981745091581182}
{'score': 0.855354443626216, 'AUC': 0.7086484355153021}
fitting for {'C': 1000}
{'score': 0.8505272855574721, 'AUC': 0.6997169579836028}
{'score': 0.848457136114225, 'AUC': 0.6986528741456954}
{'score': 0.8529067076353394, 'AUC': 0.697693661289522}
{'score': 0.8550506392653615, 'AUC': 0.6981744279931283}
{'score': 0.855354443626216, 'AUC': 0.7086485888114603}
fitting for {'C': 100}
{'score': 0.8505272855574721, 'AUC': 0.699716917204641}
{'score': 0.848457136114225, 'AUC': 0.6986528952029611}
{'score': 0.8529067076353394, 'AUC': 0.6976935833253362}
{'score': 0.8550506392653615, 'AUC': 0.6981744373583194}
{'score': 0.855354443626216, 'AUC': 0.7086482837369278}
fitting for {'C': 10}
{'score': 0.8505272855574721, 'AUC': 0.6997170546448456}
{'

In [18]:
# find best param — presumably best_grid ranks the tuned grids by their "AUC"
# entry (confirm in prediction.ModelLogistic); the first returned element is
# not used here.
_, avg_score, param = logistic_model.best_grid(scores, "AUC")
print(avg_score)
print(param)

0.7005773020446817
{'C': 1000}


Next, we try l2 penalization with the saga solver. `max_iter` is set to 500.

In [19]:
# Fresh model object configured for l2 penalty with the saga solver and
# max_iter of 500.
logistic_model = prediction.ModelLogistic()
logistic_model.solver = "saga"
logistic_model.penalty = "l2"
logistic_model.max_iter = 500

In [20]:
# Parameter grid for tuning: one single-key dict per candidate value of C,
# listed from the largest candidate to the smallest.
grids = [
    dict(C=strength)
    for strength in (10000, 1000, 100, 10, 1, 0.1, 0.01, 0.001)
]

In [21]:
# Evaluate every entry of `grids` using the CV data in `CVs`.
# NOTE: this rebinds `scores`, replacing the results of the previous tuning run.
scores = logistic_model.tune_parameters(DataHelper.training, CVs, grids, verbose=True)

fitting for {'C': 10000}
{'score': 0.8505272855574721, 'AUC': 0.6997168907738324}
{'score': 0.848457136114225, 'AUC': 0.6986528418077519}
{'score': 0.8529067076353394, 'AUC': 0.697693655114141}
{'score': 0.8550506392653615, 'AUC': 0.6981744615517298}
{'score': 0.855354443626216, 'AUC': 0.7086485425190561}
fitting for {'C': 1000}
{'score': 0.8505272855574721, 'AUC': 0.6997169798834156}
{'score': 0.848457136114225, 'AUC': 0.6986528688813791}
{'score': 0.8529067076353394, 'AUC': 0.6976936666929805}
{'score': 0.8550506392653615, 'AUC': 0.6981745762753214}
{'score': 0.855354443626216, 'AUC': 0.7086485356890292}
fitting for {'C': 100}
{'score': 0.8505272855574721, 'AUC': 0.6997169398596199}
{'score': 0.848457136114225, 'AUC': 0.6986528583527464}
{'score': 0.8529067076353394, 'AUC': 0.6976936142022414}
{'score': 0.8550506392653615, 'AUC': 0.6981743842889028}
{'score': 0.855354443626216, 'AUC': 0.7086484468986802}
fitting for {'C': 10}
{'score': 0.8505272855574721, 'AUC': 0.6997167601301215}
{

In [22]:
# find best param for the l2/saga run, selecting on the "AUC" entry; the first
# returned element is not used here.
_, avg_score, param = logistic_model.best_grid(scores, "AUC")
print(avg_score)
print(param)

0.7005773254844252
{'C': 1000}


Finally, we try l2 penalization with the sag solver. `max_iter` is set to 500.

In [23]:
# Fresh model object configured for l2 penalty with the sag solver and
# max_iter of 500.
logistic_model = prediction.ModelLogistic()
logistic_model.solver = "sag"
logistic_model.penalty = "l2"
logistic_model.max_iter = 500

In [24]:
# Parameter grid for tuning: one single-key dict per candidate value of C,
# listed from the largest candidate to the smallest.
grids = [
    dict(C=strength)
    for strength in (10000, 1000, 100, 10, 1, 0.1, 0.01, 0.001)
]

In [25]:
# Evaluate every entry of `grids` using the CV data in `CVs`.
# NOTE: this rebinds `scores`, replacing the results of the previous tuning run.
scores = logistic_model.tune_parameters(DataHelper.training, CVs, grids, verbose=True)

fitting for {'C': 10000}
{'score': 0.8505566880978517, 'AUC': 0.6997592291533516}
{'score': 0.8483981355832203, 'AUC': 0.6986510993190274}
{'score': 0.8529657970671946, 'AUC': 0.6977580002690829}
{'score': 0.8549718998828751, 'AUC': 0.6982329089292492}
{'score': 0.8552769272508818, 'AUC': 0.7087643122330414}
fitting for {'C': 1000}
{'score': 0.8505566880978517, 'AUC': 0.6997592321740154}
{'score': 0.8483981355832203, 'AUC': 0.6986510729974454}
{'score': 0.8529657970671946, 'AUC': 0.6977579879183209}
{'score': 0.8549718998828751, 'AUC': 0.6982328714684846}
{'score': 0.8552769272508818, 'AUC': 0.7087641634902343}
fitting for {'C': 100}
{'score': 0.8505566880978517, 'AUC': 0.6997591944157175}
{'score': 0.8483981355832203, 'AUC': 0.6986510451717731}
{'score': 0.8529657970671946, 'AUC': 0.6977577856745909}
{'score': 0.8549718998828751, 'AUC': 0.6982328464946416}
{'score': 0.8552769272508818, 'AUC': 0.7087639335459971}
fitting for {'C': 10}
{'score': 0.8505664889446448, 'AUC': 0.699759045648

In [26]:
# find best param for the l2/sag run, selecting on the "AUC" entry; the first
# returned element is not used here.
_, avg_score, param = logistic_model.best_grid(scores, "AUC")
print(avg_score)
print(param)

0.7006331099807506
{'C': 10000}


## Fit Best Model & Save

In [27]:
# fit best model
# NOTE(review): the recorded CV runs report a marginally higher best AUC for
# l2/sag (0.70063 at C=10000) than for l1/saga (0.70058 at C=1000), yet the
# l1/saga configuration is the one refit here — confirm the choice is intentional.
logistic_model = prediction.ModelLogistic()
logistic_model.solver = "saga"
logistic_model.penalty = "l1"
logistic_model.max_iter = 500

In [28]:
# Rebuild the model-specific training/test data from the DataHelper datasets
# (rebinds `training` and `test`).
training, test = logistic_model.get_data_for_model(DataHelper.training, DataHelper.test)

In [29]:
# Fit with C=1000, the value reported by the l1/saga grid search, then
# evaluate the fitted model on the test data.
kwargs = {"C":1000}
model = logistic_model.fit_model(training, **kwargs)
score = logistic_model.test_model(test, model)

In [30]:
# display the test metrics ('score' and 'AUC') of the refit model
score

{'score': 0.8058677065356291, 'AUC': 0.7062003909889552}

In [31]:
# Obtain the scaler for DataHelper.training so it can be saved alongside the
# model (presumably fitted on that data — confirm in get_scaler).
scaler = logistic_model.get_scaler(DataHelper.training)

In [32]:
# save model — target file name; the same name is used again when the model is
# loaded in the prediction section
filename = "logistic_regression_20201221.sav"

In [33]:
# Write the fitted model, the scaler and the training data to `filename`.
logistic_model.save_model(model,scaler,training,filename)

Model is saved on logistic_regression_20201221.sav


## Predict listed loan by saved model

In [7]:
# predict by calling model — file name of the previously saved model
# (must match the name used when saving)
filename = "logistic_regression_20201221.sav"

In [8]:
# instantiate from beginning, this time passing the saved-model file name
new_model = prediction.ModelLogistic(filename)

In [9]:
# update from the saved model (loads from the file name given at construction,
# per the recorded "Model is loaded from ..." message)
new_model.set_model_from_file()

Model is loaded from logistic_regression_20201221.sav


In [10]:
# check model description — a text summary of the loaded model's settings
# (C, penalty, solver, max_iter, tol in the recorded output)
new_model.model_description

'Logistic Regression Object, time: 12/21/2020, C: 1000, penalty: l1, solver: saga, max_iter: 500, tol: 0.0005'

In [11]:
# get listed loan data through DataHelper and display its shape
loans = DataHelper.get_listed_loandata()
loans.shape

(19, 99)

In [12]:
# prediction
# NOTE: `loans` is rebound to the data returned by predict_model. Per the
# recorded shapes it goes from (19, 99) to (18, 138), i.e. one listing is
# dropped and columns are added — presumably by the model's preprocessing.
# Because the input name is overwritten, re-running this cell would feed the
# already-processed data back into predict_model.
loans, pred = new_model.predict_model(loans)

In [13]:
# shape of the loan data returned by predict_model
loans.shape

(18, 138)

In [14]:
# display the predictions returned by predict_model (18 values in the recorded
# output, matching the 18 rows of the returned loans)
pred

array([0.29608723, 0.08897611, 0.0356467 , 0.08745126, 0.04432517,
       0.13924438, 0.09713601, 0.12520976, 0.05125108, 0.03922336,
       0.05508913, 0.06509638, 0.12344735, 0.30743374, 0.17082775,
       0.0729315 , 0.12040713, 0.19446536])