<a href="https://colab.research.google.com/github/wardspan/Oreilly_Stuff/blob/master/O'Reilly_Class_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Import Python Libraries
import numpy as np
import pandas as pd
from datetime import datetime

import pandas_datareader.data as pdr

import matplotlib.pyplot as plt
plt.style.use('seaborn')

  from pandas.util.testing import assert_frame_equal


In [0]:
#Import data
start = datetime(1982, 1, 1)
end = datetime(2020, 2, 29)

recession = pdr.DataReader('USREC', 'fred', start, end) #NBER business cycle classification
yield_curve = pdr.DataReader('T10Y3MM', 'fred', start, end) #Difference between the 3 month and 10 year treasury yields
unemployment = pdr.DataReader('UNRATE', 'fred', start, end) #Unemployment rate
industrial_capacity = pdr.DataReader('TCU', 'fred', start, end) #Total industrial capacity utilization

In [0]:
#Create target dataframe
target = recession[1:]
target.tail()

Unnamed: 0_level_0,USREC
DATE,Unnamed: 1_level_1
2019-10-01,0
2019-11-01,0
2019-12-01,0
2020-01-01,0
2020-02-01,0


In [0]:
#Create features dataframe
features = pd.DataFrame()
features['curve'] = yield_curve['T10Y3MM'].diff()
features['unemployment'] = unemployment['UNRATE'].diff()
features['industrial'] = industrial_capacity['TCU'].diff()
features = features.dropna()
features.tail()

Unnamed: 0_level_0,curve,unemployment,industrial
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-01,0.26,0.1,-0.4455
2019-11-01,0.21,-0.1,0.5929
2019-12-01,0.05,0.0,-0.4387
2020-01-01,-0.08,0.1,-0.4437
2020-02-01,-0.25,-0.1,0.2603


In [0]:
#Create logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

target = np.ravel(target)

scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

classifier = LogisticRegression(random_state = 0)
model = classifier.fit(features_standardized, target)
model.coef_

array([[ 0.60074403,  1.56151543, -1.12040406]])

In [0]:
#Use new data to predict recession
new = [[0.61, 0.9, -4.2351]]
model.predict(new)

array([1])

In [0]:
#Quantify probability of recession
model.predict_proba(new)


array([[0.06288945, 0.93711055]])

In [0]:
#Regularize logistic regression model with C hyperparameter
regularized_classifier = LogisticRegression(penalty='l2', C=0.001, random_state = 0)
regularized_model = classifier.fit(features_standardized, target)
regularized_model.coef_

array([[ 0.60074403,  1.56151543, -1.12040406]])

# Model Evaluation

In [0]:
#Split dataset into train and test subsets. Test size is 25% of the total dataset
from sklearn.model_selection import train_test_split

features_train, features_test, target_train, target_test = train_test_split(features, target, test_size = 0.25, random_state=0)
classifier.fit(features_train, target_train)
print("Training score:", classifier.score(features_train, target_train))
print("Testing score:", classifier.score(features_test, target_test))

Training score: 0.9181286549707602
Testing score: 0.9304347826086956


In [0]:
#Use K-fold cross validation (default folds = 5, default scoring metric = accuracy) 
from sklearn.model_selection import cross_val_score

cross_val_score(classifier, features_standardized, target)

array([0.93478261, 0.88043478, 0.91208791, 0.91208791, 0.98901099])

In [0]:
#Use accuracy = (TP + TN)/(TP+TN+FP+FN) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "accuracy")

array([0.93478261, 0.88043478, 0.91208791, 0.91208791, 0.98901099])

In [0]:
#Use precision = TP/(TP+FP) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "precision")

array([0.8       , 0.25      , 0.66666667, 0.6       , 1.        ])

In [0]:
#Use Sensitivity = TP/(TP + FN) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "recall")

array([0.44444444, 0.11111111, 0.22222222, 0.33333333, 0.875     ])

In [0]:
#Use F measure = 2*(precision*recall/(precision+recall)) as the evaluation metric
cross_val_score(classifier, features_standardized, target, scoring = "f1")

array([0.57142857, 0.15384615, 0.33333333, 0.42857143, 0.93333333])