<a href="https://colab.research.google.com/github/zoramardjoko/pgss2021/blob/main/ContinuationLogisticRegressionInPython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect Google Drive to this VM


In [None]:
# Colab-only step: attach Google Drive so that later cells can read files
# stored under the mount point.
from google.colab import drive

gdrive_mount_point = '/content/gdrive/'
drive.mount(gdrive_mount_point)

Mounted at /content/gdrive/


# Read our data file on the British seatbelt study



In [None]:
import pandas as pd

# British seat-belt study records, read from the mounted Google Drive.
seatbelt_csv_path = "/content/gdrive/MyDrive/PGSS2021/L2-792021/britishSeatBeltStudy.csv"
df = pd.read_csv(seatbelt_csv_path)

# Preview the first five rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,DriversKilled,drivers,front,rear,kms,PetrolPrice,VanKilled,law,Date
0,107,1687,867,269,9059,0.102972,12,0,1969-01-01
1,97,1508,825,265,7685,0.102363,6,0,1969-02-01
2,102,1507,806,319,9963,0.102062,12,0,1969-03-01
3,87,1385,814,407,10955,0.100873,8,0,1969-04-01
4,119,1632,991,454,11823,0.10102,10,0,1969-05-01


# Adjust the column types here and check the data frame for information on column-specific types

In [None]:
# Store `law` as a categorical label instead of a plain integer column.
law_as_category = df["law"].astype("category")
df["law"] = law_as_category

# Summarise column dtypes and non-null counts to verify the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   DriversKilled  192 non-null    int64   
 1   drivers        192 non-null    int64   
 2   front          192 non-null    int64   
 3   rear           192 non-null    int64   
 4   kms            192 non-null    int64   
 5   PetrolPrice    192 non-null    float64 
 6   VanKilled      192 non-null    int64   
 7   law            192 non-null    category
 8   Date           192 non-null    object  
dtypes: category(1), float64(1), int64(6), object(1)
memory usage: 12.4+ KB


# Load scikit-learn's logistic regression model and fit it to our dataset, defining X for the regressors and y for the response variable (binary in this case)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Regressors: the two death-count columns used to predict `law`.
feature_columns = ['DriversKilled', 'VanKilled']
X = df[feature_columns]  # Features

# Response: the binary `law` column, kept as a categorical label.
y = df['law'].astype('category')  # Labels

# Fit a logistic regression with scikit-learn's default settings; the final
# expression displays the fitted estimator as the cell output.
myModel = LogisticRegression()
myModel.fit(X, y)


0

# Evaluate the fitted model on some test examples

In [None]:
# Spot-check the fitted model on two hand-picked inputs, each given as
# [DriversKilled, VanKilled].
#
# Each input is wrapped in a one-row DataFrame carrying the training column
# names: scikit-learn >= 1.0 emits an "X does not have valid feature names"
# UserWarning when a model fitted on a DataFrame predicts from a bare list.
for example in ([100, 10], [10, 1]):
    example_frame = pd.DataFrame([example], columns=X.columns)
    # predict returns an array with one label per row; take the single label.
    y_pred = myModel.predict(example_frame)[0]
    print("input: " + str(example))
    print("output: " + str(y_pred))

input: [100, 10]
output: 0
input: [10, 1]
output: 1


# Test the model's ability to predict many records at once by passing our training matrix, X, to the model.  Then prepare a confusion matrix to compare predicted counts against actual counts for each class (law=1 or law=0)

In [None]:
# Predict a label for every training row in one call, then tabulate the actual
# `law` values (rows) against the predicted values (columns).
y_pred = myModel.predict(X)
pd.crosstab(index=y, columns=y_pred)

col_0,0,1
law,Unnamed: 1_level_1,Unnamed: 2_level_1
0,166,3
1,16,7


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix as a raw array: rows are actual classes, columns are
# predicted classes.
confusion_matrix_0 = confusion_matrix(y_true=y, y_pred=y_pred)
confusion_matrix_0

array([[166,   3],
       [ 16,   7]])

# Let's compute some statistics on the batch prediction accuracy using the default probability threshold for law=1 (0.5)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the predictions currently in y_pred.
cr = classification_report(y_true=y, y_pred=y_pred)

print(cr)

              precision    recall  f1-score   support

           0       0.91      0.98      0.95       169
           1       0.70      0.30      0.42        23

    accuracy                           0.90       192
   macro avg       0.81      0.64      0.69       192
weighted avg       0.89      0.90      0.88       192



# Optimize the model-specific probability threshold to improve the model prediction stats

In [None]:
# Swap the default 0.5 cut-off for a lower probability threshold and re-score
# the training set. The fitted model is not changed; only the decision rule
# applied to its predicted probabilities differs.

from sklearn.metrics import classification_report

# Column 1 of predict_proba holds the estimated probability of law=1.
# Note: y_pred holds probabilities here, not class labels.
y_pred = myModel.predict_proba(X)[:, 1]

law_threshold = 0.15
predicted_law = y_pred > law_threshold

cr = classification_report(y, predicted_law)
print(confusion_matrix(y, predicted_law))
print(cr)

[[137  32]
 [  5  18]]
              precision    recall  f1-score   support

           0       0.96      0.81      0.88       169
           1       0.36      0.78      0.49        23

    accuracy                           0.81       192
   macro avg       0.66      0.80      0.69       192
weighted avg       0.89      0.81      0.83       192



In [None]:
# Apply the cut-off on the log scale instead, again scoring the training set.
# predict_log_proba returns log(P(law=1)) -- the natural log of the
# probability -- NOT the log odds (the log odds are what decision_function
# returns). A cut-off of -2 therefore means P(law=1) > exp(-2), roughly 0.135.

from sklearn.metrics import classification_report

# Column 1 holds the log-probability of law=1.
# Note: y_pred holds log-probabilities here, not class labels.
y_pred = myModel.predict_log_proba(X)[:, 1]

log_prob_threshold = -2
predicted_law = y_pred > log_prob_threshold

cr = classification_report(y, predicted_law)
print(confusion_matrix(y, predicted_law))
print(cr)

[[135  34]
 [  5  18]]
              precision    recall  f1-score   support

           0       0.96      0.80      0.87       169
           1       0.35      0.78      0.48        23

    accuracy                           0.80       192
   macro avg       0.66      0.79      0.68       192
weighted avg       0.89      0.80      0.83       192



# Let's save the model that we prepared above, myModel, to a binary pickle file that can be reloaded and used on its own

In [None]:
import pickle

model_filename = 'british-Seatbelt-Study-model.pkl'

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; a bare open(...) call leaves the handle open until it happens to
# be garbage-collected.
with open(model_filename, 'wb') as model_file:
    pickle.dump(myModel, model_file)


In [None]:
# Load the pickled model back from disk; the context manager closes the file
# handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only unpickle files that you
# created yourself or otherwise trust.
with open('british-Seatbelt-Study-model.pkl', 'rb') as model_file:
    reloadedModel = pickle.load(model_file)

# Predict from a one-row DataFrame whose columns match the training features;
# scikit-learn >= 1.0 emits an "X does not have valid feature names"
# UserWarning when a model fitted on a DataFrame is given a bare list.
first_example = pd.DataFrame([[100, 10]], columns=['DriversKilled', 'VanKilled'])
print(reloadedModel.predict(first_example))

[0]


In [None]:
# Predict with the reloaded model for a second input, given as
# [DriversKilled, VanKilled]. The one-row DataFrame carries the training
# column names, which avoids the "X does not have valid feature names"
# UserWarning that scikit-learn >= 1.0 emits for bare lists.
second_example = pd.DataFrame([[10, 1]], columns=['DriversKilled', 'VanKilled'])
print(reloadedModel.predict(second_example))

[1]
