In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

### Logistic Regression
The dataset:



In [4]:
# Load the shopper dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('shopper.csv')

Next, I preprocess the data, converting all categorical and binary variables into numerical dummy variables.

In [5]:
### Encode the non-numeric columns as dummy variables.
# 'Revenue' and 'Weekend' keep a single indicator each (drop_first=True); the
# rename assumes the surviving dummy column is labelled True, i.e. the source
# columns are boolean.
revenue = pd.get_dummies(data['Revenue'], drop_first=True).rename(columns={True: 'Revenue'})
weekend = pd.get_dummies(data['Weekend'], drop_first=True).rename(columns={True: 'Weekend'})

# For 'VisitorType' the 'Other' level is the one removed, so it acts as the
# reference category instead of whichever level happens to sort first.
visitor_type = pd.get_dummies(data['VisitorType']).drop(columns=['Other'])
month = pd.get_dummies(data['Month'], drop_first=True)

### Swap the original columns for their encoded counterparts.
# The remaining columns come first, followed by the dummy frames in the order
# Revenue, Weekend, VisitorType, Month.
encoded_sources = ['Revenue', 'Weekend', 'VisitorType', 'Month']
data = pd.concat(
    [data.drop(columns=encoded_sources), revenue, weekend, visitor_type, month],
    axis=1,
)

Divide the dataset into training and validation sets with a 60%/40% ratio.

In [10]:
# Features are every column except the target; the target is the 'Revenue' indicator.
x = data.drop('Revenue',axis=1)
y = data['Revenue']

# 60% training / 40% validation. A fixed random_state pins the shuffle so the
# split -- and every metric computed from it below -- is identical on each
# Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.40, random_state=42
)

Training a logistic regression classifier on the training dataset using scikit-learn.

In [11]:
# The 'sag' solver shuffles the training data stochastically, so random_state
# is fixed to make the fitted coefficients repeatable between runs.
# NOTE(review): no feature scaling is applied in the cells above; scikit-learn
# only guarantees fast 'sag' convergence on similarly scaled features, which is
# presumably why max_iter had to be raised to 20000 -- consider standardising.
logmodel = LogisticRegression(max_iter=20000, solver='sag', random_state=42)
logmodel.fit(x_train,y_train) 

LogisticRegression(max_iter=20000, solver='sag')

Now, generate predictions from the classifier for the validation dataset and evaluate its predictive performance in terms of `sensitivity` and `specificity`.

In [12]:
# Score the validation split. The four-way unpack below requires a 2x2
# confusion matrix, i.e. exactly two classes across y_test and y_pred.
y_pred = logmodel.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

rule = "------------------------"

print("")
print(rule)
# Sensitivity: share of actual positives that were predicted positive.
print("Sensitivity:", tp / (tp + fn))
print(rule)
# Specificity: share of actual negatives that were predicted negative.
print("Specificity:", tn / (tn + fp))
print(rule)
# Raw confusion-matrix counts, one per line.
for label, count in (
    ("True Positives (tp):", tp),
    ("False Positives (fp):", fp),
    ("False Negatives(fn):", fn),
    ("True Negatives (tn):", tn),
):
    print(label, count)
print(rule)
print("classification_report:")
print("")
print(classification_report(y_test, y_pred))


------------------------
Sensitivity: 0.4076015727391874
------------------------
Specificity: 0.9712161189733749
------------------------
True Positives (tp): 311
False Positives (fp): 120
False Negatives(fn): 452
True Negatives (tn): 4049
------------------------
classification_report:

              precision    recall  f1-score   support

           0       0.90      0.97      0.93      4169
           1       0.72      0.41      0.52       763

    accuracy                           0.88      4932
   macro avg       0.81      0.69      0.73      4932
weighted avg       0.87      0.88      0.87      4932



In [16]:
from sklearn.metrics import confusion_matrix
# Validation-set predictions from the fitted logistic regression model.
predictions = logmodel.predict(x_test)

def summary(y, yhat):
    """Print the sensitivity and specificity of binary predictions.

    Parameters
    ----------
    y : array-like
        True labels.
    yhat : array-like
        Predicted labels, aligned with ``y``.

    Returns
    -------
    None
        The two rates are printed, not returned.

    Notes
    -----
    The confusion matrix is unpacked into exactly four cells, so ``y`` and
    ``yhat`` together must contain exactly two classes.
    """
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y, yhat).ravel()

    # True-positive rate: detected positives over all actual positives.
    sensitivity = true_pos / (true_pos + false_neg)
    print("Sensitivity: ", sensitivity)

    # True-negative rate: detected negatives over all actual negatives.
    specificity = true_neg / (true_neg + false_pos)
    print("Specificity: ", specificity)

summary(y_test, predictions)


Sensitivity:  0.4076015727391874
Specificity:  0.9712161189733749


### Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split. random_state is fixed because the
# forest's bootstrap sampling and per-split feature sampling are random, which
# otherwise makes the reported metrics change on every run.
rfmodel = RandomForestClassifier(random_state=42)
rfmodel.fit(x_train, y_train)

rfpred = rfmodel.predict(x_test)
# No trailing comma here: `summary(...),` built a one-element tuple, which made
# the cell echo a spurious `(None,)` after the printed metrics.
summary(y_test, rfpred)

Sensitivity:  0.5897771952817824
Specificity:  0.957543775485728


(None,)

### K-Nearest Neighbors Classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier; k lives in a named variable so it is easy to tune.
n_neighbors = 3

# fit() returns the estimator itself, so construction and training can be chained.
knnmodel = KNeighborsClassifier(n_neighbors=n_neighbors).fit(x_train, y_train)

# Evaluate on the validation split with the shared summary() helper.
knnpred = knnmodel.predict(x_test)
summary(y_test, knnpred)

Sensitivity:  0.34338138925294887
Specificity:  0.9405133125449748
