### Predicting Commuter Transportation Choices 

In [1]:
# Import packages into the workspace for this program
from __future__ import division,print_function
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
# Load the comma-delimited Sydney commuter data set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific location; point it at
# your own copy of sydney.csv before running.
SYDNEY_CSV_PATH = r"C:\Users\User\Desktop\Datasetnexustech\Marketing Analytics\Datasets\sydney.csv"
sydney = pd.read_csv(SYDNEY_CSV_PATH)

In [3]:
# check input DataFrame: echo it to confirm the CSV parsed into the expected
# columns (pandas abbreviates the middle rows of a long frame when printing)
print(sydney)

     cartime  carcost  traintime  traincost choice
0         70       50         64         39  TRAIN
1         50      230         60         32  TRAIN
2         50       70         58         40    CAR
3         60      108         93         62    CAR
4         70       60         68         26  TRAIN
..       ...      ...        ...        ...    ...
328       27       50         52         40    CAR
329       25       25         63         80    CAR
330       50       50         80         50    CAR
331       25       25         39         20    CAR
332       35       64         95         40    CAR

[333 rows x 5 columns]


In [4]:
# Encode the response as a binary integer (TRAIN = 1, CAR = 0) for logistic
# regression, and pull the four explanatory variables out as Series.
response_to_binary = {'TRAIN':1, 'CAR':0}
y = sydney['choice'].map(response_to_binary)
# Series.map leaves NaN for any label missing from the dictionary, which would
# silently corrupt the response vector; fail fast and name the offending labels.
unmapped = y.isnull()
if unmapped.any():
    raise ValueError('unexpected choice label(s): %r'
                     % sorted(set(sydney.loc[unmapped, 'choice'].astype(str))))
cartime = sydney['cartime']
carcost = sydney['carcost']
traintime = sydney['traintime']
traincost = sydney['traincost']

In [5]:
# Assemble the design matrix for the linear predictor: a leading column of
# ones (the intercept) followed by one column per explanatory variable,
# giving one row per observation.
Intercept = np.ones(len(y), dtype = int)
x = np.column_stack([Intercept, cartime, carcost, traintime, traincost])

In [6]:
# Fit a logistic regression as a generalized linear model with a Binomial family
logistic_regression = sm.GLM(y, x, family = sm.families.Binomial())
sydney_fit = logistic_regression.fit()
print(sydney_fit.summary())
# Called with no arguments, predict() scores the estimation sample and returns
# the mean response, i.e. the fitted probability of TRAIN (coded 1). The former
# `linear = False` keyword only restated that default and is deprecated in
# statsmodels (it triggers a FutureWarning), so it is omitted here.
sydney['train_prob'] = sydney_fit.predict()

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 choice   No. Observations:                  333
Model:                            GLM   Df Residuals:                      328
Model Family:                Binomial   Df Model:                            4
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -136.32
Date:                Tue, 30 Aug 2022   Deviance:                       272.63
Time:                        10:26:27   Pearson chi2:                     326.
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.4440      0.585     -2.468      0.0

In [8]:
# function to convert probability to choice prediction
def prob_to_response(response_prob, cutoff):
    """Translate a probability into a predicted commuting mode.

    Returns 'TRAIN' when response_prob is strictly greater than cutoff,
    and 'CAR' otherwise (including when the two are equal).
    """
    return 'TRAIN' if response_prob > cutoff else 'CAR'


In [9]:
# Classify every observation as TRAIN or CAR by comparing its fitted
# probability with a 0.50 cutoff, and store the label in a new column.
fitted_probs = sydney['train_prob']
sydney['choice_pred'] = fitted_probs.apply(prob_to_response, cutoff = 0.50)

In [26]:
# evaluate performance of logistic regression model
# obtain confusion matrix (rows = predicted choice, columns = actual choice)
# and proportion of observations correctly predicted
cmat = pd.crosstab(sydney['choice_pred'],sydney['choice'])
# Read the cells by label from a copy padded to the full 2 x 2 layout.
# Positional .iloc indexing raises IndexError when one class is never
# predicted, because crosstab then returns a table with a single row.
labels = ['CAR', 'TRAIN']
cmat_full = cmat.reindex(index = labels, columns = labels, fill_value = 0)
a = float(cmat_full.loc['CAR', 'CAR'])      # predicted CAR,   actual CAR
b = float(cmat_full.loc['CAR', 'TRAIN'])    # predicted CAR,   actual TRAIN
c = float(cmat_full.loc['TRAIN', 'CAR'])    # predicted TRAIN, actual CAR
d = float(cmat_full.loc['TRAIN', 'TRAIN'])  # predicted TRAIN, actual TRAIN
n = a + b + c + d
predictive_accuracy = (a + d)/n
print(cmat)
# NOTE: despite the label, the value printed is a proportion, not a percentage
print('\n Percentage Correctly Predicted',\
      round(predictive_accuracy, 3), "\n")

choice       CAR  TRAIN
choice_pred            
CAR          155     30
TRAIN         28    120

 Percentage Correctly Predicted 0.826 

