In [1]:
# PS2 - CE264
# GSI: Mustapha Harb - Mengqiao Yu
# Good Reference for this homework: 
# https://github.com/timothyb0912/pylogit/blob/master/examples/notebooks/Main%20PyLogit%20Example.ipynb

# importing the required libraries
from collections import OrderedDict    # For recording the model specification 

import pandas as pd                    # For file input/output
import numpy as np                     # For vectorized math operations

import pylogit as pl                   # For MNL model estimation and
                                       # conversion from wide to long format
import warnings
# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation and numerical warnings emitted by the libraries.
warnings.filterwarnings("ignore")

In [2]:
# Read the air-travel survey CSV into a dataframe.
# The path is relative to the notebook's working directory.
data_path = '../data/raw/Air_Travel_Survey.csv'
data_01 = pd.read_csv(data_path, sep=",")

In [3]:
# List the column names of the survey dataframe
data_01.columns

Index([u'personID', u'gender', u'age', u'purpose', u'income', u'classTicket',
       u'payment', u'AA_FFP', u'CO_FFP', u'DL_FFP', u'B6_FFP', u'WN_FFP',
       u'UA_FFP', u'US_FFP', u'a1aircraft', u'a1departMAM', u'a1connections',
       u'a1travtime', u'a1arriveMAM', u'a1timediff', u'a1performance',
       u'a1fare', u'a1airline', u'a2aircraft', u'a2departMAM',
       u'a2connections', u'a2travtime', u'a2arriveMAM', u'a2timediff',
       u'a2performance', u'a2fare', u'a2airline', u'a1_AV', u'a2_AV',
       u'choice', u'choiceSituationID'],
      dtype='object')

In [4]:
# Preview the first 20 rows of the survey dataframe
data_01.head(20)

Unnamed: 0,personID,gender,age,purpose,income,classTicket,payment,AA_FFP,CO_FFP,DL_FFP,...,a2travtime,a2arriveMAM,a2timediff,a2performance,a2fare,a2airline,a1_AV,a2_AV,choice,choiceSituationID
0,1,1,5,1,10,3,1,1,1,1,...,895,1020,120,60,450,2,1,1,1,1
1,1,1,5,1,10,3,1,1,1,1,...,985,900,0,90,1050,7,1,1,1,2
2,1,1,5,1,10,3,1,1,1,1,...,980,1020,120,60,600,2,1,1,2,3
3,1,1,5,1,10,3,1,1,1,1,...,960,960,60,80,600,6,1,1,1,4
4,1,1,5,1,10,3,1,1,1,1,...,1130,900,0,90,900,7,1,1,2,5
5,1,1,5,1,10,3,1,1,1,1,...,1155,960,60,80,450,6,1,1,1,6
6,1,1,5,1,10,3,1,1,1,1,...,810,840,-60,70,900,8,1,1,1,7
7,1,1,5,1,10,3,1,1,1,1,...,1065,840,-60,70,1050,8,1,1,1,8
8,2,2,6,3,6,1,1,1,1,1,...,200,1010,60,60,600,3,1,1,1,9
9,2,2,6,3,6,1,1,1,1,1,...,275,1070,120,80,700,4,1,1,2,10


## Overview of binary logit in Python

### Step 0: Load the data
### Step 1: Define necessary variables and convert the data to long format.
### Step 2: Variable creations and transformations
### Step 3: Model specification
### Step 4: Run the model and analyze the results

## Step 1: Define necessary variables and convert the data to long format.
We need to specify five elements to construct a long format dataset in order to run the model under PyLogit.

(1.1) Individual related variables: the columns in the dataset that are specific to a given individual, regardless of what alternative is being considered. (e.g. gender)

(1.2) Alternative related variables (e.g. travel time).

(1.3) Alternative availabilities.

(1.4) Alternative and observation ids.

(1.5) The choice column.

In [5]:
# (1.1)
# Individual-specific variables: the first 14 columns of the dataframe.
# These take the same value for a row regardless of the alternative.
ind_variables = list(data_01.columns[:14])
print("ind_variables are:\n{}".format(ind_variables))

ind_variables are:
['personID', 'gender', 'age', 'purpose', 'income', 'classTicket', 'payment', 'AA_FFP', 'CO_FFP', 'DL_FFP', 'B6_FFP', 'WN_FFP', 'UA_FFP', 'US_FFP']


In [6]:
# (1.2)
# Variables that vary across individuals and across some or all alternatives.
# Outer keys are the column names to be used in the long-format dataframe.
# Each inner dictionary maps an alternative id to the wide-format column
# that holds that variable for the given alternative:
#
#     {long_column: {alt_id: wide_column, ...}, ...}

alt_varying_variables = {
    u'aircraft_type': {1: 'a1aircraft', 2: 'a2aircraft'},
    u'departure_time': {1: 'a1departMAM', 2: 'a2departMAM'},
    u'connections': {1: 'a1connections', 2: 'a2connections'},
    u'travel_time': {1: 'a1travtime', 2: 'a2travtime'},
    u'arrival_time': {1: 'a1arriveMAM', 2: 'a2arriveMAM'},
    u'time_diff': {1: 'a1timediff', 2: 'a2timediff'},
    u'performance': {1: 'a1performance', 2: 'a2performance'},
    u'fare': {1: 'a1fare', 2: 'a2fare'},
    u'airline': {1: 'a1airline', 2: 'a2airline'},
}


In [7]:
# (1.3)
# Availability variables.
# Keys are the alternative ids; values are the names of the columns that
# flag whether the given alternative is available ('a1_AV', 'a2_AV').
availability_variables = {alt_id: 'a{}_AV'.format(alt_id)
                          for alt_id in (1, 2)}

In [8]:
# (1.4)
# Name of the column that will hold the alternative id of each row in the
# long-format dataframe.
custom_alt_id = "alternative_id"

# Name of the existing column used as the observation id. Each choice
# situation is treated as its own observation, which ignores the fact that
# this is a panel/repeated-observations dataset (several choice situations
# per person).
obs_id_column = "choiceSituationID"

In [9]:
# (1.5)
# Name of the column that records the chosen alternative
choice_column = "choice"

In [10]:
# Convert the wide-format data to long format, using the pieces defined in
# steps (1.1)-(1.5). The new alternative-id column is named by custom_alt_id.
data_long = pl.convert_wide_to_long(
    data_01,
    ind_variables,
    alt_varying_variables,
    availability_variables,
    obs_id_column,
    choice_column,
    new_alt_id_name=custom_alt_id
)

# Show the first five long-format rows, transposed so that each variable
# is displayed on its own line.
data_long.head(5).transpose()

Unnamed: 0,0,1,2,3,4
choiceSituationID,1,1,2,2,3
alternative_id,1,2,1,2,1
choice,1,0,1,0,0
personID,1,1,1,1,1
gender,1,1,1,1,1
age,5,5,5,5,5
purpose,1,1,1,1,1
income,10,10,10,10,10
classTicket,3,3,3,3,3
payment,1,1,1,1,1


## Step 2: Variable creations and transformations

In [11]:
# Create scaled variables so the estimated coefficients are of similar magnitudes.

# Travel time: divide by 60 to convert the raw units (minutes) to hours.
data_long["travel_time_hrs"] = data_long["travel_time"] / 60.0

# Fare: divide by 100 to convert the raw units ($) to hundreds of dollars.
data_long["fare_100$"] = data_long["fare"] / 100.0

# Example of an interaction term, left disabled because the variables
# defined for the long-format data do not include a "legroom" column:
# data_long["interaction_term"] = data_long["gender"] * data_long["legroom"]

# Dummy variable: 1 when the fare is above $500, 0 otherwise.
# The threshold is applied to the raw dollar column. "fare_100$" is expressed
# in hundreds of dollars, so comparing it with 500 would only flag fares
# above $50,000 and leave the dummy at 0 for ordinary fares.
data_long["fare_over500$"] = (data_long["fare"] > 500).astype(int)

## Step 3: Model specification

In [12]:
# Specify the utility equations.
#
# NOTE: - The specification and the variable names must both be ordered
#         dictionaries.
#       - Keys should be variables within the long-format dataframe.
#         The sole exception to this is the "intercept" key.
#       - In the specification dictionary, each value is a list of integers
#         or a list of lists of integers. The integers are the alternative
#         IDs of the alternatives whose utility specification the
#         explanatory variable enters. Alternatives grouped together in an
#         inner list share a common coefficient for that variable.

# Case A (active): alternative-specific coefficients for both variables.
basic_specification = OrderedDict([
    ("travel_time_hrs", [1, 2]),
    ("fare_100$", [1, 2]),
])
basic_names = OrderedDict([
    ("travel_time_hrs", ['Travel Time, units:hrs Alternative 1',
                         'Travel Time, units:hrs Alternative 2']),
    ("fare_100$", ['Fare, units:hundredth Alternative 1',
                   'Fare, units:hundredth Alternative 2']),
])

# Alternative forms for the travel-time entry (uncomment one to use it):
#
# Case B: generic: hw2
# basic_specification["travel_time_hrs"] = [[1, 2]]
# basic_names["travel_time_hrs"] = ['Travel Time, units:hrs']
#
# Case C: only for one
# basic_specification["travel_time_hrs"] = [1]
# basic_names["travel_time_hrs"] = ['Travel Time, units:hrs Alternative 1']

# Alternative-specific constants (disabled):
# basic_specification["intercept"] = [1, 2]
# basic_names["intercept"] = ['ASC Alternative 1',
#                             'ASC Alternative 2']

## Now! Let's estimate the model and show the results

In [13]:
# Build the binary logit model object (an MNL with two alternatives) from
# the long-format data and the specification above. Estimation itself is
# performed separately by calling fit_mle on this object.
air_travel_logit = pl.create_choice_model(
    data=data_long,
    model_type="MNL",
    specification=basic_specification,
    names=basic_names,
    obs_id_col=obs_id_column,
    alt_id_col=custom_alt_id,
    choice_col=choice_column
)

In [14]:
# Estimate the model by maximum likelihood, starting every coefficient at 0.
# The vector of initial values needs one entry per estimated coefficient,
# i.e. one per column of the model's design matrix. Taking the length from
# the model (instead of hard-coding 4) keeps this cell valid when the
# specification changes, e.g. when switching between Case A, B and C.
num_params = air_travel_logit.design.shape[1]
air_travel_logit.fit_mle(np.zeros(num_params))

Log-likelihood at zero: -4,868.6658
Initial Log-likelihood: -4,868.6658
Estimation Time for Point Estimation: 0.04 seconds.
Final log-likelihood: -4,057.1481


In [15]:
# Display the estimation results as a statsmodels-style summary table
air_travel_logit.get_statsmodels_summary()

0,1,2,3
Dep. Variable:,choice,No. Observations:,7024.0
Model:,Multinomial Logit Model,Df Residuals:,7020.0
Method:,MLE,Df Model:,4.0
Date:,"Mon, 09 Mar 2020",Pseudo R-squ.:,0.167
Time:,20:13:51,Pseudo R-bar-squ.:,0.166
AIC:,8122.296,Log-Likelihood:,-4057.148
BIC:,8149.724,LL-Null:,-4868.666

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
"Travel Time, units:hrs Alternative 1",-0.5512,0.021,-26.079,0.000,-0.593,-0.510
"Travel Time, units:hrs Alternative 2",-0.5423,0.021,-25.982,0.000,-0.583,-0.501
"Fare, units:hundredth Alternative 1",-0.4692,0.021,-22.773,0.000,-0.510,-0.429
"Fare, units:hundredth Alternative 2",-0.5247,0.022,-24.112,0.000,-0.567,-0.482


In [16]:
# Display the estimated variance-covariance matrix of the coefficients
air_travel_logit.cov

Unnamed: 0,"Travel Time, units:hrs Alternative 1","Travel Time, units:hrs Alternative 2","Fare, units:hundredth Alternative 1","Fare, units:hundredth Alternative 2"
"Travel Time, units:hrs Alternative 1",0.000447,0.000424,8.2e-05,0.000113
"Travel Time, units:hrs Alternative 2",0.000424,0.000436,9.5e-05,8.4e-05
"Fare, units:hundredth Alternative 1",8.2e-05,9.5e-05,0.000425,0.000399
"Fare, units:hundredth Alternative 2",0.000113,8.4e-05,0.000399,0.000474


In [17]:
# Exploratory: list every attribute and method of the fitted model object
dir(air_travel_logit)

['__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_mixing_variable_names_to_individual_vars',
 '_adjust_inferential_results_for_parameter_constraints',
 '_check_result_dict_for_needed_keys',
 '_create_fit_summary',
 '_create_results_summary',
 '_record_values_for_fit_summary_and_statsmodels',
 '_store_basic_estimation_results',
 '_store_generic_inference_results',
 '_store_inferential_results',
 '_store_optional_parameters',
 'aic',
 'alt_IDs',
 'alt_id_col',
 'bic',
 'bse',
 'check_param_list_validity',
 'chi_square',
 'choice_col',
 'choices',
 'coefs',
 'conf_int',
 'cov',
 'data',
 'design',
 'design_3d',
 'df_model',
 'df_resid',
 'estimation_message',
 'estimation_success',
 'fisher_information',
 'fit_mle',
 'fit_summary',
 'fitted_probs',
 'get_mappings_fo