# Test LL SSH - Pt2

In [1]:
import pandas as pd
import os
import numpy as np
import statsmodels.discrete.discrete_model as smd
from statsmodels.tools import add_constant
import statsmodels.formula.api as smf

**Import data and format**

In [2]:
# Read the survey export; the CSV is expected to sit in the notebook's working directory.
df_raw = pd.read_csv(
    "CyclingPerceptionData-Assignment 1-2024.csv",
)
# Preview the first rows to check that the columns parsed as expected.
df_raw.head(n=5)

Unnamed: 0,ID,sex,age,OwnBikes,NumberOfBikes,HH_Adults_15+,HH_Children_Dummy,NoOf15yrsOlderBikers,BikingForAllReason,BikingForWork,...,Time from MajorBikePath_min,Time from Neasrest BikeLane_Min,Neighbourhood ID,Neighbourhood-Area_SQKM,Neighbourhood-bikeLength_KM,Neighbourhood_Population,Rank_Overall-Quality-Of-Biking,Rank_On-Street-BikeLanes-Quality,Rank_OffStreet-BikeLanes-Quality,Rank_Bike-Parking-Quality
0,40612,0,43,1,2,2,1,1,1,0,...,15,15,124,3.227141,0.763362,22387,3,3,4,5
1,40472,1,29,1,2,4,1,2,1,0,...,5,10,8,2.917358,3.234139,9437,3,1,3,3
2,40897,1,33,1,4,4,1,4,3,0,...,0,0,77,5.960673,1.037788,14830,3,4,4,5
3,40594,1,34,1,3,2,2,2,2,2,...,2,0,43,1.436087,4.510762,13777,4,3,3,4
4,41451,1,24,1,3,5,1,3,1,1,...,25,10,131,37.609169,33.950589,56281,3,4,3,4


In [18]:
# Recode survey answers into 0/1 indicators on a working copy (df_raw is left untouched).
# The coding schemes below are assumptions about the questionnaire -- TODO confirm
# against the survey codebook.
df_proc = df_raw.copy()

# Every column listed here is assumed to use 1 = "yes"; all other codes
# (2 = no, 3 = unknown, ...) collapse to 0:
#   - AccessToMotorVehicle / BikeParkingAt Work: 1 = yes, 2 = no, 3 = unknown
#   - HH_Children_Dummy: assumed 1 = yes, 2 = no
#   - Home_1Rent2Own: becomes a "rents home" indicator (1 = rent, 0 = otherwise)
# Comparing against 1 explicitly (rather than taking `% 2`) keeps an unexpected odd
# code such as 3 = unknown from being silently counted as a "yes".
# The BikingFor[reason] columns are left as provided: the data preview shows values
# such as 0, so they do not follow the 1/2/3 scheme.
for col in ['AccessToMotorVehicle_1Yes-2No-3Unknown',
            'BikeParkingAt Work_1Yes2No3Unknown',
            'HH_Children_Dummy',
            'Home_1Rent2Own']:
    df_proc[col] = (df_proc[col] == 1).astype(int)

In [4]:
# Print every candidate explanatory column, i.e. all columns except the "Rank_" responses
print(*(name for name in df_raw.columns if not name.startswith("Rank_")), sep='\n')

ID
sex
age
OwnBikes
NumberOfBikes
HH_Adults_15+
HH_Children_Dummy
NoOf15yrsOlderBikers
BikingForAllReason
BikingForWork
BikingForSchool
BikingForErrndFriends
Num of Recreational-Biker
AccessToMotorVehicle_1Yes-2No-3Unknown
BikeParkingAt Work_1Yes2No3Unknown
How longLivedInNH
Home_1Rent2Own
d3: Home Type
d5: LevelOfEducation
d6: Job Stastus
d8: Income
Time from MajorBikePath_min
Time from Neasrest BikeLane_Min
Neighbourhood ID
Neighbourhood-Area_SQKM
Neighbourhood-bikeLength_KM
Neighbourhood_Population


## Part 1. Binary Probit Model
A dataset: “CyclingPerceptionData-Assignment 1-2024,” is uploaded. 

Use column AB of sheet “Final”, which contains “Rank_Overall-Quality-Of-Biking”.

Please consider this variable as categorical data. 

Please model the ranking over 2 as a binary Probit model. 

Please select the explanatory variables from this data sheet that you think are relevant.

For estimation, you can use any software. The report should include at least the following sections:
* Present the model formulations and justification of independent variables. 
* Selection of the best model. 
* Justification of using the explanatory variables 

The report should be professional and no more than 2 pages (single-spaced with a 1.25-inch margin on all four sides).

**Prepare data**

In [5]:
# Response for Part 1: the overall biking-quality rank, held as a categorical Series
y_probit = df_raw.loc[:, 'Rank_Overall-Quality-Of-Biking'].astype('category')
y_probit

0      3
1      3
2      3
3      4
4      3
      ..
238    5
239    5
240    2
241    3
242    3
Name: Rank_Overall-Quality-Of-Biking, Length: 243, dtype: category
Categories (6, int64): [1, 2, 3, 4, 5, 6]

In [6]:
# Binary probit target: 1 when the rank is above 2, otherwise 0
y_probit_gt2 = y_probit.astype('int').gt(2).astype(int)
y_probit_gt2

0      1
1      1
2      1
3      1
4      1
      ..
238    1
239    1
240    0
241    1
242    1
Name: Rank_Overall-Quality-Of-Biking, Length: 243, dtype: int64

**Select relevant explanatory variables**

See above for the list of all non-rank (i.e. non-response) features provided in the dataset.

In [20]:
# Full set of candidate regressors used for the "everything at once" comparison model.
all_vars = [
    'sex',
    'age',
    'NumberOfBikes',
    'BikingForWork',
    'BikingForErrndFriends',
    'AccessToMotorVehicle_1Yes-2No-3Unknown',
    'Num of Recreational-Biker',
    'Time from MajorBikePath_min',
    'Time from Neasrest BikeLane_Min',
    'Neighbourhood-bikeLength_KM',
]

**Try models**

In [30]:
# Intercept-only (null) probit: the design matrix is a single column of ones.
probit_Nmod = smd.Probit(y_probit_gt2, np.ones(y_probit_gt2.shape))
probit_Nres = probit_Nmod.fit()
# Keep the baseline AIC so candidate models can be compared against it.
# (The former bare `probit_null_aic` line was dropped: a mid-cell expression displays nothing.)
probit_null_aic = probit_Nres.aic
probit_Nres.summary()

Optimization terminated successfully.
         Current function value: 0.508286
         Iterations 5


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,242.0
Method:,MLE,Df Model:,0.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,1.084e-10
Time:,15:15:21,Log-Likelihood:,-123.51
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.8212,0.091,9.017,0.000,0.643,1.000


In [9]:
# Candidate probit fitted on the RAW frame (df_raw), i.e. without the 0/1 recoding in df_proc.
# NOTE(review): the motor-vehicle access column presumably still carries its 1/2/3
# survey codes here and enters the model as a number -- confirm this is intended.
probit_vars = [
    'age',
    'sex',
    'BikingForWork',
    'Num of Recreational-Biker',
    'AccessToMotorVehicle_1Yes-2No-3Unknown',
    'Neighbourhood-bikeLength_KM',
    'Time from MajorBikePath_min',
]

# has_constant='add' always prepends an intercept column to the regressors.
probit_mod = smd.Probit(
    endog=y_probit_gt2,
    exog=add_constant(df_raw.loc[:, probit_vars], has_constant='add'),
)
probit_res = probit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{probit_res.aic:.2f} vs null {probit_null_aic:.2f} (lower is better)")
probit_res.summary()

Optimization terminated successfully.
         Current function value: 0.485867
         Iterations 6
252.13 vs null 249.03 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,235.0
Method:,MLE,Df Model:,7.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.04411
Time:,14:08:13,Log-Likelihood:,-118.07
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,0.1432

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.2650,0.466,2.713,0.007,0.351,2.179
age,-0.0100,0.007,-1.518,0.129,-0.023,0.003
sex,0.0381,0.192,0.198,0.843,-0.339,0.415
BikingForWork,0.3104,0.151,2.049,0.040,0.014,0.607
Num of Recreational-Biker,-0.1080,0.103,-1.050,0.294,-0.310,0.094
AccessToMotorVehicle_1Yes-2No-3Unknown,-0.1147,0.081,-1.410,0.159,-0.274,0.045
Neighbourhood-bikeLength_KM,0.0180,0.017,1.088,0.277,-0.014,0.050
Time from MajorBikePath_min,0.0130,0.012,1.057,0.290,-0.011,0.037


In [10]:
# Same seven regressors as the raw-data candidate, but taken from the recoded frame (df_proc).
probit_vars = [
    'age',
    'sex',
    'BikingForWork',
    'Num of Recreational-Biker',
    'AccessToMotorVehicle_1Yes-2No-3Unknown',
    'Neighbourhood-bikeLength_KM',
    'Time from MajorBikePath_min',
]

# has_constant='add' always prepends an intercept column to the regressors.
probit_mod = smd.Probit(
    endog=y_probit_gt2,
    exog=add_constant(df_proc.loc[:, probit_vars], has_constant='add'),
)
probit_res = probit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{probit_res.aic:.2f} vs null {probit_null_aic:.2f} (lower is better)")
probit_res.summary()

Optimization terminated successfully.
         Current function value: 0.488467
         Iterations 6
253.39 vs null 249.03 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,235.0
Method:,MLE,Df Model:,7.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.03899
Time:,14:08:13,Log-Likelihood:,-118.7
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,0.2104

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.8865,0.401,2.209,0.027,0.100,1.673
age,-0.0093,0.007,-1.404,0.160,-0.022,0.004
sex,0.0351,0.192,0.183,0.855,-0.341,0.411
BikingForWork,0.2975,0.151,1.970,0.049,0.002,0.594
Num of Recreational-Biker,-0.0925,0.102,-0.909,0.364,-0.292,0.107
AccessToMotorVehicle_1Yes-2No-3Unknown,0.1644,0.194,0.849,0.396,-0.215,0.544
Neighbourhood-bikeLength_KM,0.0189,0.017,1.139,0.255,-0.014,0.051
Time from MajorBikePath_min,0.0129,0.012,1.046,0.296,-0.011,0.037


In [11]:
# Candidate with bike ownership, income, children and age, fitted on the RAW frame.
# NOTE(review): because df_raw is used, HH_Children_Dummy keeps its original coding
# here rather than the 0/1 recode held in df_proc.
probit_vars = [
    'BikingForWork',
    'NumberOfBikes',
    'd8: Income',
    'HH_Children_Dummy',
    'age',
]

# has_constant='add' always prepends an intercept column to the regressors.
probit_mod = smd.Probit(
    endog=y_probit_gt2,
    exog=add_constant(df_raw.loc[:, probit_vars], has_constant='add'),
)
probit_res = probit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{probit_res.aic:.2f} vs null {probit_null_aic:.2f} (lower is better)")
probit_res.summary()

Optimization terminated successfully.
         Current function value: 0.484317
         Iterations 5
247.38 vs null 249.03 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,237.0
Method:,MLE,Df Model:,5.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.04716
Time:,14:08:13,Log-Likelihood:,-117.69
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,0.03993

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6681,0.568,2.935,0.003,0.554,2.782
BikingForWork,0.2709,0.155,1.747,0.081,-0.033,0.575
NumberOfBikes,-0.0419,0.092,-0.457,0.648,-0.222,0.138
d8: Income,0.0409,0.048,0.860,0.390,-0.052,0.134
HH_Children_Dummy,-0.4370,0.219,-2.000,0.046,-0.865,-0.009
age,-0.0079,0.007,-1.215,0.225,-0.021,0.005


In [12]:
# Candidate on the recoded frame: income is dropped and neighbourhood bike-lane length added.
probit_vars = [
    'BikingForWork',
    'NumberOfBikes',
    'HH_Children_Dummy',
    'age',
    'Neighbourhood-bikeLength_KM',
]

# has_constant='add' always prepends an intercept column to the regressors.
probit_mod = smd.Probit(
    endog=y_probit_gt2,
    exog=add_constant(df_proc.loc[:, probit_vars], has_constant='add'),
)
probit_res = probit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{probit_res.aic:.2f} vs null {probit_null_aic:.2f} (lower is better)")
probit_res.summary()

Optimization terminated successfully.
         Current function value: 0.482918
         Iterations 6
246.70 vs null 249.03 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,237.0
Method:,MLE,Df Model:,5.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.04991
Time:,14:08:13,Log-Likelihood:,-117.35
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,0.03055

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.7533,0.380,1.982,0.047,0.008,1.498
BikingForWork,0.2857,0.157,1.822,0.068,-0.022,0.593
NumberOfBikes,-0.0258,0.087,-0.296,0.767,-0.196,0.145
HH_Children_Dummy,0.4640,0.219,2.121,0.034,0.035,0.893
age,-0.0071,0.006,-1.100,0.271,-0.020,0.006
Neighbourhood-bikeLength_KM,0.0196,0.017,1.151,0.250,-0.014,0.053


In [13]:
# Variant of the previous candidate with BikingForAllReason in place of BikingForWork.
probit_vars = [
    'BikingForAllReason',
    'NumberOfBikes',
    'HH_Children_Dummy',
    'age',
    'Neighbourhood-bikeLength_KM',
]

# has_constant='add' always prepends an intercept column to the regressors.
probit_mod = smd.Probit(
    endog=y_probit_gt2,
    exog=add_constant(df_proc.loc[:, probit_vars], has_constant='add'),
)
probit_res = probit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{probit_res.aic:.2f} vs null {probit_null_aic:.2f} (lower is better)")
probit_res.summary()

  x = pd.concat(x[::order], 1)


Optimization terminated successfully.
         Current function value: 0.486018
         Iterations 6
248.20 vs null 249.03 (lower is better)


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,237.0
Method:,MLE,Df Model:,5.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.04381
Time:,14:08:13,Log-Likelihood:,-118.1
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,0.05502

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.6064,0.449,1.352,0.176,-0.273,1.486
BikingForAllReason,0.2415,0.177,1.368,0.171,-0.105,0.588
NumberOfBikes,-0.0245,0.090,-0.273,0.785,-0.201,0.151
HH_Children_Dummy,0.4548,0.219,2.077,0.038,0.026,0.884
age,-0.0079,0.006,-1.229,0.219,-0.021,0.005
Neighbourhood-bikeLength_KM,0.0223,0.017,1.286,0.198,-0.012,0.056


In [66]:
# Small candidate: biking to work, children in household, and sex.
# NOTE(review): the stored output for this cell reports Df Model = 2 and lists no
# HH_Children_Dummy coefficient, so it appears to come from a different variable
# list -- re-run the cell to refresh it.
probit_vars = [
    'BikingForWork',
    'HH_Children_Dummy',
    'sex',
]

# has_constant='add' always prepends an intercept column to the regressors.
probit_mod = smd.Probit(
    endog=y_probit_gt2,
    exog=add_constant(df_proc.loc[:, probit_vars], has_constant='add'),
)
probit_res = probit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{probit_res.aic:.2f} vs null {probit_null_aic:.2f} (lower is better)")
probit_res.summary()

Optimization terminated successfully.
         Current function value: 0.499054
         Iterations 5
248.54 vs null 249.03 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,240.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.01816
Time:,16:27:16,Log-Likelihood:,-121.27
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,0.1061

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.6336,0.148,4.290,0.000,0.344,0.923
BikingForWork,0.2960,0.146,2.032,0.042,0.010,0.582
sex,0.0360,0.184,0.195,0.845,-0.325,0.397


**Best model**

In [29]:
# Selected probit specification: biking to work plus the children-in-household indicator.
probit_vars = [
    'BikingForWork',
    'HH_Children_Dummy',
]

# has_constant='add' always prepends an intercept column to the regressors.
probit_mod = smd.Probit(
    endog=y_probit_gt2,
    exog=add_constant(df_proc.loc[:, probit_vars], has_constant='add'),
)
probit_res = probit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{probit_res.aic:.2f} vs null {probit_null_aic:.2f} (lower is better)")
probit_res.summary()

Optimization terminated successfully.
         Current function value: 0.488377
         Iterations 5
243.35 vs null 249.03 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_Overall-Quality-Of-Biking,No. Observations:,243.0
Model:,Probit,Df Residuals:,240.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.03917
Time:,14:59:10,Log-Likelihood:,-118.68
converged:,True,LL-Null:,-123.51
Covariance Type:,nonrobust,LLR p-value:,0.007923

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.4933,0.139,3.557,0.000,0.221,0.765
BikingForWork,0.2962,0.146,2.023,0.043,0.009,0.583
HH_Children_Dummy,0.4451,0.198,2.250,0.024,0.057,0.833


In [31]:
# Log-likelihood of the most recently fitted probit and of its intercept-only counterpart
(probit_res.llf, probit_res.llnull)

(-118.6755338609211, -123.513574015086)

## Part 2. Binary Logit Model:
A dataset: “CyclingPerceptionData-Assignment 1-2024,” is uploaded. 

Use column AB of sheet “Final”, which contains “Rank_On-Street-BikeLanes-Quality”.

Please consider this variable as categorical data. 

Please model the ranking over 1 as a binary Logit model. 

Please select the explanatory variables from this data sheet that you think are relevant.

For estimation, you can use any software. The report should include at least the following sections:
* Present the model formulations and justification of independent variables. 
* Selection of the best model. 
* Justification of using the explanatory variable

The report should be professional and no more than 2 pages (single-spaced with a 1.25-inch margin on all four sides).

**Prepare data**

In [15]:
# Response for Part 2: the on-street bike-lane quality rank, held as a categorical Series
y_logit = df_raw.loc[:, 'Rank_On-Street-BikeLanes-Quality'].astype('category')
y_logit

0      3
1      1
2      4
3      3
4      4
      ..
238    5
239    5
240    2
241    4
242    4
Name: Rank_On-Street-BikeLanes-Quality, Length: 243, dtype: category
Categories (6, int64): [1, 2, 3, 4, 5, 6]

In [16]:
# Binary logit target: 1 when the rank is above 1, otherwise 0
y_logit_gt1 = y_logit.astype(int).gt(1).astype(int)
y_logit_gt1

0      1
1      0
2      1
3      1
4      1
      ..
238    1
239    1
240    1
241    1
242    1
Name: Rank_On-Street-BikeLanes-Quality, Length: 243, dtype: int64

**Select relevant explanatory variables**

See above for the list of all non-rank (i.e. non-response) features provided in the dataset.

**Try models**

In [33]:
# Intercept-only (null) logit: the design matrix is a single column of ones.
logit_Nmod = smd.Logit(y_logit_gt1, np.ones(y_logit_gt1.shape))
logit_Nres = logit_Nmod.fit()
# Keep the baseline AIC so candidate models can be compared against it.
# (The former bare `logit_null_aic` line was dropped: a mid-cell expression displays nothing.)
logit_null_aic = logit_Nres.aic
logit_Nres.summary()

Optimization terminated successfully.
         Current function value: 0.220349
         Iterations 7


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,242.0
Method:,MLE,Df Model:,0.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,6.419e-11
Time:,16:13:59,Log-Likelihood:,-53.545
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.7947,0.275,10.151,0.000,2.255,3.334


In [34]:
# First logit candidate: reuse the two regressors of the selected probit specification.
logit_vars = [
    'BikingForWork',
    'HH_Children_Dummy',
]

# has_constant='add' always prepends an intercept column to the regressors.
logit_mod = smd.Logit(
    endog=y_logit_gt1,
    exog=add_constant(df_proc.loc[:, logit_vars], has_constant='add'),
)
logit_res = logit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{logit_res.aic:.2f} vs null {logit_null_aic:.2f} (lower is better)")
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.210724
         Iterations 8
108.41 vs null 109.09 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,240.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.04368
Time:,16:14:47,Log-Likelihood:,-51.206
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,0.09644

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.2671,0.381,5.949,0.000,1.520,3.014
BikingForWork,1.0542,0.559,1.886,0.059,-0.041,2.150
HH_Children_Dummy,0.1395,0.579,0.241,0.810,-0.996,1.275


In [55]:
# Single-regressor logit: biking to work only.
logit_vars = [
    'BikingForWork',
]

# has_constant='add' always prepends an intercept column to the regressors.
logit_mod = smd.Logit(
    endog=y_logit_gt1,
    exog=add_constant(df_proc.loc[:, logit_vars], has_constant='add'),
)
logit_res = logit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{logit_res.aic:.2f} vs null {logit_null_aic:.2f} (lower is better)")
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.210844
         Iterations 8
106.47 vs null 109.09 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,241.0
Method:,MLE,Df Model:,1.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.04313
Time:,16:20:07,Log-Likelihood:,-51.235
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,0.03162

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.3172,0.324,7.147,0.000,1.682,2.953
BikingForWork,1.0613,0.559,1.898,0.058,-0.034,2.157


In [68]:
# Single-regressor logit: sex only.
logit_vars = [
    'sex',
]

# has_constant='add' always prepends an intercept column to the regressors.
logit_mod = smd.Logit(
    endog=y_logit_gt1,
    exog=add_constant(df_proc.loc[:, logit_vars], has_constant='add'),
)
logit_res = logit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{logit_res.aic:.2f} vs null {logit_null_aic:.2f} (lower is better)")
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.215683
         Iterations 7
108.82 vs null 109.09 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,241.0
Method:,MLE,Df Model:,1.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.02118
Time:,16:31:12,Log-Likelihood:,-52.411
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,0.1321

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.4596,0.329,7.465,0.000,1.814,3.105
sex,0.8726,0.606,1.439,0.150,-0.316,2.061


In [64]:
# Three-regressor logit: biking to work, sex, and neighbourhood bike-lane length.
logit_vars = [
    'BikingForWork',
    'sex',
    'Neighbourhood-bikeLength_KM',
]

# has_constant='add' always prepends an intercept column to the regressors.
logit_mod = smd.Logit(
    endog=y_logit_gt1,
    exog=add_constant(df_proc.loc[:, logit_vars], has_constant='add'),
)
logit_res = logit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{logit_res.aic:.2f} vs null {logit_null_aic:.2f} (lower is better)")
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.205398
         Iterations 8
107.82 vs null 109.09 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,239.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.06785
Time:,16:22:50,Log-Likelihood:,-49.912
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,0.06388

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.7323,0.516,3.354,0.001,0.720,2.745
BikingForWork,1.1096,0.574,1.932,0.053,-0.016,2.235
sex,0.8845,0.615,1.438,0.150,-0.321,2.090
Neighbourhood-bikeLength_KM,0.0356,0.053,0.676,0.499,-0.068,0.139


In [51]:
# Logit on the full `all_vars` list (logit_vars is intentionally not reassigned here).
# has_constant='add' always prepends an intercept column to the regressors.
logit_mod = smd.Logit(
    endog=y_logit_gt1,
    exog=add_constant(df_proc.loc[:, all_vars], has_constant='add'),
)
logit_res = logit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{logit_res.aic:.2f} vs null {logit_null_aic:.2f} (lower is better)")
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.200298
         Iterations 8
119.34 vs null 109.09 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,232.0
Method:,MLE,Df Model:,10.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.091
Time:,16:18:56,Log-Likelihood:,-48.672
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,0.4632

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.2018,1.206,1.826,0.068,-0.162,4.565
sex,0.9526,0.644,1.479,0.139,-0.309,2.215
age,-0.0144,0.019,-0.763,0.445,-0.051,0.023
NumberOfBikes,0.0114,0.292,0.039,0.969,-0.561,0.583
BikingForWork,1.1225,0.598,1.877,0.060,-0.049,2.295
BikingForErrndFriends,0.0303,0.079,0.383,0.702,-0.125,0.186
AccessToMotorVehicle_1Yes-2No-3Unknown,0.2757,0.632,0.436,0.663,-0.963,1.514
Num of Recreational-Biker,-0.2868,0.337,-0.851,0.395,-0.947,0.374
Time from MajorBikePath_min,0.0464,0.051,0.910,0.363,-0.054,0.146


In [54]:
# Logit with time to the nearest bike lane, number of recreational bikers, and biking to work.
logit_vars = [
    'Time from Neasrest BikeLane_Min',
    'Num of Recreational-Biker',
    'BikingForWork',
]

# has_constant='add' always prepends an intercept column to the regressors.
logit_mod = smd.Logit(
    endog=y_logit_gt1,
    exog=add_constant(df_proc.loc[:, logit_vars], has_constant='add'),
)
logit_res = logit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{logit_res.aic:.2f} vs null {logit_null_aic:.2f} (lower is better)")
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.209753
         Iterations 8
109.94 vs null 109.09 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,239.0
Method:,MLE,Df Model:,3.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.04809
Time:,16:19:53,Log-Likelihood:,-50.97
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,0.1612

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.4344,0.634,3.840,0.000,1.192,3.677
Time from Neasrest BikeLane_Min,0.0122,0.025,0.489,0.625,-0.037,0.061
Num of Recreational-Biker,-0.1415,0.280,-0.506,0.613,-0.690,0.406
BikingForWork,1.0794,0.557,1.940,0.052,-0.011,2.170


**Best model**

In [57]:
# Selected logit specification: biking to work plus sex.
logit_vars = [
    'BikingForWork',
    'sex',
]

# has_constant='add' always prepends an intercept column to the regressors.
logit_mod = smd.Logit(
    endog=y_logit_gt1,
    exog=add_constant(df_proc.loc[:, logit_vars], has_constant='add'),
)
logit_res = logit_mod.fit()

# Report this model's AIC next to the intercept-only baseline.
print(f"{logit_res.aic:.2f} vs null {logit_null_aic:.2f} (lower is better)")
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.206443
         Iterations 8
106.33 vs null 109.09 (lower is better)


  x = pd.concat(x[::order], 1)


0,1,2,3
Dep. Variable:,Rank_On-Street-BikeLanes-Quality,No. Observations:,243.0
Model:,Logit,Df Residuals:,240.0
Method:,MLE,Df Model:,2.0
Date:,"Sat, 24 Feb 2024",Pseudo R-squ.:,0.06311
Time:,16:20:34,Log-Likelihood:,-50.166
converged:,True,LL-Null:,-53.545
Covariance Type:,nonrobust,LLR p-value:,0.03408

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.9839,0.378,5.249,0.000,1.243,2.725
BikingForWork,1.0844,0.572,1.895,0.058,-0.037,2.206
sex,0.8559,0.611,1.400,0.161,-0.342,2.054
