In [7]:
# Eventuellement faire !pip install -r ../requirements.txt ou !pip install biogeme

**Model 0**


In [None]:
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from biogeme.expressions import Beta, Variable, log, exp
from biogeme.results_processing import get_pandas_estimated_parameters, html_output
from scipy.stats import norm

In [9]:
DATA_FOLDER = 'data/'

In [10]:
# Load the .dat file into a pandas DataFrame
data = pd.read_csv(DATA_FOLDER + 'lpmc09.dat', sep='\t')

In [11]:
data.head()

Unnamed: 0,trip_id,household_id,person_n,trip_n,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_ccharge,driving_traffic_percent
0,13,1,1,1,4,3,1,5,0.0,1,...,0.241389,0.0,0.122222,0.0,0,0.132222,0.0,0.5,0.0,0.065126
1,43,10,0,0,3,3,6,1,1.0,1,...,0.072778,0.0,0.344722,0.120556,1,0.167778,3.0,0.44,0.0,0.145695
2,46,12,0,0,4,3,1,5,0.0,1,...,0.136389,0.0,0.070278,0.0,0,0.072222,0.0,0.24,0.0,0.107692
3,53,12,1,3,4,3,2,1,1.0,1,...,0.0825,0.0,0.061944,0.0,0,0.0625,1.5,0.17,0.0,0.124444
4,65,13,1,3,3,5,1,5,0.0,1,...,0.050833,0.216667,0.590556,0.237778,2,0.863889,0.0,2.6,0.0,0.675884


# **Part 1** - Model 0 [1.5 points]

#### Develop a model specification that includes alternative specific constant, cost and travel time for each alternative. Cost and travel time are associated with generic parameters. Present both the specification (i.e., the utility functions) and the estimation results (parameter values, t-tests or p-values, null and final log-likelihoods). [1 point]

In [12]:
# Create the Biogeme database
database = db.Database("lpmc09", data)

# Define the variables
travel_mode = Variable('travel_mode')
dur_walking = Variable('dur_walking')
dur_cycling = Variable('dur_cycling')
dur_pt_bus = Variable('dur_pt_bus')
dur_pt_access = Variable('dur_pt_access')
dur_pt_int = Variable('dur_pt_int')
dur_driving = Variable('dur_driving')
cost_transit = Variable('cost_transit')
cost_driving_fuel = Variable('cost_driving_fuel')
cost_driving_ccharge = Variable('cost_driving_ccharge')

In [13]:
# Create new variables
dur_pt_tot = dur_pt_bus + dur_pt_access + dur_pt_int
cost_drive = cost_driving_fuel + cost_driving_ccharge

In [14]:
# Define the ASC to be estimated
asc_pt = Beta('asc_pt', 0, None, None, 0)
asc_cycling = Beta('asc_cycling', 0, None, None, 0)
asc_driving = Beta('asc_driving', 0, None, None, 0)

# Define the Betas to be estimated
beta_cost = Beta('beta_cost', 0, None, None, 0)
beta_time = Beta('beta_time', 0, None, None, 0)

In [15]:
# Define the utility functions
v_walking = dur_walking * beta_time
v_cycling = asc_cycling + dur_cycling * beta_time
v_pt = asc_pt + dur_pt_tot * beta_time + cost_transit * beta_cost
v_drive = asc_driving + dur_driving * beta_time + cost_drive * beta_cost

In [16]:
# Define the association between alternatives and utility functions
V = {1: v_walking,
     2: v_cycling,
     3: v_pt,
     4: v_drive}
logprob = models.loglogit(V, None, travel_mode)

In [None]:
# Initialisation of the Biogeme object
model_0 = bio.BIOGEME(database, logprob)
model_0.model_name = 'model_0'

In [18]:
results_model_0 = model_0.estimate()
print(results_model_0.print_general_statistics())

Number of estimated parameters             5
Sample size                                5000
Excluded observations                      0
Init log likelihood                        -4552.633
Final log likelihood                       -4552.633
Likelihood ratio test for the init. model  -0
Rho-square for the init. model             0
Rho-square-bar for the init. model         -0.0011
Akaike Information Criterion               9115.265
Bayesian Information Criterion             9147.851
Final gradient norm                        1.8474E-05
Bootstrapping time                         None


In [19]:
get_pandas_estimated_parameters(estimation_results=results_model_0)

Unnamed: 0,Name,Value,Robust std err.,Robust t-stat.,Robust p-value
0,beta_time,-4.374888,0.164549,-26.587163,0.0
1,asc_cycling,-3.351221,0.099973,-33.521236,0.0
2,asc_pt,-0.641355,0.058471,-10.968706,0.0
3,beta_cost,-0.137783,0.013934,-9.888477,0.0
4,asc_driving,-0.735068,0.067722,-10.854174,0.0


In [20]:
print(beta_cost)

Beta('beta_cost', -0.1377827502054028, None, None, 0)


#### Comment on the estimation results (statistical significance and sign of all parameters). [0.5 point]


#### **Overall Model Fit**
- **Log-likelihood**: The initial log-likelihood is **-6931.472**, and the final log-likelihood is **-4552.633**. The improvement suggests the model fits the data better after estimation.
- **Likelihood Ratio Test**: The value is **4757.678**, which is high, indicating a significant improvement in model fit compared to the null model (no predictors).
- **Rho-square (Pseudo R²)**: Both the **Rho-square for the init. model (0.343)** and the **Rho-square-bar for the init. model (0.342)** suggest that about 34% of the variation in the dependent variable is explained by the model, which is a moderate level of explanatory power.
- **AIC and BIC**: The **Akaike Information Criterion (9115.265)** and **Bayesian Information Criterion (9147.851)** are provided for model comparison. Lower values indicate better fit, but without comparison to other models, their absolute values are less interpretable.

#### **Interpretation of Parameter Signs**
- **beta_time (-4.37)**: The negative sign indicates that as the time variable increases, the utility decreases. This is intuitive: people generally prefer options that take less time.
- **asc_cycling (-3.35)**: The negative sign for the alternative-specific constant (ASC) for cycling suggests that, all else being equal, individuals have a lower inherent preference for cycling compared to the base alternative.
- **asc_pt (-0.64)**: The negative ASC for public transport (PT) indicates a lower inherent preference for public transport compared to the base alternative.
- **beta_cost (-0.14)**: The negative sign for cost means that as the cost increases, the utility decreases. This is expected, as people prefer cheaper options.
- **asc_driving (-0.74)**: The negative ASC for driving suggests a lower inherent preference for driving compared to the base alternative.

#### **Statistical Significance**
- **Robust p-values**: All parameters have **p-values of 0.0**, which means they are **statistically significant** at any conventional level.
- **Robust t-statistics**: All t-statistics are far from zero (absolute values much greater than 2), further confirming the statistical significance of each parameter.

### **What is modified for every model**
- Model 1: Temps en alternate specific
- Model 2: Ajout de busscale & female
- Model 3: Log sur le prix --> log($β_{cost}$)

# **Part 2** - Model 1 [3 points]

# **Part 3** - Model 2 [4 points]

# **Part 4** - Model 3 [3 points]

#### Using $Model_{pref}$ as the base model, include an appropriate non-linear transformation of one of the variables. Present both the specification and the estimation results (as defined previously). [1 point]


In [21]:
data["cost_transit"].isna().sum()

np.int64(0)

In [22]:
# Definition of the prefferenced model
model_pref = model_0 # Basic prefferenced model before creation of model 1 & 2

In [23]:
# We have decided to make the cost variable non-linear by applying a logarithm to it in order to capture diminishing sensitivity to cost increases.
# This means that as the cost increases, the impact on the utility decreases at a decreasing rate.
# This transformation is appropriate when we believe that individuals are more sensitive to changes in cost at lower
# levels of cost compared to higher levels.

In [None]:
# Define variables

travel_mode = Variable('travel_mode')
dur_walking = Variable('dur_walking')
dur_cycling = Variable('dur_cycling')
dur_pt_bus = Variable('dur_pt_bus')
dur_pt_access = Variable('dur_pt_access')
dur_pt_int = Variable('dur_pt_int')
dur_driving = Variable('dur_driving')
cost_transit = Variable('cost_transit')
cost_driving_fuel = Variable('cost_driving_fuel')
cost_driving_ccharge = Variable('cost_driving_ccharge')

# Create new variables
dur_pt_tot = dur_pt_bus + dur_pt_access + dur_pt_int
cost_drive = cost_driving_fuel + cost_driving_ccharge

In [25]:
# Define the ASC to be estimated
asc_pt = Beta('asc_pt', 0, None, None, 0)
asc_cycling = Beta('asc_cycling', 0, None, None, 0)
asc_driving = Beta('asc_driving', 0, None, None, 0)

# Define the Betas to be estimated
beta_cost = Beta('beta_cost', 0, None, None, 0)
beta_time = Beta('beta_time', 0, None, None, 0)

In [26]:
# Box-Cox transformation of costs
lambda_boxcox = Beta('lambda_boxcox', 1.01, -10, 10, 0) # Can't put 1 as it creates a log and it is impossible due to values equal to 0 in the cost variable
boxcox_cost_transit = models.boxcox(cost_transit, lambda_boxcox)
boxcox_cost_drive = models.boxcox(cost_drive, lambda_boxcox)

In [27]:
# Define the utility functions
v_walking = dur_walking * beta_time
v_cycling = asc_cycling + dur_cycling * beta_time
v_pt = asc_pt + dur_pt_tot * beta_time + boxcox_cost_transit * beta_cost
v_drive = asc_driving + dur_driving * beta_time + boxcox_cost_drive * beta_cost

# Define the association between alternatives and utility functions
V = {1: v_walking,
     2: v_cycling,
     3: v_pt,
     4: v_drive}

# Define the logit model
logprob = models.loglogit(V, None, travel_mode)

# Initialisation of the Biogeme object
model_3 = bio.BIOGEME(database, logprob)
model_3.model_name = 'model_3'

In [28]:
results_model_3 = model_3.estimate()
print(results_model_3.print_general_statistics())

Number of estimated parameters             6
Sample size                                5000
Excluded observations                      0
Init log likelihood                        -4553.795
Final log likelihood                       -4553.795
Likelihood ratio test for the init. model  -0
Rho-square for the init. model             0
Rho-square-bar for the init. model         -0.00132
Akaike Information Criterion               9119.59
Bayesian Information Criterion             9158.693
Final gradient norm                        5.0376E-04
Bootstrapping time                         None


In [29]:
get_pandas_estimated_parameters(estimation_results=results_model_3)

Unnamed: 0,Name,Value,Robust std err.,Robust t-stat.,Robust p-value
0,beta_time,-4.36786,0.170047,-25.686152,0.0
1,asc_cycling,-3.363543,0.104574,-32.16437,0.0
2,asc_pt,-0.724318,0.057494,-12.598104,0.0
3,lambda_boxcox,0.782756,0.238285,3.284954,0.00102
4,beta_cost,-0.194631,0.066427,-2.92998,0.00339
5,asc_driving,-0.913519,0.097858,-9.335151,0.0


#### State the underlying assumption of the non-linear specification defined in this situation. [0.5 point]

We have decided to make the cost variable non-linear by applying a logarithm to it in order to capture diminishing sensitivity to cost increases. This means that as the cost increases, the impact on the utility decreases at a decreasing rate. This transformation is appropriate when we believe that individuals are more sensitive to changes in cost at lower levels of cost compared to higher levels.

#### Comment on the estimation results (as defined previously). [0.5 point]

### **Overall Model Fit**
- **Log-likelihood**: The initial and final log-likelihood are both **-4553.795**, which is unusual because the final log-likelihood should improve (become less negative) after estimation. This suggests a potential issue with the estimation process or the model specification.
- **Likelihood Ratio Test**: The value is **0**, which is unexpected and indicates no improvement over the null model. This is likely due to the identical init and final log-likelihoods.
- **Rho-square**: Both values are **0**, indicating that the model does not explain any additional variation compared to the null model. This is unusual and suggests a problem with the model or data.
- **AIC and BIC**: The values are **9119.59** and **9158.693**, respectively. These are not directly interpretable without comparison to other models, but given the other metrics, they suggest a poor fit.
- **Final Gradient Norm**: The value is **5.0376E-04**, which is close to zero, indicating convergence.


### **Interpretation of Parameter Signs**
- **beta_time (-4.37)**: The negative sign indicates that as the time variable increases, the utility decreases. This is intuitive and expected: people prefer options that take less time.
- **asc_cycling (-3.36)**: The negative alternative-specific constant (ASC) for cycling suggests a lower inherent preference for cycling compared to the base alternative.
- **asc_pt (-0.72)**: The negative ASC for public transport (PT) indicates a lower inherent preference for public transport compared to the base alternative.
- **lambda_boxcox (0.78)**: This is the estimated lambda for the Box-Cox transformation of the cost variable. A value between 0 and 1 suggests a transformation between a log and a linear relationship.
- **beta_cost (-0.19)**: The negative sign for the cost parameter means that as the cost increases, the utility decreases. This is expected, as people prefer cheaper options.
- **asc_driving (-0.91)**: The negative ASC for driving suggests a lower inherent preference for driving compared to the base alternative.


### **Statistical Significance**
- **Robust p-values**:
  - All parameters except `lambda_boxcox` and `beta_cost` have **p-values of 0.0**, indicating they are **statistically significant**
  - `lambda_boxcox` has a p-value of **0.001**, which is still statistically significant.
  - `beta_cost` has a p-value of **0.003**, which is also statistically significant.
- **Robust t-statistics**:
  - All t-statistics are far from zero (absolute values much greater than 2), confirming the statistical significance of each parameter.

#### Compare Modelpref and Model 3 with an appropriate statistical test. Explain your choice of test. State the null hypothesis and the result of the test. Denote the preferred model as Modelpref. [1 point]

As model 3 and $Model_{pref}$ are not nested, we cannot use a regular log-likelihood test, we need to do another test such as a Cox test

In [None]:
# Define base model

base_model = model_0 # Basic model before creation of model 1 & 2
results_pref = results_model_0

In [68]:
# Import the parameters estimated in both models
parameters_model_pref = get_pandas_estimated_parameters(results_pref)
parameters_model_3 = get_pandas_estimated_parameters(results_model_3)

In [103]:
# Extract estimated betas from both models
def get_betas_dict(results):
    params = results.getEstimatedParameters()
    return dict(zip(params.index, params['Value']))


def compute_chosen_probs(df, betas, boxcox=False, lambda_name='lambda_boxcox'):
    """
    df : DataFrame contenant les données brutes
    betas : dict contenant les valeurs des coefficients (Name -> Value)
    boxcox : booléen pour indiquer si on applique la transformation Box–Cox
    lambda_name : nom du paramètre lambda dans le dictionnaire betas
    """
    # Extraction des coefficients
    b_time = betas['beta_time']
    b_cost = betas['beta_cost']
    asc_cyc = betas['asc_cycling']
    asc_pt = betas['asc_pt']
    asc_drv = betas['asc_driving']
    lam = betas[lambda_name] if boxcox else None
    
    dur_pt_tot = df['dur_pt_bus'] + df['dur_pt_access'] + df['dur_pt_int']
    cost_drive = df['cost_driving_fuel'] + df['cost_driving_ccharge']

    # Cost transform
    if boxcox:
        eps = 1e-6
        x_pt = np.maximum(df['cost_transit'], eps)
        x_dr = np.maximum(cost_drive, eps)
        cost_pt = (x_pt**lam - 1)/lam if lam != 0 else np.log(x_pt)
        cost_dr = (x_dr**lam - 1)/lam if lam != 0 else np.log(x_dr)
    else:
        cost_pt = df['cost_transit']
        cost_dr = cost_drive

    # Utilities
    V = np.column_stack([
        b_time * df['dur_walking'],
        asc_cyc + b_time * df['dur_cycling'],
        asc_pt + b_time * dur_pt_tot + b_cost * cost_pt,
        asc_drv + b_time * df['dur_driving'] + b_cost * cost_dr
    ])
    # Logit probabilities
    vmax = V.max(axis=1, keepdims=True)
    expV = np.exp(V - vmax)
    P = expV / expV.sum(axis=1, keepdims=True)

    # Probability of chosen alternative
    chosen = df['travel_mode'].astype(int).values - 1
    p_chosen = np.clip(P[np.arange(len(df)), chosen], 1e-300, 1.0)
    return p_chosen

In [104]:
# Compute chosen probabilities for both models
betas_pref = get_pandas_estimated_parameters(results_pref)
betas_3 = get_pandas_estimated_parameters(results_model_3)

betas_model_pref = dict(zip(betas_pref['Name'], betas_pref['Value']))
betas_model_3 = dict(zip(betas_3['Name'], betas_3['Value']))

p1 = compute_chosen_probs(data, betas_model_pref, boxcox=False)
p2 = compute_chosen_probs(data, betas_model_3, boxcox=True)

# Compute log probabilities and differences
logp1 = np.log(p1)
logp2 = np.log(p2)
r = logp1 - logp2

# Cox test statistic
N = len(r)
r_bar = np.mean(r)
s_r = np.std(r, ddof=1)
T = np.sqrt(N) * r_bar / s_r
p_value = 2 * (1 - norm.cdf(abs(T)))

print("=== Cox Test ===")
print(f"Mean diff in log-likelihoods per obs: {r_bar:.6f}")
print(f"Std dev: {s_r:.6f}")
print(f"T statistic: {T:.3f}")
print(f"p-value: {p_value:.5f}")

if p_value < 0.05:
    if r_bar > 0:
        print("→ Reject H0: Model Pref fits significantly better than Model 3.")
    else:
        print("→ Reject H0: Model 3 fits significantly better than Model Pref.")
else:
    print("→ Fail to reject H0: No significant difference in fit.")


=== Cox Test ===
Mean diff in log-likelihoods per obs: 0.001041
Std dev: 0.050517
T statistic: 1.457
p-value: 0.14502
→ Fail to reject H0: No significant difference in fit.
