In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP

# Toggle: set to True to add "week" as an additional categorical fixed effect.
week_bool = False

# Categorical predictors, listed in the order their dummy columns will appear
# in the design matrix ("week" sits between "day" and "site_id" when enabled).
factor_columns = ['hour', 'day']
if week_bool:
    factor_columns.append('week')
factor_columns += ['site_id', 'year']

# Work on a copy and keep every row, zero counts included (no > 0 filter).
filtered_data = hourly_averages.copy()

# Response (endogenous) variable, cast to integer.
# NOTE(review): the source frame is called hourly_averages, so these values may
# be fractional; astype(int) would drop the fractional part -- confirm intended.
endog = filtered_data['people_counts'].astype(int)

# Flag each predictor as categorical before it is expanded into indicators.
for column in factor_columns:
    filtered_data[column] = filtered_data[column].astype('category')

# Design matrix: one-hot encode the factors (drop_first=True omits one level
# per factor as the reference), prepend an intercept column, and cast
# everything to float.
exog_fixed = sm.add_constant(
    pd.get_dummies(filtered_data[factor_columns], drop_first=True)
).astype(float)

# Zero-Inflated Negative Binomial model. The same design matrix drives both the
# count part and the logit zero-inflation part.
zinb_model = ZeroInflatedNegativeBinomialP(
    endog,
    exog_fixed,
    exog_infl=exog_fixed,
    inflation='logit',
)

# maxiter caps the number of optimizer iterations; disp=1 prints the
# convergence messages while fitting.
zinb_result = zinb_model.fit(maxiter=100, disp=1)

# Report the fitted coefficients.
print(zinb_result.summary())


In [None]:
# # Define the counts column (e.g., 'car_counts') to model
# counts_column = 'people_counts'

# # Keep only rows with a positive count (no explicit log transform is applied below; this filter simply drops the zero-count rows)
# model_fixed_object_data = fixed_object_data[fixed_object_data[counts_column] > 0].copy()

# # Ensure all necessary data types are compatible
# model_fixed_object_data['hour'] = model_fixed_object_data['hour'].astype(int)
# model_fixed_object_data['day'] = model_fixed_object_data['day'].astype(int)
# model_fixed_object_data['week'] = model_fixed_object_data['week'].astype(int)
# model_fixed_object_data['year'] = model_fixed_object_data['year'].astype(int)
# model_fixed_object_data['site_id'] = model_fixed_object_data['site_id'].astype(str)

# # Construct the formula for the mixed effects model
# fixed_effect_formula = '0 + year'
# random_effects = {
#     "hour": '0 + C(hour)',
#     "day": '0 + C(day)',
#     "week": '0 + C(week)',
#     "site_id": '0 + C(site_id)'
# }

# # Prepare the fixed effects data
# X = dmatrix(fixed_effect_formula, data=model_fixed_object_data, return_type='dataframe')

# # Prepare the random effects data
# exog_vc_parts = [dmatrix(val, data=model_fixed_object_data, return_type='dataframe') for val in random_effects.values()]
# exog_vc = np.hstack(exog_vc_parts)

# # Construct ident array matching the number of columns in exog_vc
# ident = np.concatenate([np.repeat(i, exog_vc_parts[i].shape[1]) for i in range(len(exog_vc_parts))])

# # Verify ident and exog_vc lengths
# print("Length of ident:", len(ident))
# print("Number of columns in exog_vc:", exog_vc.shape[1])

# # Fit the Poisson GLMM with Bayesian estimation using PoissonBayesMixedGLM
# model = PoissonBayesMixedGLM(endog=model_fixed_object_data[counts_column].values, exog=X.values, exog_vc=exog_vc, ident=ident)
# result = model.fit_vb()

# # Print the summary of the model
# print(result.summary())