In [None]:
!pip install statsmodels patsy

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.bayes_mixed_glm import PoissonBayesMixedGLM
from tqdm import tqdm
from patsy import dmatrices
from patsy import dmatrix
from statsmodels.discrete.count_model import ZeroInflatedPoisson

In [2]:
object_data = pd.read_csv('./2024_Jul_ob_count.csv')

In [3]:
# Create array of category names as they appear in the detections data. See paper for details of each category.
categories = ['car', 'person', 'trotro', 'stall', 'truck', 'stove', 'motorcycle', 'vendor', 'lorry', 'umbrella', 'bus', 'trash', 'taxi', 'van', 'debris', 'loudspeaker', 'bowl', 'food', 'animal', 'bicycle']

# Column names in the data frame for the number of counts of each category type in an image.
count_cols = [cat+'_counts' for cat in categories]

super_count_cols = ['people'+'_counts', 'small_vehicles'+'_counts', 'two_wheelers'+'_counts', 'large_vehicles'+'_counts', 'refuse'+'_counts', 'market'+'_counts', 'animal'+'_counts']

all_count_cols = count_cols + super_count_cols

vehicle_categories = ['car', 'trotro', 'truck', 'motorcycle', 'lorry', 'bus', 'taxi', 'van', 'bicycle']

# Define super categories
super_categories = {
    'people': ['person', 'vendor'],
    'small_vehicles': ['car', 'taxi', 'truck'],
    'two_wheelers': ['bicycle', 'motorcycle'],
    'large_vehicles': ['trotro', 'van', 'lorry', 'bus'],
    'refuse': ['trash', 'debris'],
    'market': ['umbrella', 'stall', 'bowl', 'food'],
    'animal': ['animal']
}

In [4]:
# Ensure datetime is in datetime format
object_data['datetime'] = pd.to_datetime(object_data['datetime_rectified'], format='%Y-%m-%d %H:%M:%S')

# Create additional time-related columns
object_data['hour'] = object_data['datetime'].dt.hour
object_data['day'] = object_data['datetime'].dt.dayofweek + 1  # +1 to match R's 1-indexing
object_data['week'] = object_data['datetime'].dt.isocalendar().week
object_data['year'] = object_data['datetime'].dt.year

# Split 'site_id_cam_angle' into 'site_id' and 'camera' columns
object_data[['site_id', 'camera']] = object_data['site_id_cam_angle'].str.split('_', expand=True)

# Fill missing values in 'camera' with 'single'
object_data['camera'].fillna('single', inplace=True)

# Filter data between specified dates
start_date = pd.Timestamp('2019-04-01')
end_date = pd.Timestamp('2024-04-01')
fixed_object_data = object_data[(object_data['datetime'] >= start_date) & (object_data['datetime'] <= end_date)]
fixed_object_data = fixed_object_data[fixed_object_data['view'] == 'clear']
# Display the first few rows of the new dataframe to verify
fixed_object_data.head()


Unnamed: 0,directory_name_rectified,site_id_cam_angle,view,image_name,datetime_rectified,date_rectified,animal_counts,bicycle_counts,bowl_counts,bus_counts,...,vendor_counts,directory_name_original,datetime_original,datetime,hour,day,week,year,site_id,camera
0,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3070.JPG,2024-03-01 08:32:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:32:02,2024-03-01 08:32:02,8,5,9,2024,AD,right
1,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3071.JPG,2024-03-01 08:37:02,2024-03-01,0,1,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:37:02,2024-03-01 08:37:02,8,5,9,2024,AD,right
2,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3072.JPG,2024-03-01 08:42:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:42:02,2024-03-01 08:42:02,8,5,9,2024,AD,right
3,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3073.JPG,2024-03-01 08:47:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:47:02,2024-03-01 08:47:02,8,5,9,2024,AD,right
4,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3074.JPG,2024-03-01 08:52:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:52:02,2024-03-01 08:52:02,8,5,9,2024,AD,right


In [5]:
# Sum counts for each super category
for super_cat, categories in super_categories.items():
    # Create a column for each supercategory by summing its categories
    fixed_object_data[super_cat + '_counts'] = fixed_object_data[[cat + '_counts' for cat in categories]].sum(axis=1)

In [6]:
# Step 7: Round the 'datetime' to the nearest hour
fixed_object_data['datetime_hour'] = fixed_object_data['datetime'].dt.round('H')

# Step 8: Sum the counts within each hour for each camera at each site
hourly_counts = fixed_object_data.groupby(['site_id', 'camera', 'datetime_hour'])[count_cols].sum().reset_index()

# Step 9: Group by the rounded 'datetime', 'date', and 'site_id', then calculate the mean for each object category
hourly_averages = hourly_counts.groupby(['site_id', 'datetime_hour'])[count_cols].mean().reset_index()

# Step 10: Add the date column from the rounded 'datetime_hour'
hourly_averages['date'] = hourly_averages['datetime_hour'].dt.date

# Create additional time-related columns
hourly_averages['hour'] = hourly_averages['datetime_hour'].dt.hour
hourly_averages['day'] = hourly_averages['datetime_hour'].dt.dayofweek + 1  # +1 to match R's 1-indexing
hourly_averages['week'] = hourly_averages['datetime_hour'].dt.isocalendar().week
hourly_averages['year'] = hourly_averages['datetime_hour'].dt.year


# Step 11: Reorder the columns to match the requested format
final_columns = ['datetime_hour', 'date', 'site_id', 'hour', 'day', 'week', 'year'] + count_cols
hourly_averages = hourly_averages[final_columns]

# Step 12: Rename 'datetime_hour' to 'datetime' to match the final requested column name
hourly_averages.rename(columns={'datetime_hour': 'datetime'}, inplace=True)

# Display the first few rows of the new dataframe to verify
hourly_averages.head()

Unnamed: 0,datetime,date,site_id,hour,day,week,year,car_counts,person_counts,trotro_counts,...,bus_counts,trash_counts,taxi_counts,van_counts,debris_counts,loudspeaker_counts,bowl_counts,food_counts,animal_counts,bicycle_counts
0,2019-04-12 10:00:00,2019-04-12,AD,10,5,15,2019,9.0,32.0,3.0,...,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2019-04-12 11:00:00,2019-04-12,AD,11,5,15,2019,228.0,219.0,30.5,...,0.5,0.0,45.5,4.0,0.0,0.0,0.5,0.0,0.0,0.5
2,2019-04-12 12:00:00,2019-04-12,AD,12,5,15,2019,314.0,230.5,49.5,...,2.0,0.5,59.5,4.0,0.5,0.0,0.5,0.0,0.0,2.5
3,2019-04-12 13:00:00,2019-04-12,AD,13,5,15,2019,371.5,192.5,43.5,...,2.0,1.5,81.0,8.5,0.5,0.0,1.0,0.0,0.0,0.5
4,2019-04-12 14:00:00,2019-04-12,AD,14,5,15,2019,351.5,205.0,60.5,...,1.0,0.0,62.0,3.5,0.0,0.0,0.5,0.0,0.0,1.0


In [None]:
# Define the counts column (e.g., 'car_counts') to model
counts_column = 'people_counts'

# Ensure the count data is greater than zero before log transformation
model_fixed_object_data = fixed_object_data[fixed_object_data[counts_column] > 0].copy()

# Ensure all necessary data types are compatible
model_fixed_object_data['hour'] = model_fixed_object_data['hour'].astype(int)
model_fixed_object_data['day'] = model_fixed_object_data['day'].astype(int)
model_fixed_object_data['week'] = model_fixed_object_data['week'].astype(int)
model_fixed_object_data['year'] = model_fixed_object_data['year'].astype(int)
model_fixed_object_data['site_id'] = model_fixed_object_data['site_id'].astype(str)

# Construct the formula for the mixed effects model
fixed_effect_formula = '0 + year'
random_effects = {
    "hour": '0 + C(hour)',
    "day": '0 + C(day)',
    "week": '0 + C(week)',
    "site_id": '0 + C(site_id)'
}

# Prepare the fixed effects data
X = dmatrix(fixed_effect_formula, data=model_fixed_object_data, return_type='dataframe')

# Prepare the random effects data
exog_vc_parts = [dmatrix(val, data=model_fixed_object_data, return_type='dataframe') for val in random_effects.values()]
exog_vc = np.hstack(exog_vc_parts)

# Construct ident array matching the number of columns in exog_vc
ident = np.concatenate([np.repeat(i, exog_vc_parts[i].shape[1]) for i in range(len(exog_vc_parts))])

# Verify ident and exog_vc lengths
print("Length of ident:", len(ident))
print("Number of columns in exog_vc:", exog_vc.shape[1])

# Fit the Poisson GLMM with Bayesian estimation using PoissonBayesMixedGLM
model = PoissonBayesMixedGLM(endog=model_fixed_object_data[counts_column].values, exog=X.values, exog_vc=exog_vc, ident=ident)
result = model.fit_vb()

# Print the summary of the model
print(result.summary())

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrix
import numpy as np
# Define the counts column (e.g., 'car_counts') to model
counts_column = 'person_counts'

# Ensure all necessary data types are compatible and treat them as categorical
model_fixed_object_data = hourly_averages[hourly_averages[counts_column] > 0].copy()
model_fixed_object_data['hour'] = model_fixed_object_data['hour'].astype('category')
model_fixed_object_data['day'] = model_fixed_object_data['day'].astype('category')
model_fixed_object_data['week'] = model_fixed_object_data['week'].astype('category')
model_fixed_object_data['year'] = model_fixed_object_data['year'].astype('category')
model_fixed_object_data['site_id'] = model_fixed_object_data['site_id'].astype('category')

# Fit the fixed effects model using GLM with the negative binomial family
glm_formula = f'{counts_column} ~ C(year)'
glm_model = smf.glm(formula=glm_formula, data=model_fixed_object_data, family=sm.families.NegativeBinomial())
glm_results = glm_model.fit()

# Print the summary of the fixed effects model
print(glm_results.summary())

# Add the predicted fixed effects to the dataset
model_fixed_object_data['fixed_effects'] = glm_results.fittedvalues

# Construct the formula for the mixed effects model
mixed_effects_formula = 'fixed_effects ~ 1'

# Fit the mixed effects model using MixedLM
mixed_model = smf.mixedlm(mixed_effects_formula, model_fixed_object_data, groups=model_fixed_object_data['site_id'], re_formula="~C(hour)+C(day)+C(week)")
mixed_results = mixed_model.fit()

# Print the summary of the mixed effects model
print(mixed_results.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:          person_counts   No. Observations:              3816556
Model:                            GLM   Df Residuals:                  3816550
Model Family:        NegativeBinomial   Df Model:                            5
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.3069e+07
Date:                Fri, 26 Jul 2024   Deviance:                   4.6104e+06
Time:                        15:39:08   Pearson chi2:                 5.41e+06
No. Iterations:                     6   Pseudo R-squ. (CS):           0.007899
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           2.5483      0.001   20

In [7]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from statsmodels.discrete.count_model import ZeroInflatedPoisson

# Define the counts column (e.g., 'car_counts') to model
counts_column = 'person_counts'

# Ensure all necessary data types are compatible and treat them as categorical
model_fixed_object_data = hourly_averages.copy()
model_fixed_object_data['hour'] = model_fixed_object_data['hour'].astype('category')
model_fixed_object_data['day'] = model_fixed_object_data['day'].astype('category')
model_fixed_object_data['week'] = model_fixed_object_data['week'].astype('category')
model_fixed_object_data['year'] = model_fixed_object_data['year'].astype('category')
model_fixed_object_data['site_id'] = model_fixed_object_data['site_id'].astype('category')

# Prepare the fixed effects formula for the count model
count_formula = f'{counts_column} ~ C(year) + C(hour) + C(day) + C(week)'

# Prepare the fixed effects formula for the zero inflation model (logit model)
# You can use different variables for the inflation model or the same
infl_formula = '1'  # Simple model with only intercept, can be extended

# Add the intercept to the count model (if not already included)
model_fixed_object_data = sm.add_constant(model_fixed_object_data, prepend=False)

# Fit the count model to get initial parameters
count_model = smf.glm(count_formula, data=model_fixed_object_data, family=sm.families.Poisson())
count_results = count_model.fit()

# Extract the fitted values from the count model to use as an offset in the zero-inflation model
model_fixed_object_data['offset'] = np.log(count_results.fittedvalues + 1e-5)

# Define the exog and exog_infl for the Zero-Inflated Poisson model
exog = model_fixed_object_data[['const', 'year', 'hour', 'day', 'week']]
exog_infl = model_fixed_object_data[['const']]

# Convert categorical variables to dummy variables
exog = pd.get_dummies(exog, drop_first=True)
exog_infl = pd.get_dummies(exog_infl, drop_first=True)

# Fit the Zero-Inflated Poisson model
zip_model = ZeroInflatedPoisson(
    endog=model_fixed_object_data[counts_column],
    exog=exog,
    exog_infl=exog_infl,
    inflation='logit'
)

# Fit the model
zip_results = zip_model.fit()

# Print the summary of the model
print(zip_results.summary())


ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
from statsmodels.discrete.count_model import ZeroInflatedPoisson

# Define the counts column (e.g., 'car_counts') to model
counts_column = 'people_counts'

# Ensure all necessary data types are compatible and treat them as categorical
model_fixed_object_data = fixed_object_data.copy()
model_fixed_object_data['hour'] = model_fixed_object_data['hour'].astype('category')
model_fixed_object_data['day'] = model_fixed_object_data['day'].astype('category')
model_fixed_object_data['week'] = model_fixed_object_data['week'].astype('category')
model_fixed_object_data['year'] = model_fixed_object_data['year'].astype('category')
model_fixed_object_data['site_id'] = model_fixed_object_data['site_id'].astype('category')

# Fit the mixed effects model using MixedLM for random effects
mixed_effects_formula = f'{counts_column} ~ year'
mixed_model = smf.mixedlm(mixed_effects_formula, model_fixed_object_data, groups=model_fixed_object_data['site_id'], re_formula="~C(hour)+C(day)+C(week)")
mixed_results = mixed_model.fit()

# Print the summary of the mixed effects model
print(mixed_results.summary())

# Use the fitted random effects from the mixed model
model_fixed_object_data['mixed_effects'] = mixed_results.fittedvalues

# Fit the Zero-Inflated Poisson model using the fixed effects
zip_formula = f'{counts_column} ~ year'
zip_model = ZeroInflatedPoisson.from_formula(
    formula=zip_formula, 
    data=model_fixed_object_data,
    exog_infl='1',
    offset=model_fixed_object_data['mixed_effects'],
    inflation='logit'
)

# Fit the zero-inflated model
zip_results = zip_model.fit()

# Print the summary of the zero-inflated model
print(zip_results.summary())