In [None]:
!pip install statsmodels patsy

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.bayes_mixed_glm import PoissonBayesMixedGLM
from tqdm import tqdm
from patsy import dmatrices
from patsy import dmatrix
from statsmodels.discrete.count_model import ZeroInflatedPoisson

In [2]:
object_data = pd.read_csv('./2024_Jul_ob_count.csv')

In [3]:
# Create array of category names as they appear in the detections data. See paper for details of each category.
categories = ['car', 'person', 'trotro', 'stall', 'truck', 'stove', 'motorcycle', 'vendor', 'lorry', 'umbrella', 'bus', 'trash', 'taxi', 'van', 'debris', 'loudspeaker', 'bowl', 'food', 'animal', 'bicycle']

# Column names in the data frame for the number of counts of each category type in an image.
count_cols = [cat+'_counts' for cat in categories]

super_count_cols = ['people'+'_counts', 'small_vehicles'+'_counts', 'two_wheelers'+'_counts', 'large_vehicles'+'_counts', 'refuse'+'_counts', 'market'+'_counts', 'animal'+'_counts']

all_count_cols = count_cols + super_count_cols

vehicle_categories = ['car', 'trotro', 'truck', 'motorcycle', 'lorry', 'bus', 'taxi', 'van', 'bicycle']

# Define super categories
super_categories = {
    'people': ['person', 'vendor'],
    'small_vehicles': ['car', 'taxi', 'truck'],
    'two_wheelers': ['bicycle', 'motorcycle'],
    'large_vehicles': ['trotro', 'van', 'lorry', 'bus'],
    'refuse': ['trash', 'debris'],
    'market': ['umbrella', 'stall', 'bowl', 'food'],
    'animal': ['animal']
}

In [14]:
# Ensure datetime is in datetime format
object_data['datetime'] = pd.to_datetime(object_data['datetime_rectified'], format='%Y-%m-%d %H:%M:%S')

# Create additional time-related columns
object_data['hour'] = object_data['datetime'].dt.hour
object_data['day'] = object_data['datetime'].dt.dayofweek + 1  # +1 to match R's 1-indexing
object_data['week'] = object_data['datetime'].dt.isocalendar().week
object_data['year'] = object_data['datetime'].dt.year

# Split 'site_id_cam_angle' into 'site_id' and 'camera' columns
object_data[['site_id', 'camera']] = object_data['site_id_cam_angle'].str.split('_', expand=True)

# Fill missing values in 'camera' with 'single'
object_data['camera'].fillna('single', inplace=True)

# Filter data between specified dates
start_date = pd.Timestamp('2019-04-01')
end_date = pd.Timestamp('2024-04-01')
fixed_object_data = object_data[(object_data['datetime'] >= start_date) & (object_data['datetime'] <= end_date)]
fixed_object_data = fixed_object_data[fixed_object_data['view'] == 'clear']
# Display the first few rows of the new dataframe to verify
fixed_object_data.head()


Unnamed: 0,directory_name_rectified,site_id_cam_angle,view,image_name,datetime_rectified,date_rectified,animal_counts,bicycle_counts,bowl_counts,bus_counts,...,vendor_counts,directory_name_original,datetime_original,datetime,hour,day,week,year,site_id,camera
0,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3070.JPG,2024-03-01 08:32:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:32:02,2024-03-01 08:32:02,8,5,9,2024,AD,right
1,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3071.JPG,2024-03-01 08:37:02,2024-03-01,0,1,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:37:02,2024-03-01 08:37:02,8,5,9,2024,AD,right
2,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3072.JPG,2024-03-01 08:42:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:42:02,2024-03-01 08:42:02,8,5,9,2024,AD,right
3,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3073.JPG,2024-03-01 08:47:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:47:02,2024-03-01 08:47:02,8,5,9,2024,AD,right
4,AD_01_03_2024_C22_S15,AD_right,clear,MFDC3074.JPG,2024-03-01 08:52:02,2024-03-01,0,0,0,0,...,0,AD_01_03_2024_C22_S15,2024-03-01 08:52:02,2024-03-01 08:52:02,8,5,9,2024,AD,right


In [15]:
# Sum counts for each super category
for super_cat, categories in super_categories.items():
    # Create a column for each supercategory by summing its categories
    fixed_object_data[super_cat + '_counts'] = fixed_object_data[[cat + '_counts' for cat in categories]].sum(axis=1)

In [22]:
import pandas as pd

# Ensure 'datetime_hour' is created properly
fixed_object_data['datetime_hour'] = fixed_object_data['datetime'].dt.round('H')
assert 'datetime_hour' in fixed_object_data.columns, "'datetime_hour' column is missing"

# Print the first few rows of fixed_object_data to verify 'datetime_hour'
print("fixed_object_data:")
print(fixed_object_data.head())

# Sum the counts within each hour for each camera at each site and include directory_name_rectified and camera info
hourly_counts = fixed_object_data.groupby(['site_id', 'camera', 'datetime_hour', 'directory_name_rectified'])[all_count_cols].sum().reset_index()
assert 'datetime_hour' in hourly_counts.columns, "'datetime_hour' column is missing after groupby"

# Print the first few rows of hourly_counts to verify the groupby operation
print("\nhourly_counts:")
print(hourly_counts.head())

# Define the aggregation functions for each column
def aggregate_directories(x):
    print("Aggregating directories:", x)
    return '|'.join(sorted(set(x)))

def aggregate_cameras(x):
    print("Aggregating cameras:", x)
    return ','.join(sorted(set(x)))

# Group by the rounded 'datetime', 'site_id', then calculate the mean for each object category
agg_dict = {col: 'mean' for col in all_count_cols}
agg_dict['directory_name_rectified'] = aggregate_directories
agg_dict['camera'] = aggregate_cameras

print("Aggregation dictionary:")
print(agg_dict)

# Perform the aggregation step by step
grouped = hourly_counts.groupby(['site_id', 'datetime_hour'])

# Aggregate numeric columns
hourly_averages = grouped.agg({col: 'mean' for col in all_count_cols}).reset_index()

# Aggregate directory_name_rectified
hourly_averages['directory_name_rectified'] = grouped['directory_name_rectified'].apply(aggregate_directories).values

# Aggregate camera
hourly_averages['camera'] = grouped['camera'].apply(aggregate_cameras).values

assert 'datetime_hour' in hourly_averages.columns, "'datetime_hour' column is missing after aggregation"

# Print the first few rows of hourly_averages to verify the aggregation
print("\nhourly_averages:")
print(hourly_averages.head())

# Add the date column from the rounded 'datetime_hour'
hourly_averages['date'] = hourly_averages['datetime_hour'].dt.date

# Create additional time-related columns
hourly_averages['hour'] = hourly_averages['datetime_hour'].dt.hour
hourly_averages['day'] = hourly_averages['datetime_hour'].dt.dayofweek + 1  # +1 to match R's 1-indexing
hourly_averages['week'] = hourly_averages['datetime_hour'].dt.isocalendar().week
hourly_averages['year'] = hourly_averages['datetime_hour'].dt.year

# Add the left and right camera indicator columns
hourly_averages['left_cam'] = hourly_averages['camera'].apply(lambda x: 1 if 'left' in x else 0)
hourly_averages['right_cam'] = hourly_averages['camera'].apply(lambda x: 1 if 'right' in x else 0)

# Reorder the columns to match the requested format
final_columns = ['datetime_hour', 'date', 'site_id', 'hour', 'day', 'week', 'year', 'directory_name_rectified', 'left_cam', 'right_cam'] + all_count_cols
hourly_averages = hourly_averages[final_columns]

# Rename 'datetime_hour' to 'datetime' to match the final requested column name
hourly_averages.rename(columns={'datetime_hour': 'datetime', 'directory_name_rectified': 'directory_pair'}, inplace=True)

# Display the first few rows of the new dataframe to verify
print("\nfinal hourly_averages:")
print(hourly_averages.head())


fixed_object_data:
  directory_name_rectified site_id_cam_angle   view    image_name  \
0    AD_01_03_2024_C22_S15          AD_right  clear  MFDC3070.JPG   
1    AD_01_03_2024_C22_S15          AD_right  clear  MFDC3071.JPG   
2    AD_01_03_2024_C22_S15          AD_right  clear  MFDC3072.JPG   
3    AD_01_03_2024_C22_S15          AD_right  clear  MFDC3073.JPG   
4    AD_01_03_2024_C22_S15          AD_right  clear  MFDC3074.JPG   

    datetime_rectified date_rectified  animal_counts  bicycle_counts  \
0  2024-03-01 08:32:02     2024-03-01              0               0   
1  2024-03-01 08:37:02     2024-03-01              0               1   
2  2024-03-01 08:42:02     2024-03-01              0               0   
3  2024-03-01 08:47:02     2024-03-01              0               0   
4  2024-03-01 08:52:02     2024-03-01              0               0   

   bowl_counts  bus_counts  ...  year  site_id  camera  people_counts  \
0            0           0  ...  2024       AD   right      

AttributeError: 'DataFrame' object has no attribute 'name'

In [None]:
# Step 7: Round the 'datetime' to the nearest hour
fixed_object_data['datetime_hour'] = fixed_object_data['datetime'].dt.round('H')

# Step 8: Sum the counts within each hour for each camera at each site
hourly_counts = fixed_object_data.groupby(['site_id', 'camera', 'datetime_hour'])[all_count_cols].sum().reset_index()

# Step 9: Group by the rounded 'datetime', 'date', and 'site_id', then calculate the mean for each object category
hourly_averages = hourly_counts.groupby(['site_id', 'datetime_hour'])[all_count_cols].mean().reset_index()

# Step 10: Add the date column from the rounded 'datetime_hour'
hourly_averages['date'] = hourly_averages['datetime_hour'].dt.date

# Create additional time-related columns
hourly_averages['hour'] = hourly_averages['datetime_hour'].dt.hour
hourly_averages['day'] = hourly_averages['datetime_hour'].dt.dayofweek + 1  # +1 to match R's 1-indexing
hourly_averages['week'] = hourly_averages['datetime_hour'].dt.isocalendar().week
hourly_averages['year'] = hourly_averages['datetime_hour'].dt.year


# Step 11: Reorder the columns to match the requested format
final_columns = ['datetime_hour', 'date', 'site_id', 'hour', 'day', 'week', 'year'] + all_count_cols
hourly_averages = hourly_averages[final_columns]

# Step 12: Rename 'datetime_hour' to 'datetime' to match the final requested column name
hourly_averages.rename(columns={'datetime_hour': 'datetime'}, inplace=True)

# Display the first few rows of the new dataframe to verify
hourly_averages.head()

In [None]:
from tqdm import tqdm
import pandas as pd
import numpy as np

# Create a function to determine the directory pair or single directory for each row
def get_directory_name(row):
    # Filter rows for the same site and datetime
    matching_rows = fixed_object_data[(fixed_object_data['site_id'] == row['site_id']) & 
                                      (fixed_object_data['datetime_hour'] == row['datetime'])]
    # Get unique directories and cameras
    unique_dirs = matching_rows['directory_name_rectified'].unique()
    unique_cameras = matching_rows['camera'].unique()
    
    if len(unique_dirs) == 1:
        return unique_dirs[0]  # Single camera or only one camera available
    else:
        return '|'.join(sorted(unique_dirs))  # Join directories with '|' to indicate pairs

# Initialize the tqdm progress bar
tqdm.pandas()

# Apply the function to create the new column with progress monitoring
hourly_averages['directory_pair'] = hourly_averages.progress_apply(get_directory_name, axis=1)

# Create indicator variables for left and right cameras
hourly_averages['left_cam'] = hourly_averages.progress_apply(
    lambda row: 1 if 'left' in fixed_object_data[(fixed_object_data['site_id'] == row['site_id']) & 
                                                 (fixed_object_data['datetime_hour'] == row['datetime'])]['camera'].unique() else 0,
    axis=1
)

hourly_averages['right_cam'] = hourly_averages.progress_apply(
    lambda row: 1 if 'right' in fixed_object_data[(fixed_object_data['site_id'] == row['site_id']) & 
                                                  (fixed_object_data['datetime_hour'] == row['datetime'])]['camera'].unique() else 0,
    axis=1
)


In [None]:
# Display the first few rows of the new dataframe to verify
hourly_averages.head()

In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Decide whether to include "week" as a variable or not
week_bool = True

# OPTIONAL Step 1: Filter the data to include only those rows with counts > 0
filtered_data = hourly_averages.copy()
# filtered_data = hourly_averages[hourly_averages['people_counts'] > 0].copy()

# Step 2: Create the endogenous variable (response variable)
endog = filtered_data['people_counts'].astype(int)

# Step 3: Convert relevant columns to categorical
filtered_data['hour'] = filtered_data['hour'].astype('category')
filtered_data['day'] = filtered_data['day'].astype('category')
if week_bool == True:
    filtered_data['week'] = filtered_data['week'].astype('category')
filtered_data['site_id'] = filtered_data['site_id'].astype('category')
filtered_data['year'] = filtered_data['year'].astype('category')

# Step 4: One-hot encode 'hour', 'day', 'week', 'site_id', and 'year' for fixed effects
if week_bool == True:
    exog_fixed = pd.get_dummies(filtered_data[['hour', 'day', 'week', 'site_id', 'year']], drop_first=True)
else:
    exog_fixed = pd.get_dummies(filtered_data[['hour', 'day', 'site_id', 'year']], drop_first=True)

# Step 5: Add intercept
exog_fixed = sm.add_constant(exog_fixed)

# Step 6: Convert exog_fixed to float
exog_fixed = exog_fixed.astype(float)

# Step 7: Fit the GLM with a Negative Binomial family
glm_model = sm.GLM(endog, exog_fixed, family=sm.families.NegativeBinomial())
# glm_model = sm.GLM(endog, exog_fixed, family=sm.families.Poisson())
glm_result = glm_model.fit()

# Step 8: Display the results of the fixed effects
print(glm_result.summary())



In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Function to calculate the confidence intervals and exponentiate the coefficients
def calculate_effects_and_ci(glm_result, var):
    coef = glm_result.params[var]
    conf = glm_result.conf_int().loc[var]
    lower, upper = conf
    return np.exp(coef), np.exp(lower), np.exp(upper)

# Function to plot effects for a given variable
def plot_effects(glm_result, var, x_labels, title, ref_class_label):
    effects = [calculate_effects_and_ci(glm_result, v) for v in var]
    estimates, lower_bounds, upper_bounds = zip(*effects)
    
    # Add the reference class at the start
    estimates = [1] + list(estimates)
    lower_bounds = [1] + list(lower_bounds)
    upper_bounds = [1] + list(upper_bounds)
    x_labels = [ref_class_label] + x_labels
    
    plt.figure(figsize=(12, 6))
    plt.errorbar(range(len(x_labels)), estimates, yerr=[np.array(estimates) - np.array(lower_bounds), np.array(upper_bounds) - np.array(estimates)], fmt='o', ecolor='gray', capsize=5)
    plt.xticks(ticks=range(len(x_labels)), labels=x_labels, rotation=45)
    plt.xlabel('Categories')
    plt.ylabel('Multiplicative Effect on Counts')
    plt.title(title)
    plt.grid(True)
    
    # Highlight the reference class differently
    plt.scatter(0, 1, color='red', zorder=5, label='Reference Class')
    plt.legend()
    
    plt.show()

# Variables to plot
hour_vars = [col for col in exog_fixed.columns if 'hour' in col and col != 'const']
day_vars = [col for col in exog_fixed.columns if 'day' in col and col != 'const']
site_vars = [col for col in exog_fixed.columns if 'site_id' in col and col != 'const']
year_vars = [col for col in exog_fixed.columns if 'year' in col and col != 'const']
week_vars = [col for col in exog_fixed.columns if 'week' in col and col != 'const'] if week_bool else []

# X-axis labels
hour_labels = [f'Hour {i}' for i in range(1, 24)]
day_labels = ['Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']  # Assuming 'Mon' is the reference category
site_labels = [label.split('_')[2] for label in site_vars]
year_labels = [label.split('_')[1] for label in year_vars]
week_labels = [label.split('_')[1] for label in week_vars]

# Ensure the number of labels matches the number of data points
assert len(hour_labels) == len(hour_vars)
assert len(day_labels) == len(day_vars)
assert len(site_labels) == len(site_vars)
assert len(year_labels) == len(year_vars)
if week_bool:
    assert len(week_labels) == len(week_vars)

# Plot effects
plot_effects(glm_result, hour_vars, hour_labels, 'Effect of Hour of Day on People Counts', 'Hour 0')
plot_effects(glm_result, day_vars, day_labels, 'Effect of Day of Week on People Counts', 'Mon')
plot_effects(glm_result, site_vars, site_labels, 'Effect of Site ID on People Counts', 'AD')
plot_effects(glm_result, year_vars, year_labels, 'Effect of Year on People Counts', '2019')

if week_bool:
    plot_effects(glm_result, week_vars, week_labels, 'Effect of Week on People Counts', 'Week 1')


In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Decide whether to include "week" as a variable or not
week_bool = False

# OPTIONAL Step 1: Filter the data to include only those rows with counts > 0
filtered_data = hourly_averages.copy()
# filtered_data = hourly_averages[hourly_averages['people_counts'] > 0].copy()

# Step 2: Create the endogenous variable (response variable)
endog = filtered_data['people_counts'].astype(int)

# Step 3: Convert relevant columns to categorical
filtered_data['hour'] = filtered_data['hour'].astype('category')
filtered_data['day'] = filtered_data['day'].astype('category')
if week_bool == True:
    filtered_data['week'] = filtered_data['week'].astype('category')
filtered_data['site_id'] = filtered_data['site_id'].astype('category')
filtered_data['year'] = filtered_data['year'].astype('category')

# Step 4: One-hot encode 'hour', 'day', 'week', 'site_id', and 'year' for fixed effects
if week_bool == True:
    exog_fixed = pd.get_dummies(filtered_data[['hour', 'day', 'week', 'site_id', 'year']], drop_first=True)
else:
    exog_fixed = pd.get_dummies(filtered_data[['hour', 'day', 'site_id', 'year']], drop_first=True)

# Add back the reference classes manually
exog_fixed['hour_0'] = (filtered_data['hour'] == '0').astype(float)
exog_fixed['day_1'] = (filtered_data['day'] == '1').astype(float)
exog_fixed['site_id_AD'] = (filtered_data['site_id'] == 'AD').astype(float)

# Step 5: Add interaction terms efficiently
interaction_terms_list = []
for col1 in ['hour_0'] + [f'hour_{i}' for i in range(1, 24)]:
    for col2 in ['day_1'] + [f'day_{i}' for i in range(2, 8)]:
        interaction_terms_list.append(exog_fixed[col1] * exog_fixed[col2])
        interaction_terms_list[-1].name = f'{col1}:{col2}'
    for col2 in ['site_id_AD'] + [f'site_id_{id}' for id in ['ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']]:
        interaction_terms_list.append(exog_fixed[col1] * exog_fixed[col2])
        interaction_terms_list[-1].name = f'{col1}:{col2}'
for col1 in ['day_1'] + [f'day_{i}' for i in range(2, 8)]:
    for col2 in ['site_id_AD'] + [f'site_id_{id}' for id in ['ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']]:
        interaction_terms_list.append(exog_fixed[col1] * exog_fixed[col2])
        interaction_terms_list[-1].name = f'{col1}:{col2}'

interaction_terms = pd.concat(interaction_terms_list, axis=1)

# Combine the original exog_fixed with interaction terms
exog_fixed = pd.concat([exog_fixed, interaction_terms], axis=1)

# Step 6: Add intercept
exog_fixed = sm.add_constant(exog_fixed)

# Step 7: Convert exog_fixed to float
exog_fixed = exog_fixed.astype(float)

# Step 8: Fit the GLM with a Negative Binomial family
glm_model = sm.GLM(endog, exog_fixed, family=sm.families.NegativeBinomial())
# glm_model = sm.GLM(endog, exog_fixed, family=sm.families.Poisson())
glm_result = glm_model.fit()

# Step 9: Display the results of the fixed effects
print(glm_result.summary())

In [None]:
# Step 9: Prepare data for random effects model
# Note: MixedLM in statsmodels is primarily for linear mixed models and does not support Negative Binomial directly

# Fit the mixed effects model with random effects
if week_bool == True:
    random_effects = pd.get_dummies(filtered_data[['hour', 'day', 'week', 'site_id']], drop_first=True)
else:
    random_effects = pd.get_dummies(filtered_data[['hour', 'day', 'site_id']], drop_first=True)

random_effects = sm.add_constant(random_effects).astype(float)

mixed_model = sm.MixedLM(endog, exog_fixed, groups=filtered_data['site_id'])
mixed_result = mixed_model.fit()

# Step 10: Display the results of the mixed effects model
print(mixed_result.summary())
