In [1]:
!pip install statsmodels



In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# Category names, spelled exactly as they appear in the detections data. See paper for details of each category.
categories = [
    'car', 'person', 'trotro', 'stall', 'truck', 'stove', 'motorcycle', 'vendor', 'lorry', 'umbrella',
    'bus', 'trash', 'taxi', 'van', 'debris', 'loudspeaker', 'bowl', 'food', 'animal', 'bicycle',
]

# The categories that are vehicles.
vehicle_categories = ['car', 'trotro', 'truck', 'motorcycle', 'lorry', 'bus', 'taxi', 'van', 'bicycle']

# Super categories: each key names a group, each value lists the member categories.
super_categories = dict(
    people=['person', 'vendor'],
    small_vehicles=['car', 'taxi', 'truck'],
    two_wheelers=['bicycle', 'motorcycle'],
    large_vehicles=['trotro', 'van', 'lorry', 'bus'],
    refuse=['trash', 'debris'],
    market=['umbrella', 'stall', 'bowl', 'food'],
    animal=['animal'],
)

# Data-frame column holding the per-image count of each individual category.
count_cols = [f'{cat}_counts' for cat in categories]

# One count column per super category, in the order the super categories are declared above.
super_count_cols = [f'{group}_counts' for group in super_categories]

# Individual-category columns first, then the super-category columns.
all_count_cols = [*count_cols, *super_count_cols]

In [3]:
# # Load the model coefficients from the CSV file
# site = 'ASH'
# model_df = pd.read_csv(f'./{site}_model_coefficients.csv')



In [4]:
import pandas as pd
import os

# Define the path to your fold directories and other relevant variables
# NOTE(review): fold_directories is not read by diagnose_fold_data below, which builds 'fold_{n}' itself.
fold_directories = [f'fold_{i}' for i in range(1, 6)]
sites = ['AD']  # Replace with your actual site names
# NOTE: this rebinds super_categories, which the constants cell defined as a dict of member lists,
# to a plain list of super-category names.
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']  # Replace with your actual categories

# Function to diagnose the presence of terms in each fold
def diagnose_fold_data(site, super_category, folds=range(1, 6)):
    """
    Print, for each fold, how many model terms mention each time unit.

    For every fold the file
    '{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
    (with fold_dir = 'fold_{fold}') is read and the entries of its 'term' column
    are counted by substring match against 'hour', 'day', 'year' and 'week'.
    A warning is printed for every time unit with no matching term.

    Parameters:
    site (str): Site name used in the file name.
    super_category (str): Super-category name used in the file name.
    folds (iterable of int): Fold numbers to check. Defaults to folds 1-5.

    Returns:
    None. All findings are reported with print().
    """
    time_units = ('hour', 'day', 'year', 'week')

    for fold in folds:
        fold_dir = f'fold_{fold}'
        filepath = f'{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'

        if not os.path.exists(filepath):
            print(f"Error: The file {filepath} does not exist.")
            continue

        print(f"Checking {filepath}...")
        try:
            model_df = pd.read_csv(filepath)
        except pd.errors.EmptyDataError:
            # read_csv raises on a file with no header/content instead of returning an
            # empty frame; treat it like any other empty file.
            model_df = pd.DataFrame()

        if model_df.empty:
            print(f"Warning: The file {filepath} is empty.")
            continue

        if 'term' not in model_df.columns:
            print(f"Warning: The file {filepath} has no 'term' column.")
            continue

        # Missing term names would break the substring test, so drop them first.
        terms = model_df['term'].dropna().astype(str)
        counts = {unit: sum(unit in term for term in terms) for unit in time_units}

        print(f"Fold {fold} Summary for {super_category} at site {site}:")
        for unit in time_units:
            print(f"- Number of '{unit}' terms: {counts[unit]}")

        # Further diagnostics: Print specific issues
        for unit in time_units:
            if counts[unit] == 0:
                print(f"  Warning: No '{unit}' terms found in fold {fold}.")

# Loop over each site and super category to perform the diagnosis
# diagnose_fold_data prints its findings itself; a dashed separator line is printed after each
# (site, super category) pair.
for site in sites:
    for super_category in super_categories:
        print(f"\nStarting diagnosis for site: {site}, super category: {super_category}")
        diagnose_fold_data(site, super_category)
        print("--------------------------------------------------")



Starting diagnosis for site: AD, super category: people
Checking fold_1/AD_people_fold_1_conditional_model_coefficients.csv...
Fold 1 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_2/AD_people_fold_2_conditional_model_coefficients.csv...
Fold 2 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_3/AD_people_fold_3_conditional_model_coefficients.csv...
Fold 3 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_4/AD_people_fold_4_conditional_model_coefficients.csv...
Fold 4 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_5/AD_people_fold_5_conditional_model_coef

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm

# Function to read the coefficient and its confidence bounds from one model row
def calculate_effects_and_ci(row):
    """
    Return the estimate and its confidence bounds from a coefficient row.

    Parameters:
    row (mapping): Anything indexable by 'estimate', 'conf.low' and 'conf.high'.

    Returns:
    tuple: (estimate, lower bound, upper bound), read as-is with no computation.
    """
    return row['estimate'], row['conf.low'], row['conf.high']

# Function to combine uncertainties in quadrature
def combine_uncertainties_in_quadrature(uncertainties):
    """
    Combine uncertainties in quadrature and divide by how many there are.

    The value returned is sqrt(sum(u_i ** 2)) / N, i.e. the root-sum-of-squares
    of the inputs scaled down by their count.

    Parameters:
    uncertainties (list or array-like): The N individual uncertainties.

    Returns:
    float: The combined uncertainty.
    """
    squared = np.square(uncertainties)
    root_sum_of_squares = np.sqrt(np.sum(squared))
    return root_sum_of_squares / len(uncertainties)

# Function to plot effects for the site with all folds
def plot_effects_all_folds(model_dfs, x_labels, title, ref_class_label, site, super_category):
    """
    Plot every fold's effects with confidence-interval error bars and save the figure.

    Parameters:
    model_dfs (dict): Maps a fold identifier to a DataFrame with 'estimate',
        'conf.low' and 'conf.high' columns. One point is drawn per row, in row order.
    x_labels (list): Tick labels for the DataFrame rows, in row order.
    title (str): Figure title; also used (spaces replaced by '_') in the file name.
    ref_class_label (str): Tick label of the reference class, which is drawn first
        at y=1 with zero-length error bars.
    site (str), super_category (str): Used to build the output directory.

    Returns:
    None. The figure is saved under ./results/time_series/{site}/{super_category}/
    and then closed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # The tick labels do not depend on the fold, so build them once up front.
    # This also keeps them defined when model_dfs is empty.
    x_labels_mod = [ref_class_label] + x_labels
    x_positions = range(len(x_labels_mod))

    for fold, model_df in model_dfs.items():
        estimates, lower_bounds, upper_bounds = zip(*model_df.apply(calculate_effects_and_ci, axis=1))

        # Prepend the reference class: effect 1 with both bounds equal to 1.
        estimates = np.array([1] + list(estimates))
        lower_bounds = np.array([1] + list(lower_bounds))
        upper_bounds = np.array([1] + list(upper_bounds))

        # Plotting the effects with error bars for each fold
        ax.errorbar(
            x_positions,
            estimates,
            yerr=[estimates - lower_bounds, upper_bounds - estimates],
            fmt='o',
            capsize=5,
            label=f'Fold {fold}'
        )

    # Adding a thicker black line at y=1 to represent the reference level
    ax.axhline(y=1, color='black', linewidth=2, linestyle='--')

    ax.set_xticks(list(x_positions))
    ax.set_xticklabels(x_labels_mod, rotation=45)
    ax.set_xlabel('Categories')
    ax.set_ylabel('Multiplicative Effect on Counts')
    ax.set_title(title)
    ax.grid(True)
    ax.legend()

    # Create directory if it doesn't exist
    output_dir = f'./results/time_series/{site}/{super_category}/'
    os.makedirs(output_dir, exist_ok=True)

    # Save the figure
    fig.savefig(f'{output_dir}/{title.replace(" ", "_")}_all_folds.png', bbox_inches='tight')
    plt.close(fig)

# Function to plot the effects aggregated across folds
def plot_aggregate_effects(model_dfs, x_labels, title, ref_class_label, site, super_category):
    """
    Plot the fold-averaged effects with combined error bars and save the figure.

    For every row position the estimates are averaged over folds. The lower and
    upper error-bar lengths (estimate minus 'conf.low', 'conf.high' minus
    estimate, taken per fold) are each merged over folds with
    combine_uncertainties_in_quadrature.

    Parameters:
    model_dfs (dict): Maps a fold identifier to a DataFrame with 'estimate',
        'conf.low' and 'conf.high' columns.
    x_labels (list): Tick labels for the DataFrame rows, in row order.
    title (str): Figure title; also used (spaces replaced by '_') in the file name.
    ref_class_label (str): Tick label of the reference class, drawn first at y=1
        with zero-length error bars.
    site (str), super_category (str): Used to build the output directory.

    Returns:
    None. The figure is saved under ./results/time_series/{site}/{super_category}/
    and then closed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    per_fold_estimates = []
    per_fold_lower = []
    per_fold_upper = []

    for model_df in model_dfs.values():
        centre, low, high = zip(*model_df.apply(calculate_effects_and_ci, axis=1))
        centre_arr = np.array(centre)
        per_fold_estimates.append(centre)

        # Error-bar lengths of this fold: distance from its own estimate to each bound
        per_fold_lower.append(centre_arr - np.array(low))
        per_fold_upper.append(np.array(high) - centre_arr)

    # Stack the folds: one row per fold, one column per term
    estimates_arr = np.array(per_fold_estimates)
    lower_arr = np.array(per_fold_lower)
    upper_arr = np.array(per_fold_upper)

    # Average the estimates over folds
    fold_means = np.mean(estimates_arr, axis=0)

    # Merge each term's error-bar lengths over folds (column by column)
    merged_lower = [combine_uncertainties_in_quadrature([fold_row[col] for fold_row in lower_arr])
                    for col in range(len(fold_means))]
    merged_upper = [combine_uncertainties_in_quadrature([fold_row[col] for fold_row in upper_arr])
                    for col in range(len(fold_means))]

    # The reference class goes first: effect 1, no uncertainty
    tick_labels = [ref_class_label] + x_labels
    plotted_means = [1, *fold_means]
    plotted_lower = [0, *merged_lower]
    plotted_upper = [0, *merged_upper]
    positions = range(len(tick_labels))

    ax.errorbar(
        positions,
        plotted_means,
        yerr=[plotted_lower, plotted_upper],
        fmt='o',
        capsize=5,
        label='Aggregated across folds'
    )

    # Dashed black line marking the reference level y=1
    ax.axhline(y=1, color='black', linewidth=2, linestyle='--')

    ax.set_xticks(list(positions))
    ax.set_xticklabels(tick_labels, rotation=45)
    ax.set_xlabel('Categories')
    ax.set_ylabel('Multiplicative Effect on Counts')
    ax.set_title(title)
    ax.grid(True)
    ax.legend()

    # Make sure the destination folder exists, then write the figure
    output_dir = f'./results/time_series/{site}/{super_category}/'
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f'{output_dir}/{title.replace(" ", "_")}_aggregated.png', bbox_inches='tight')
    plt.close(fig)

# Loop over all files matching the pattern in each fold directory
fold_directories = [f'fold_{i}' for i in range(1, 6)]

sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']

# One entry per time unit:
# (substring that selects its terms, x-axis labels, wording used in the title, reference-class label).
# Labels are matched to terms by position, assuming the reference classes are
# 'Hour 0', 'Mon', '2019' and 'Week 1'.
time_unit_specs = [
    ('hour', [f'Hour {i}' for i in range(1, 24)], 'Hour of Day', 'Hour 0'),
    ('day', ['Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], 'Day of Week', 'Mon'),
    ('year', ['2020', '2021', '2022', '2023', '2024'], 'Year', '2019'),
    ('week', [f'Week {i}' for i in range(2, 54)], 'Week', 'Week 1'),
]

# All per-fold plots are produced first, then the aggregated ones.
plot_variants = [
    (plot_effects_all_folds, 'Effect'),
    (plot_aggregate_effects, 'Aggregated Effect'),
]

for site in tqdm(sites):  # Loop over each site
    for super_category in super_categories:

        model_dfs = {}
        for fold, fold_dir in enumerate(fold_directories, start=1):
            filepath = f'{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
            if os.path.exists(filepath):
                model_dfs[fold] = pd.read_csv(filepath)

        if not model_dfs:
            continue

        # Use the lowest-numbered fold that was actually found to determine the available
        # terms; indexing fold 1 directly fails when only later folds exist on disk.
        first_model_df = model_dfs[min(model_dfs)]
        available_terms = first_model_df['term']

        for plot_function, title_prefix in plot_variants:
            for keyword, unit_labels, unit_title, ref_class_label in time_unit_specs:
                unit_vars = [var for var in available_terms if keyword in var]
                if not unit_vars:
                    continue

                plot_function(
                    {fold: fold_df[fold_df['term'].isin(unit_vars)] for fold, fold_df in model_dfs.items()},
                    # Keep only as many labels as there are terms (positional match)
                    unit_labels[:len(unit_vars)],
                    f'{title_prefix} of {unit_title} on {super_category.capitalize()} Counts',
                    ref_class_label,
                    site,
                    super_category
                )


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:23<00:00, 20.38s/it]


In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm

# Function to read the coefficient and its confidence bounds from one model row
def calculate_effects_and_ci(row):
    """
    Return (estimate, lower bound, upper bound) from a coefficient row.

    Parameters:
    row (mapping): Anything indexable by 'estimate', 'conf.low' and 'conf.high'.

    Returns:
    tuple: The three values in that order, read as-is with no computation.
    """
    return tuple(row[column] for column in ('estimate', 'conf.low', 'conf.high'))

# Function to plot hour effects stratified by day of the week
def plot_hour_effects_by_day(model_df, hour_labels, title, site, super_category, show_whiskers=True):
    """
    Plot the hour-of-day effect separately for each day of the week and save the figure.

    Terms are looked up by name in model_df['term']: 'hour{h}' for the main effect
    and 'hour{h}:day{d}' (d = 2..7 for Tue..Sun) for the interaction. Monday uses
    the main effect alone; every other day multiplies the main effect by its
    interaction effect. Hours whose term(s) are missing are left out for that day.

    Parameters:
    model_df (DataFrame): Needs 'term', 'estimate', 'conf.low' and 'conf.high' columns.
    hour_labels (list of str): Tick label for hour 1, 2, ... in order. Its length
        sets how many hours are looked up.
    title (str): Figure title; also used (spaces replaced by '_') in the file name.
    site (str), super_category (str): Used to build the output directory.
    show_whiskers (bool): Draw faint confidence-interval whiskers when True.

    Returns:
    None. The figure is saved under ./results/time_series/{site}/{super_category}/
    and then closed.
    """
    # Days of the week mapping (assuming 1 = Monday, 2 = Tuesday, etc.)
    days_of_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    n_hours = len(hour_labels)
    hour_positions = np.arange(n_hours)  # hour h is drawn at x = h - 1

    def _lookup(term):
        """Return (estimate, low, high) of the first row named `term`, or None if absent."""
        rows = model_df[model_df['term'] == term]
        if rows.empty:
            return None
        return calculate_effects_and_ci(rows.iloc[0])

    fig, ax = plt.subplots(figsize=(12, 6))
    all_estimates = []  # Every plotted estimate, used for the y-axis limit

    for day_idx, day in enumerate(days_of_week, start=1):
        estimates = np.full(n_hours, np.nan)
        lower_bounds = np.full(n_hours, np.nan)
        upper_bounds = np.full(n_hours, np.nan)

        for position in range(n_hours):
            hour = position + 1
            main = _lookup(f'hour{hour}')
            if main is None:
                continue

            if day == 'Mon':
                # For Monday, use the main effects
                combined = main
            else:
                # For other days, combine main effects with interaction effects
                interaction = _lookup(f'hour{hour}:day{day_idx}')
                if interaction is None:
                    continue
                # Combine in log space (equivalent to multiplying the two effects).
                # NOTE(review): multiplying the bounds ignores any covariance between the
                # main and interaction terms, so the whiskers are only an approximation.
                combined = tuple(
                    np.exp(np.log(main_value) + np.log(inter_value))
                    for main_value, inter_value in zip(main, interaction)
                )

            estimates[position], lower_bounds[position], upper_bounds[position] = combined

        # Drop the hours with no estimate before plotting
        valid = ~np.isnan(estimates)
        if not valid.any():
            continue

        day_positions = hour_positions[valid]
        day_estimates = estimates[valid]
        day_lower = lower_bounds[valid]
        day_upper = upper_bounds[valid]
        all_estimates.extend(day_estimates)

        if show_whiskers:
            ax.errorbar(
                day_positions,
                day_estimates,
                yerr=[day_estimates - day_lower, day_upper - day_estimates],
                fmt='none',
                capsize=5,
                alpha=0.125,  # Faint whiskers
                elinewidth=1.5,
                ecolor='blue',  # Default color for whiskers
                errorevery=1,
                capthick=1.5
            )

        # Plot the central points with full visibility
        points = ax.errorbar(
            day_positions,
            day_estimates,
            yerr=None,  # No whiskers here
            fmt='o',
            capsize=5,
            label=f'Day: {day}',
            alpha=1.0,  # Fully visible central points
            markersize=12,
            elinewidth=1.5,
            errorevery=1,
            capthick=1.5
        )

        # Plot lines connecting points, in the same color as the points
        ax.plot(
            day_positions,
            day_estimates,
            linestyle='-',
            color=points.lines[0].get_color(),
            alpha=0.75,  # Slightly transparent line
            linewidth=2
        )

    # Adding a thicker black line at y=1 to represent the reference level
    ax.axhline(y=1, color='black', linewidth=2, linestyle='--')

    # Adjust y-axis limit to focus on the central tendency
    if all_estimates:
        ax.set_ylim(0, max(all_estimates) * 1.1)  # Upper limit is 1.1 times the max estimate

    # Label every hour position. Points are drawn at x = hour - 1, so the ticks must not
    # depend on which hours happened to be available for the last day plotted.
    ax.set_xticks(hour_positions)
    ax.set_xticklabels(hour_labels, rotation=45)
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel('Multiplicative Effect on Counts')
    ax.set_title(title)
    ax.grid(True)
    ax.legend()

    # Create directory if it doesn't exist
    output_dir = f'./results/time_series/{site}/{super_category}/'
    os.makedirs(output_dir, exist_ok=True)

    # Save the figure
    fig.savefig(f'{output_dir}/{title.replace(" ", "_")}_hour_by_day.png', bbox_inches='tight')
    plt.close(fig)

# Loop over all files matching the pattern in each fold directory
fold_directories = [f'interaction_fold_{i}' for i in range(1, 6)]

sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']

# Tick labels for hours 1-23 (hour 0 has no term of its own in the plot function)
hour_labels = [f'Hour {i}' for i in range(1, 24)]

for site in tqdm(sites):  # Loop over each site
    for super_category in super_categories:

        model_dfs = {}
        for fold, fold_dir in enumerate(fold_directories, start=1):
            filepath = f'{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
            if os.path.exists(filepath):
                model_dfs[fold] = pd.read_csv(filepath)

        if not model_dfs:
            continue

        # Only one fold is plotted: the lowest-numbered fold that was actually found.
        # Indexing fold 1 directly fails when only later folds exist on disk.
        first_model_df = model_dfs[min(model_dfs)]

        # Plot the effects of hour stratified by day of the week
        plot_hour_effects_by_day(
            first_model_df,
            hour_labels,
            f'Effect of Hour of Day Stratified by Day (Interaction) - {super_category.capitalize()}',
            site,
            super_category,
            show_whiskers=True  # Turn whiskers on or off
        )


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:40<00:00,  4.05s/it]


In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import r2_score
from tqdm import tqdm

# Function to calculate one R² value per fold
def calculate_r2_across_folds(true_values, predicted_values):
    """
    Score each fold separately with r2_score.

    Parameters:
    true_values (sequence): One collection of observed values per fold.
    predicted_values (sequence): One collection of predictions per fold, paired
        with true_values by position.

    Returns:
    list: The R² of each fold, in fold order.
    """
    return [r2_score(observed, fitted) for observed, fitted in zip(true_values, predicted_values)]

# Function to draw the true-vs-predicted scatter for one site and super category
def plot_true_vs_predicted(site, super_category, true_values, predicted_values, title, output_dir):
    """
    Scatter predicted against true values, one color per fold, and save the figure.

    The plot carries a dashed x=y line, square axes spanning the overall min/max
    of all values, and a text box with the pooled R² plus the min, mean and max
    of the per-fold R² values.

    Parameters:
    site, super_category: Accepted for the caller's convenience; not read in the body.
    true_values (sequence of arrays): Observed values, one array per fold.
    predicted_values (sequence of arrays): Predictions, one array per fold.
    title (str): Figure title; also used (spaces replaced by '_') as the file name.
    output_dir (str): Folder the PNG is written to; created if missing.

    Returns:
    None.
    """
    fig, ax = plt.subplots(figsize=(8, 8))

    # One translucent point cloud per fold
    for fold_number, (observed, fitted) in enumerate(zip(true_values, predicted_values), start=1):
        ax.scatter(observed, fitted, label=f'Fold {fold_number}', alpha=0.3, s=20)

    # Overall value range, shared by the reference line and both axes
    lowest_true = min(min(fold_true) for fold_true in true_values)
    lowest_predicted = min(min(fold_pred) for fold_pred in predicted_values)
    min_val = min(lowest_true, lowest_predicted)
    highest_true = max(max(fold_true) for fold_true in true_values)
    highest_predicted = max(max(fold_pred) for fold_pred in predicted_values)
    max_val = max(highest_true, highest_predicted)

    # x=y reference line
    ax.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2)

    # Pooled R² over all folds, and R² of each fold on its own
    pooled_true = np.concatenate(true_values)
    pooled_predicted = np.concatenate(predicted_values)
    overall_r2 = r2_score(pooled_true, pooled_predicted)
    fold_r2_values = calculate_r2_across_folds(true_values, predicted_values)

    annotation = (f'Overall R²: {overall_r2:.2f}\n'
                  f'Min R²: {min(fold_r2_values):.2f}\n'
                  f'Mean R²: {np.mean(fold_r2_values):.2f}\n'
                  f'Max R²: {max(fold_r2_values):.2f}')
    ax.text(0.05, 0.95, annotation, fontsize=12, verticalalignment='top', transform=ax.transAxes)

    ax.set_xlabel('True Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(title)
    # Legend sits bottom-right so it stays clear of the R² text box
    ax.legend(loc='lower right', bbox_to_anchor=(1, 0))
    ax.grid(True)

    # Same range on both axes
    ax.set_xlim([min_val, max_val])
    ax.set_ylim([min_val, max_val])

    # Write the figure
    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(f'{output_dir}/{title.replace(" ", "_")}.png', bbox_inches='tight')
    plt.close(fig)

# Main function: gather the predictions of all folds and plot them
def process_and_plot(site, super_category, folds_dir, output_dir):
    """
    Load the prediction files of folds 1-5 and draw the true-vs-predicted plots.

    Each existing file
    '{folds_dir}/fold_{fold}/{site}_{super_category}_fold_{fold}_model_predictions.csv'
    contributes its '{super_category}_counts' column as the true values and its
    'predicted_counts_fixed_only' / 'predicted_counts_with_random' columns as the
    two sets of predictions. Folds without a file are skipped; if no file exists
    nothing is plotted.

    Parameters:
    site (str): Site name.
    super_category (str): Super-category name.
    folds_dir (str): Folder that contains the fold_{n} sub-folders.
    output_dir (str): Root output folder; plots go to {output_dir}/{site}/{super_category}.

    Returns:
    None.
    """
    counts_column = f'{super_category}_counts'
    true_values_fixed_only, predicted_values_fixed_only = [], []
    true_values_with_random, predicted_values_with_random = [], []

    for fold in range(1, 6):
        file_path = f'{folds_dir}/fold_{fold}/{site}_{super_category}_fold_{fold}_model_predictions.csv'
        if not os.path.exists(file_path):
            continue

        data = pd.read_csv(file_path)
        observed = data[counts_column].values
        true_values_fixed_only.append(observed)
        predicted_values_fixed_only.append(data['predicted_counts_fixed_only'].values)
        true_values_with_random.append(observed)
        predicted_values_with_random.append(data['predicted_counts_with_random'].values)

    figure_dir = f'{output_dir}/{site}/{super_category}'

    # Fixed effects only
    if true_values_fixed_only and predicted_values_fixed_only:
        plot_true_vs_predicted(
            site, super_category, true_values_fixed_only, predicted_values_fixed_only,
            f'True vs Predicted (Fixed Effects Only) - {super_category.capitalize()}',
            figure_dir,
        )

    # Fixed and random effects
    if true_values_with_random and predicted_values_with_random:
        plot_true_vs_predicted(
            site, super_category, true_values_with_random, predicted_values_with_random,
            f'True vs Predicted (Fixed and Random Effects) - {super_category.capitalize()}',
            figure_dir,
        )

# Example usage:
# Plots are written below output_directory; the fold_{n} folders are looked up inside folds_directory.
output_directory = './results/time_series'
folds_directory = './'

# Assuming you have site names and super categories
sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']

# One process_and_plot call per (site, super category) pair; the progress bar advances once per site.
for site in tqdm(sites):
    for super_category in super_categories:
        process_and_plot(site, super_category, folds_directory, output_directory)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:07<00:00,  6.73s/it]


In [141]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Function to read the central coefficient (estimate) from one model row
def calculate_effects(row):
    """
    Return the 'estimate' entry of a coefficient row.

    Parameters:
    row (mapping): Anything indexable by 'estimate'.

    Returns:
    The stored estimate, unchanged.
    """
    estimate = row['estimate']
    return estimate

# Function to build the site-by-term array of fold-averaged estimates for one time unit
def create_data_array_for_time_unit(model_dfs, time_vars, sites, super_category):
    """
    Average each term's estimate over folds, for every site.

    Parameters:
    model_dfs (dict): Maps a fold identifier to a dict that maps
        '{site}_{super_category}' to a DataFrame with 'term' and 'estimate' columns.
    time_vars (list of str): Term names to look up, in output column order.
    sites (list of str): Site names, in output row order.
    super_category (str): Super category whose models are read.

    Returns:
    numpy.ndarray: One row per site. Column 0 is the reference class (always 1);
    each following column is the mean estimate of the matching term over the folds
    that contain it, or NaN when no fold does.
    """
    rows = []

    for site in sites:  # Rows follow the order of `sites`
        df_key = f'{site}_{super_category}'
        # The frames of this site/super category, in fold order, skipping folds without one
        site_frames = [fold_frames[df_key] for fold_frames in model_dfs.values() if df_key in fold_frames]

        site_row = [1]  # Reference class comes first
        for time_var in time_vars:
            fold_estimates = []
            for frame in site_frames:
                matches = frame[frame['term'] == time_var]
                if not matches.empty:
                    # Only the first matching row of a fold is used
                    fold_estimates.append(calculate_effects(matches.iloc[0]))

            # Mean over the folds that had the term; NaN when none did
            site_row.append(np.mean(fold_estimates) if fold_estimates else np.nan)

        rows.append(site_row)

    return np.array(rows)

# Load model data for all folds
fold_directories = [f'fold_{i}' for i in range(1, 6)]
sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH'] 
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']

# Define the term names looked up for hour, day, week, and year.
# The reference class of each unit has no term of its own; create_data_array_for_time_unit
# prepends a 1 for it.
time_unit_definitions = {
    'hour': [f'hour{i}' for i in range(1, 24)],  # hour1..hour23 (assuming hour 0 is the reference)
    'day': [f'day{i}' for i in range(2, 8)],  # day2..day7 (assuming day1 is reference)
    'week': [f'week{i}' for i in range(2, 54)],  # week2..week53 (Week 1 assumed to be reference)
    'year': [f'year{i}' for i in range(2020, 2025)]  # year2020..year2024 (assuming 2019 is the reference year)
}

# Initialize model_dfs to store data for all folds:
# model_dfs[fold]['{site}_{super_category}'] holds that model's coefficient table.
# Files that do not exist are skipped, so a key may be absent for some folds.
model_dfs = {}
for fold in range(1, 6):
    model_dfs[fold] = {}  # Initialize for each fold
    for site in sites:
        for super_category in super_categories:
            filepath = f'{fold_directories[fold - 1]}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
            if os.path.exists(filepath):
                model_dfs[fold][f'{site}_{super_category}'] = pd.read_csv(filepath)

# Print one array per (super category, time unit): rows follow `sites`, columns are the
# reference class followed by the terms of that time unit.
for super_category in super_categories:
    for time_unit, time_vars in time_unit_definitions.items():
        data_array = create_data_array_for_time_unit(model_dfs, time_vars, sites, super_category)
        
        # Rows of data_array are in the same order as `sites`, so zip pairs each row with its site
        print(f"Object Category: {super_category}, Time Unit: {time_unit}")
        for site, row in zip(sites, data_array):
            print(f"{site}: {row}")
        print("\n" + "-"*40 + "\n")


Object Category: people, Time Unit: hour
AD: [ 1.          0.66082096  0.51701767  0.49436839  0.70393452  1.79726094
 17.03672215 26.76846911 28.61432273 29.8946619  29.71653682 29.75588418
 30.37243913 30.59546821 30.01908858 29.52476696 30.13177833 36.05228677
 29.35537793  5.1418541   3.49046131  2.5906173   1.80013007  1.21189992]
ASH: [ 1.          0.5799782   0.50231429  0.51106897  0.65826062  2.05810785
  8.95341413 17.07086167 19.11918595 17.86241423 17.26169722 15.48731328
 14.44936817 14.42128216 13.69560947 16.2947255  19.57762248 21.54503163
 21.24951716 10.36631776  8.53484641  6.95408124  4.15866291  1.83921342]
EL: [ 1.          0.62104226  0.46103104  0.36315417  0.35973272  0.70945056
  5.74245679  9.20560515  8.35473525  7.8463163   7.84769323  7.99406282
  8.12518009  8.01329117  8.32884369  9.08389769  9.43021237 11.12883884
  9.81500584  4.31610302  3.21357705  2.49860174  1.76157751  1.2943922 ]
JT: [1.         0.66810438 0.45078682 0.34716189 0.36590936 0.59457

In [5]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

def calculate_effects(row):
    """Return the central coefficient of a model term.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one coefficient-table row)
        Must support ``row['estimate']``.

    Returns
    -------
    The value stored under the ``'estimate'`` key, unchanged.
    """
    central_estimate = row['estimate']
    return central_estimate

def normalize_array(array):
    """Min-max normalize each row of a 2-D array to the range [0, 1].

    Each row is scaled independently using its own NaN-ignoring minimum and
    maximum. Any NaN left after scaling is replaced by 0.5. This covers two
    cases:

    * rows whose minimum equals their maximum (the scaling is 0/0), and
    * entries that were already NaN in the input (e.g. missing estimates).

    Parameters
    ----------
    array : array-like, 2-D
        Values to normalize; rows are normalized along axis 1.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``array``.
    """
    values = np.asarray(array, dtype=float)

    row_min = np.nanmin(values, axis=1, keepdims=True)  # per-row minimum, ignoring NaN
    row_max = np.nanmax(values, axis=1, keepdims=True)  # per-row maximum, ignoring NaN

    # A constant row divides 0 by 0. That case is expected and handled just
    # below, so silence the floating-point warning/error locally instead of
    # letting it leak to the caller.
    with np.errstate(divide='ignore', invalid='ignore'):
        normalized_array = (values - row_min) / (row_max - row_min)

    # NaN (constant rows and missing inputs) -> 0.5, the midpoint of the scale
    return np.nan_to_num(normalized_array, nan=0.5)

def create_data_array_for_time_unit(model_dfs, time_vars, sites, super_category):
    """Build the row-normalized (site x time level) array for one super category.

    Parameters
    ----------
    model_dfs : dict
        ``model_dfs[fold]['<site>_<super_category>']`` is a coefficient table
        with at least a ``'term'`` and an ``'estimate'`` column.
    time_vars : list of str
        Term names to look up in the ``'term'`` column, in output column order.
    sites : list of str
        Site names; the output rows follow this order.
    super_category : str
        Super category used to build the table key.

    Returns
    -------
    numpy.ndarray
        Output of ``normalize_array`` applied to an array with one row per
        site and ``1 + len(time_vars)`` columns. The first column is the
        reference class; the rest are the fold-averaged estimates (NaN before
        normalization when no fold provides the term).
    """
    rows = []

    for site in sites:  # keep the caller's site order
        table_key = f'{site}_{super_category}'
        # Tables for this site/category in fold order; folds without one are skipped
        fold_tables = [tables[table_key] for tables in model_dfs.values() if table_key in tables]

        # Reference class is fixed at 1.
        # NOTE(review): presumably the estimates are on a scale whose baseline is 1 - confirm.
        site_row = [1]
        for time_var in time_vars:
            per_fold = []
            for table in fold_tables:
                matches = table[table['term'] == time_var]
                if not matches.empty:
                    # Only the first matching row of a fold is used
                    per_fold.append(calculate_effects(matches.iloc[0]))
            # Mean across the folds that have the term; NaN when none do
            site_row.append(np.mean(per_fold) if per_fold else np.nan)
        rows.append(site_row)

    return normalize_array(np.array(rows))

# Cross-validation fold directories, and the sites / object super categories to process
fold_directories = [f'fold_{i}' for i in range(1, 6)]
sites = [
    'AD', 'ASH', 'EL', 'JT', 'LA',
    'N1W', 'NM', 'TF', 'TMW', 'UGH',
]
super_categories = [
    'people', 'small_vehicles', 'large_vehicles', 'market',
    'two_wheelers', 'refuse', 'animal',
]

# Model term names per time unit. The first level of each unit is not listed
# here; it is treated as the reference class by the array-building code.
time_unit_definitions = {
    unit: [f'{unit}{i}' for i in index_range]
    for unit, index_range in (
        ('hour', range(1, 24)),       # hour1..hour23 (23 terms)
        ('day', range(2, 8)),         # day2..day7 (6 terms)
        ('week', range(2, 53)),       # week2..week52 (51 terms)
        ('year', range(2020, 2025)),  # year2020..year2024 (5 terms)
    )
}

# Collect the coefficient tables of all folds.
# model_dfs[fold] maps '<site>_<super_category>' to the DataFrame read from that fold's CSV;
# combinations whose file does not exist are left out.
model_dfs = {}
for fold in range(1, 6):
    tables_for_fold = model_dfs.setdefault(fold, {})
    for site in sites:
        for super_category in super_categories:
            filepath = f'{fold_directories[fold - 1]}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
            if os.path.exists(filepath):
                tables_for_fold[f'{site}_{super_category}'] = pd.read_csv(filepath)

# Show the normalized array for each (category, time unit) pair.
# Rows come back in the order of `sites`, so each printed line is labelled with its site.
for super_category in super_categories:
    for time_unit in time_unit_definitions:
        time_vars = time_unit_definitions[time_unit]
        data_array = create_data_array_for_time_unit(model_dfs, time_vars, sites, super_category)

        print(f"Object Category: {super_category}, Time Unit: {time_unit}")
        for site, row in zip(sites, data_array):
            print(f"{site}: {row}")
        print("\n" + "-"*40 + "\n")


Object Category: people, Time Unit: hour
AD: [1.42199442e-02 4.68116767e-03 6.36968919e-04 0.00000000e+00
 5.89365573e-03 3.66414180e-02 4.65222783e-01 7.38909979e-01
 7.90821162e-01 8.26828309e-01 8.21818874e-01 8.22925445e-01
 8.40264900e-01 8.46537176e-01 8.30327576e-01 8.16425705e-01
 8.33496765e-01 1.00000000e+00 8.11661955e-01 1.30701850e-01
 8.42595141e-02 5.89530831e-02 3.67221069e-02 2.01792334e-02]
ASH: [2.36512093e-02 3.69077386e-03 0.00000000e+00 4.16043220e-04
 7.41094065e-03 7.39350120e-02 4.01616374e-01 7.87376797e-01
 8.84718041e-01 8.24993258e-01 7.96445757e-01 7.12122810e-01
 6.62797188e-01 6.61462474e-01 6.26976781e-01 7.50492959e-01
 9.06504036e-01 1.00000000e+00 9.85956449e-01 4.68760917e-01
 3.81725040e-01 3.06603318e-01 1.73758387e-01 6.35326280e-02]
EL: [5.94540786e-02 2.42647380e-02 9.40638171e-03 3.17709643e-04
 0.00000000e+00 3.24741753e-02 4.99830163e-01 8.21411947e-01
 7.42401685e-01 6.95190808e-01 6.95318667e-01 7.08910286e-01
 7.21085602e-01 7.10695796e-0

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from scipy.stats import spearmanr

# Directory that receives the saved heatmap figures.
# exist_ok=True makes this idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
output_dir = "results/time_series_by_time"
os.makedirs(output_dir, exist_ok=True)

# Manual x-axis tick labels for each time unit (one label per heatmap column,
# reference level included as the first label)
x_labels_definitions = {
    'hour': [f'{i}' for i in range(0, 24)],  # '0'..'23'; '0' is the reference hour
    'day': ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],  # Days of the week starting with Monday
    'week': [f'{i}' for i in range(1, 53)],  # '1'..'52'
    'year': [f'{i}' for i in range(2019, 2025)]  # '2019'..'2024'
}

def check_spearman_trend(row, n_years=5, alpha=0.05):
    """Classify the monotonic trend over the first ``n_years`` values of a row.

    The first ``n_years`` entries of ``row`` are correlated (Spearman rank
    correlation) against their positions ``0, 1, ..., n_years - 1``.

    Parameters
    ----------
    row : array-like
        Values ordered in time; only the first ``n_years`` entries are used.
    n_years : int, optional
        Number of leading values included in the test (default 5, matching the
        original hard-coded window).
    alpha : float, optional
        Significance threshold applied to the p-value (default 0.05).

    Returns
    -------
    'up', 'down' or None
        ``'up'`` / ``'down'`` when the p-value is below ``alpha`` and the
        correlation is positive / negative; ``None`` otherwise, including
        when the row has fewer than ``n_years`` values or the window is
        constant (the correlation is undefined in that case).
    """
    window = np.asarray(row, dtype=float)[:n_years]

    # Not enough points for the requested window: report "no trend" instead of
    # letting spearmanr fail on mismatched input lengths.
    if window.size < max(n_years, 2):
        return None

    # A constant window has undefined rank correlation; skip the call so it
    # neither warns nor returns NaN.
    if np.all(window == window[0]):
        return None

    corr, p_value = spearmanr(np.arange(window.size), window)

    # A NaN p-value (e.g. NaN inputs) compares False and falls through to None
    if p_value < alpha:
        if corr > 0:
            return 'up'  # Positive correlation
        if corr < 0:
            return 'down'  # Negative correlation
    return None

def plot_heatmaps_for_time_unit(data_arrays, time_unit, sites, super_categories):
    """Draw one heatmap per super category and save the figure as a PNG.

    Parameters
    ----------
    data_arrays : dict
        Maps each super category name to a 2-D array (rows = sites,
        columns = time levels) with values on a 0-1 scale.
    time_unit : str
        Key into the module-level ``x_labels_definitions``; also used in the
        figure title and the output file name.
    sites : list of str
        Row (y-axis) labels.
    super_categories : list of str
        Categories to plot, one subplot each, in this order.

    Side effects
    ------------
    Writes ``<output_dir>/<time_unit>_heatmaps.png`` (``output_dir`` is a
    module-level name) and closes the figure afterwards.
    """
    x_labels = x_labels_definitions[time_unit]
    num_sites = len(sites)
    num_time_vars = len(x_labels)
    num_panels = len(super_categories)

    # Three panels per row; the number of rows follows the number of
    # categories (7 categories -> 3 rows, the original fixed 3x3 layout).
    n_cols = 3
    n_rows = max(1, -(-num_panels // n_cols))  # ceiling division

    # Figure size scales with the number of time levels and sites
    fig_width = 1.5 * num_time_vars  # 1.5 units of width per time level
    fig_height = 0.6 * num_sites * n_rows  # 0.6 units of height per site, per subplot row

    # squeeze=False always returns a 2-D axes array, so flatten() is safe
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(fig_width, fig_height), squeeze=False)
    axes = axes.flatten()

    for idx, super_category in enumerate(super_categories):
        ax = axes[idx]
        sns.heatmap(
            data_arrays[super_category], ax=ax, cmap="RdYlGn", vmin=0, vmax=1,  # fixed 0-1 colour range
            cbar=False,
            xticklabels=x_labels, yticklabels=sites,
            square=False
        )
        ax.set_title(super_category.replace('_', ' ').capitalize(), fontsize=14)
        ax.set_xlabel(time_unit.capitalize(), fontsize=12)
        ax.set_ylabel("Sites", fontsize=12)

        # For yearly data, flag rows with a significant monotonic trend
        if time_unit == 'year':
            for row_idx, row in enumerate(data_arrays[super_category]):
                trend = check_spearman_trend(row)
                # Marker is placed just past the last column, centred on the row
                if trend == 'up':
                    ax.text(len(row) + 0.5, row_idx + 0.5, '▲', color='green', fontsize=14, va='center', ha='center')
                elif trend == 'down':
                    ax.text(len(row) + 0.5, row_idx + 0.5, '▼', color='red', fontsize=14, va='center', ha='center')

    # Hide whatever grid cells are left without a category
    for ax in axes[num_panels:]:
        ax.axis('off')

    fig.suptitle(f"Heatmaps for {time_unit.capitalize()} across Object Categories", fontsize=18, y=1.02)
    fig.tight_layout(pad=1.0)

    # Save, then close this specific figure so repeated calls do not accumulate open figures
    output_path = os.path.join(output_dir, f"{time_unit}_heatmaps.png")
    fig.savefig(output_path, bbox_inches="tight")
    plt.close(fig)

# Build the normalized array of every super category for a time unit, then
# render that time unit's heatmap figure.
# Only 'year' is enabled here; the full set would be ['hour', 'day', 'week', 'year'].
for time_unit in ['year']:
    time_vars = time_unit_definitions[time_unit]

    # super category -> normalized (site x time level) array
    data_arrays = {}
    for super_category in super_categories:
        data_arrays[super_category] = create_data_array_for_time_unit(
            model_dfs, time_vars, sites, super_category
        )

    # One figure per time unit, with the manual x-axis labels
    plot_heatmaps_for_time_unit(data_arrays, time_unit, sites, super_categories)
