In [1]:
!pip install statsmodels
!pip install pymannkendall

Collecting pymannkendall
  Downloading pymannkendall-1.4.3-py3-none-any.whl.metadata (14 kB)
Downloading pymannkendall-1.4.3-py3-none-any.whl (12 kB)
Installing collected packages: pymannkendall
Successfully installed pymannkendall-1.4.3


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# Create array of category names as they appear in the detections data. See paper for details of each category.
categories = ['car', 'person', 'trotro', 'stall', 'truck', 'stove', 'motorcycle', 'vendor', 'lorry', 'umbrella', 'bus', 'trash', 'taxi', 'van', 'debris', 'loudspeaker', 'bowl', 'food', 'animal', 'bicycle']

# Column names in the data frame for the number of counts of each category type in an image.
count_cols = [cat+'_counts' for cat in categories]

super_count_cols = ['people'+'_counts', 'small_vehicles'+'_counts', 'two_wheelers'+'_counts', 'large_vehicles'+'_counts', 'refuse'+'_counts', 'market'+'_counts', 'animal'+'_counts']

all_count_cols = count_cols + super_count_cols

vehicle_categories = ['car', 'trotro', 'truck', 'motorcycle', 'lorry', 'bus', 'taxi', 'van', 'bicycle']

# Define super categories
super_categories = {
    'people': ['person', 'vendor'],
    'small_vehicles': ['car', 'taxi', 'truck'],
    'two_wheelers': ['bicycle', 'motorcycle'],
    'large_vehicles': ['trotro', 'van', 'lorry', 'bus'],
    'refuse': ['trash', 'debris'],
    'market': ['umbrella', 'stall', 'bowl', 'food'],
    'animal': ['animal']
}

In [39]:
# # Load the model coefficients from the CSV file
# site = 'ASH'
# model_df = pd.read_csv(f'./{site}_model_coefficients.csv')



['inflate_const', 'inflate_hour_1', 'inflate_hour_2', 'inflate_hour_3', 'inflate_hour_4', 'inflate_hour_5', 'inflate_hour_6', 'inflate_hour_7', 'inflate_hour_8', 'inflate_hour_9', 'inflate_hour_10', 'inflate_hour_11', 'inflate_hour_12', 'inflate_hour_13', 'inflate_hour_14', 'inflate_hour_15', 'inflate_hour_16', 'inflate_hour_17', 'inflate_hour_18', 'inflate_hour_19', 'inflate_hour_20', 'inflate_hour_21', 'inflate_hour_22', 'inflate_hour_23', 'inflate_day_2', 'inflate_day_3', 'inflate_day_4', 'inflate_day_5', 'inflate_day_6', 'inflate_day_7', 'inflate_week_2', 'inflate_week_3', 'inflate_week_4', 'inflate_week_5', 'inflate_week_6', 'inflate_week_7', 'inflate_week_8', 'inflate_week_9', 'inflate_week_10', 'inflate_week_11', 'inflate_week_12', 'inflate_week_13', 'inflate_week_14', 'inflate_week_15', 'inflate_week_16', 'inflate_week_17', 'inflate_week_18', 'inflate_week_19', 'inflate_week_20', 'inflate_week_21', 'inflate_week_22', 'inflate_week_23', 'inflate_week_24', 'inflate_week_25', 'inf

In [5]:
import pandas as pd
import os

# Define the path to your fold directories and other relevant variables
fold_directories = [f'fold_{i}' for i in range(1, 6)]
sites = ['AD']  # Replace with your actual site names
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']  # Replace with your actual categories

# Function to diagnose the presence of terms in each fold
def diagnose_fold_data(site, super_category):
    for fold in range(1, 6):
        fold_dir = f'fold_{fold}'
        filepath = f'{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
        
        if os.path.exists(filepath):
            print(f"Checking {filepath}...")
            model_df = pd.read_csv(filepath)
            
            if model_df.empty:
                print(f"Warning: The file {filepath} is empty.")
                continue
            
            # Check for the presence of specific terms
            hour_vars = [var for var in model_df['term'] if 'hour' in var]
            day_vars = [var for var in model_df['term'] if 'day' in var]
            year_vars = [var for var in model_df['term'] if 'year' in var]
            week_vars = [var for var in model_df['term'] if 'week' in var]
            
            print(f"Fold {fold} Summary for {super_category} at site {site}:")
            print(f"- Number of 'hour' terms: {len(hour_vars)}")
            print(f"- Number of 'day' terms: {len(day_vars)}")
            print(f"- Number of 'year' terms: {len(year_vars)}")
            print(f"- Number of 'week' terms: {len(week_vars)}")
            
            # Further diagnostics: Print specific issues
            if len(hour_vars) == 0:
                print(f"  Warning: No 'hour' terms found in fold {fold}.")
            if len(day_vars) == 0:
                print(f"  Warning: No 'day' terms found in fold {fold}.")
            if len(year_vars) == 0:
                print(f"  Warning: No 'year' terms found in fold {fold}.")
            if len(week_vars) == 0:
                print(f"  Warning: No 'week' terms found in fold {fold}.")
                
        else:
            print(f"Error: The file {filepath} does not exist.")

# Loop over each site and super category to perform the diagnosis
for site in sites:
    for super_category in super_categories:
        print(f"\nStarting diagnosis for site: {site}, super category: {super_category}")
        diagnose_fold_data(site, super_category)
        print("--------------------------------------------------")



Starting diagnosis for site: AD, super category: people
Checking fold_1/AD_people_fold_1_conditional_model_coefficients.csv...
Fold 1 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_2/AD_people_fold_2_conditional_model_coefficients.csv...
Fold 2 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_3/AD_people_fold_3_conditional_model_coefficients.csv...
Fold 3 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_4/AD_people_fold_4_conditional_model_coefficients.csv...
Fold 4 Summary for people at site AD:
- Number of 'hour' terms: 23
- Number of 'day' terms: 6
- Number of 'year' terms: 5
- Number of 'week' terms: 52
Checking fold_5/AD_people_fold_5_conditional_model_coef

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm

# Function to calculate the confidence intervals and coefficients
def calculate_effects_and_ci(row):
    coef = row['estimate']
    lower = row['conf.low']
    upper = row['conf.high']
    return coef, lower, upper

# Function to combine uncertainties in quadrature
def combine_uncertainties_in_quadrature(uncertainties):
    """
    Combine a list of uncertainties using quadrature.
    
    Parameters:
    uncertainties (list or array-like): List of uncertainties (standard deviations).
    
    Returns:
    float: The combined uncertainty.
    """
    return np.sqrt(np.sum(np.square(uncertainties)))/len(uncertainties)

# Function to plot effects for the site with all folds
def plot_effects_all_folds(model_dfs, x_labels, title, ref_class_label, site, super_category):
    plt.figure(figsize=(12, 6))

    for i, (fold, model_df) in enumerate(model_dfs.items()):
        estimates, lower_bounds, upper_bounds = zip(*model_df.apply(calculate_effects_and_ci, axis=1))
        
        x_labels_mod = [ref_class_label] + x_labels
        
        estimates = [1] + list(estimates)
        lower_bounds = [1] + list(lower_bounds)
        upper_bounds = [1] + list(upper_bounds)
        
        # Plotting the effects with error bars for each fold
        plt.errorbar(
            range(len(x_labels_mod)), 
            estimates, 
            yerr=[np.array(estimates) - np.array(lower_bounds), np.array(upper_bounds) - np.array(estimates)], 
            fmt='o', 
            capsize=5,
            label=f'Fold {fold}'
        )
    
    # Adding a thicker black line at y=1 to represent the reference level
    plt.axhline(y=1, color='black', linewidth=2, linestyle='--')
    
    plt.xticks(ticks=range(len(x_labels_mod)), labels=x_labels_mod, rotation=45)
    plt.xlabel('Categories')
    plt.ylabel('Multiplicative Effect on Counts')
    plt.title(title)
    plt.grid(True)
    plt.legend()
    
    # Create directory if it doesn't exist
    output_dir = f'./results/time_series/{site}/{super_category}/'
    os.makedirs(output_dir, exist_ok=True)
    
    # Save the figure
    plt.savefig(f'{output_dir}/{title.replace(" ", "_")}_all_folds.png', bbox_inches='tight')
    plt.close()

# Function to aggregate effects across folds
def plot_aggregate_effects(model_dfs, x_labels, title, ref_class_label, site, super_category):
    plt.figure(figsize=(12, 6))
    
    estimates_list = []
    lower_uncertainties = []
    upper_uncertainties = []
    
    for fold, model_df in model_dfs.items():
        estimates, lower_bounds, upper_bounds = zip(*model_df.apply(calculate_effects_and_ci, axis=1))
        estimates_list.append(estimates)
        
        # Calculate the uncertainties as the magnitude of deviation from the mean estimate
        lower_uncertainties.append(np.array(estimates) - np.array(lower_bounds))
        upper_uncertainties.append(np.array(upper_bounds) - np.array(estimates))
    
    # Convert lists to numpy arrays for easier manipulation
    estimates_arr = np.array(estimates_list)
    lower_uncertainties_arr = np.array(lower_uncertainties)
    upper_uncertainties_arr = np.array(upper_uncertainties)
    
    # Calculate mean estimates
    mean_estimates = np.mean(estimates_arr, axis=0)
    
    # Calculate combined uncertainties using quadrature
    combined_lower_uncertainties = [combine_uncertainties_in_quadrature([uncertainty[i] for uncertainty in lower_uncertainties_arr]) for i in range(len(mean_estimates))]
    combined_upper_uncertainties = [combine_uncertainties_in_quadrature([uncertainty[i] for uncertainty in upper_uncertainties_arr]) for i in range(len(mean_estimates))]
    
    x_labels_mod = [ref_class_label] + x_labels
    mean_estimates = [1] + list(mean_estimates)
    combined_lower_uncertainties = [0] + list(combined_lower_uncertainties)
    combined_upper_uncertainties = [0] + list(combined_upper_uncertainties)
    
    # Plotting the aggregated effects with combined uncertainties
    plt.errorbar(
        range(len(x_labels_mod)), 
        mean_estimates, 
        yerr=[combined_lower_uncertainties, combined_upper_uncertainties], 
        fmt='o', 
        capsize=5,
        label='Aggregated across folds'
    )
    
    # Adding a thicker black line at y=1 to represent the reference level
    plt.axhline(y=1, color='black', linewidth=2, linestyle='--')
    
    plt.xticks(ticks=range(len(x_labels_mod)), labels=x_labels_mod, rotation=45)
    plt.xlabel('Categories')
    plt.ylabel('Multiplicative Effect on Counts')
    plt.title(title)
    plt.grid(True)
    plt.legend()
    
    # Create directory if it doesn't exist
    output_dir = f'./results/time_series/{site}/{super_category}/'
    os.makedirs(output_dir, exist_ok=True)
    
    # Save the figure
    plt.savefig(f'{output_dir}/{title.replace(" ", "_")}_aggregated.png', bbox_inches='tight')
    plt.close()

# Loop over all files matching the pattern in each fold directory
fold_directories = [f'fold_{i}' for i in range(1, 6)]

sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH'] 
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal'] 

for site in tqdm(sites):  # Loop over each site
    for super_category in super_categories:
        
        model_dfs = {}
        for fold in range(1, 6):
            fold_dir = f'fold_{fold}'
            filepath = f'{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
            if os.path.exists(filepath):
                model_dfs[fold] = pd.read_csv(filepath)
        
        if model_dfs:
            # Variables to plot
            first_model_df = model_dfs[1]  # Using the first fold to determine available terms
            hour_vars = [var for var in first_model_df['term'] if 'hour' in var]
            day_vars = [var for var in first_model_df['term'] if 'day' in var]
            year_vars = [var for var in first_model_df['term'] if 'year' in var]
            week_vars = [var for var in first_model_df['term'] if 'week' in var]
            
            # X-axis labels
            hour_labels = [f'Hour {i}' for i in range(1, 24)]
            day_labels = ['Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']  # Assuming 'Mon' is the reference category
            year_labels = ['2020', '2021', '2022', '2023', '2024']  # Assuming '2019' is the reference category
            week_labels = [f'Week {i}' for i in range(2, 54)]  # Assuming 'Week 1' is the reference category
            
            # Ensure the number of labels matches the number of data points
            hour_labels_filtered = [label for var, label in zip(hour_vars, hour_labels) if var in first_model_df['term'].values]
            day_labels_filtered = [label for var, label in zip(day_vars, day_labels) if var in first_model_df['term'].values]
            year_labels_filtered = [label for var, label in zip(year_vars, year_labels) if var in first_model_df['term'].values]
            week_labels_filtered = [label for var, label in zip(week_vars, week_labels) if var in first_model_df['term'].values]
            
            # Plot each category for all folds
            if hour_vars:
                plot_effects_all_folds(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(hour_vars)] for fold in model_dfs},
                    hour_labels_filtered,
                    f'Effect of Hour of Day on {super_category.capitalize()} Counts',
                    'Hour 0',
                    site,
                    super_category
                )
            
            if day_vars:
                plot_effects_all_folds(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(day_vars)] for fold in model_dfs},
                    day_labels_filtered,
                    f'Effect of Day of Week on {super_category.capitalize()} Counts',
                    'Mon',
                    site,
                    super_category
                )
            
            if year_vars:
                plot_effects_all_folds(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(year_vars)] for fold in model_dfs},
                    year_labels_filtered,
                    f'Effect of Year on {super_category.capitalize()} Counts',
                    '2019',
                    site,
                    super_category
                )
            
            if week_vars:
                plot_effects_all_folds(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(week_vars)] for fold in model_dfs},
                    week_labels_filtered,
                    f'Effect of Week on {super_category.capitalize()} Counts',
                    'Week 1',
                    site,
                    super_category
                )

            # Also plot the aggregated effects across folds
            if hour_vars:
                plot_aggregate_effects(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(hour_vars)] for fold in model_dfs},
                    hour_labels_filtered,
                    f'Aggregated Effect of Hour of Day on {super_category.capitalize()} Counts',
                    'Hour 0',
                    site,
                    super_category
                )
            
            if day_vars:
                plot_aggregate_effects(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(day_vars)] for fold in model_dfs},
                    day_labels_filtered,
                    f'Aggregated Effect of Day of Week on {super_category.capitalize()} Counts',
                    'Mon',
                    site,
                    super_category
                )
            
            if year_vars:
                plot_aggregate_effects(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(year_vars)] for fold in model_dfs},
                    year_labels_filtered,
                    f'Aggregated Effect of Year on {super_category.capitalize()} Counts',
                    '2019',
                    site,
                    super_category
                )
            
            if week_vars:
                plot_aggregate_effects(
                    {fold: model_dfs[fold][model_dfs[fold]['term'].isin(week_vars)] for fold in model_dfs},
                    week_labels_filtered,
                    f'Aggregated Effect of Week on {super_category.capitalize()} Counts',
                    'Week 1',
                    site,
                    super_category
                )


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:23<00:00, 20.38s/it]


In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm

# Function to calculate the confidence intervals and coefficients
def calculate_effects_and_ci(row):
    coef = row['estimate']
    lower = row['conf.low']
    upper = row['conf.high']
    return coef, lower, upper

# Function to plot hour effects stratified by day of the week
def plot_hour_effects_by_day(model_df, hour_labels, title, site, super_category, show_whiskers=True):
    plt.figure(figsize=(12, 6))

    # Days of the week mapping (assuming 1 = Monday, 2 = Tuesday, etc.)
    days_of_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    all_estimates = []  # List to keep track of all estimates across all days

    for day_idx, day in enumerate(days_of_week, start=1):
        estimates = []
        lower_bounds = []
        upper_bounds = []
        hour_indices = []

        if day == 'Mon':
            # For Monday, use the main effects
            for hour in range(1, 24):
                term = f'hour{hour}'
                row = model_df[model_df['term'] == term]
                if not row.empty:
                    coef, lower, upper = calculate_effects_and_ci(row.iloc[0])
                    estimates.append(coef)
                    lower_bounds.append(lower)
                    upper_bounds.append(upper)
                    hour_indices.append(hour-1)  # Store hour index for connecting lines
                else:
                    # Handle missing data
                    estimates.append(np.nan)
                    lower_bounds.append(np.nan)
                    upper_bounds.append(np.nan)
                    hour_indices.append(hour-1)  # Maintain hour index for consistency
        else:
            # For other days, combine main effects with interaction effects
            for hour in range(1, 24):
                main_term = f'hour{hour}'
                inter_term = f'hour{hour}:day{day_idx}'
                
                main_row = model_df[model_df['term'] == main_term]
                inter_row = model_df[model_df['term'] == inter_term]
                
                if not main_row.empty and not inter_row.empty:
                    main_coef, main_lower, main_upper = calculate_effects_and_ci(main_row.iloc[0])
                    inter_coef, inter_lower, inter_upper = calculate_effects_and_ci(inter_row.iloc[0])
                    
                    # Combine estimates in log space
                    combined_estimate = np.exp(np.log(main_coef) + np.log(inter_coef))
                    combined_lower = np.exp(np.log(main_lower) + np.log(inter_lower))
                    combined_upper = np.exp(np.log(main_upper) + np.log(inter_upper))
                    
                    estimates.append(combined_estimate)
                    lower_bounds.append(combined_lower)
                    upper_bounds.append(combined_upper)
                    hour_indices.append(hour-1)  # Store hour index for connecting lines
                else:
                    # Handle missing data
                    estimates.append(np.nan)
                    lower_bounds.append(np.nan)
                    upper_bounds.append(np.nan)
                    hour_indices.append(hour-1)  # Maintain hour index for consistency

        # Convert lists to numpy arrays and remove any NaN values before plotting
        estimates = np.array(estimates)
        lower_bounds = np.array(lower_bounds)
        upper_bounds = np.array(upper_bounds)
        hour_indices = np.array(hour_indices)  # Convert hour indices to numpy array
        
        valid_indices = ~np.isnan(estimates)
        
        estimates = estimates[valid_indices]
        lower_bounds = lower_bounds[valid_indices]
        upper_bounds = upper_bounds[valid_indices]
        valid_hour_indices = hour_indices[valid_indices]  # Filter hour indices to match valid estimates
        valid_hour_labels = np.array(hour_labels)[valid_indices]

        # Store all estimates for y-axis limit calculation
        all_estimates.extend(estimates)

        if len(estimates) > 0:
            if show_whiskers:
                plt.errorbar(
                    valid_hour_indices, 
                    estimates, 
                    yerr=[estimates - lower_bounds, upper_bounds - estimates], 
                    fmt='none', 
                    capsize=5,
                    alpha=0.125,  # Faint whiskers
                    elinewidth=1.5,
                    ecolor='blue',  # Default color for whiskers
                    errorevery=1,
                    capthick=1.5
                )
            
            # Plot the central points with full visibility
            plt.errorbar(
                valid_hour_indices, 
                estimates, 
                yerr=None,  # No whiskers here
                fmt='o', 
                capsize=5,
                label=f'Day: {day}',
                alpha=1.0,  # Fully visible central points
                markersize=12,
                elinewidth=1.5,
                errorevery=1,
                capthick=1.5
            )
            
            # Plot lines connecting points
            plt.plot(
                valid_hour_indices, 
                estimates, 
                linestyle='-', 
                color=plt.gca().lines[-1].get_color(),  # Use the same color as the points
                alpha=0.75,  # Slightly transparent line
                linewidth=2
            )

    # Adding a thicker black line at y=1 to represent the reference level
    plt.axhline(y=1, color='black', linewidth=2, linestyle='--')
    
    # Adjust y-axis limit to focus on the central tendency
    if all_estimates:
        max_estimate = max(all_estimates)
        plt.ylim(0, max_estimate * 1.1)  # Set the upper limit to 1.1 times the max estimate

    plt.xticks(ticks=range(len(valid_hour_labels)), labels=valid_hour_labels, rotation=45)
    plt.xlabel('Hour of Day')
    plt.ylabel('Multiplicative Effect on Counts')
    plt.title(title)
    plt.grid(True)
    plt.legend()
    
    # Create directory if it doesn't exist
    output_dir = f'./results/time_series/{site}/{super_category}/'
    os.makedirs(output_dir, exist_ok=True)
    
    # Save the figure
    plt.savefig(f'{output_dir}/{title.replace(" ", "_")}_hour_by_day.png', bbox_inches='tight')
    plt.close()

# Loop over all files matching the pattern in each fold directory
fold_directories = [f'interaction_fold_{i}' for i in range(1, 6)]

sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']

for site in tqdm(sites):  # Loop over each site
    for super_category in super_categories:
        
        model_dfs = {}
        for fold in range(1, 6):
            fold_dir = f'interaction_fold_{fold}'
            filepath = f'{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
            if os.path.exists(filepath):
                model_dfs[fold] = pd.read_csv(filepath)
        
        if model_dfs:
            # Using the first fold to determine available terms
            first_model_df = model_dfs[1]  
            hour_labels = [f'Hour {i}' for i in range(1, 24)]
            
            # Plot the effects of hour stratified by day of the week
            plot_hour_effects_by_day(
                first_model_df, 
                hour_labels,
                f'Effect of Hour of Day Stratified by Day (Interaction) - {super_category.capitalize()}',
                site,
                super_category,
                show_whiskers=True  # Turn whiskers on or off
            )


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:40<00:00,  4.05s/it]


In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.metrics import r2_score
from tqdm import tqdm

# Function to calculate R² values across folds
def calculate_r2_across_folds(true_values, predicted_values):
    r2_values = []
    for true, predicted in zip(true_values, predicted_values):
        r2_values.append(r2_score(true, predicted))
    return r2_values

# Function to plot true vs predicted values for a site and super category
def plot_true_vs_predicted(site, super_category, true_values, predicted_values, title, output_dir):
    plt.figure(figsize=(8, 8))
    
    # Plotting each fold with different colors and lower alpha for transparency
    for i, (true, predicted) in enumerate(zip(true_values, predicted_values), start=1):
        plt.scatter(true, predicted, label=f'Fold {i}', alpha=0.3, s=20)  # Lower alpha for better visibility
    
    # Plotting the x=y reference line
    min_val = min(min([min(t) for t in true_values]), min([min(p) for p in predicted_values]))
    max_val = max(max([max(t) for t in true_values]), max([max(p) for p in predicted_values]))
    plt.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2)
    
    # Calculating R² values
    all_true = np.concatenate(true_values)
    all_predicted = np.concatenate(predicted_values)
    overall_r2 = r2_score(all_true, all_predicted)
    fold_r2_values = calculate_r2_across_folds(true_values, predicted_values)
    
    # Annotating the plot with R² values
    plt.text(0.05, 0.95, f'Overall R²: {overall_r2:.2f}\n'
                         f'Min R²: {min(fold_r2_values):.2f}\n'
                         f'Mean R²: {np.mean(fold_r2_values):.2f}\n'
                         f'Max R²: {max(fold_r2_values):.2f}', 
             fontsize=12, verticalalignment='top', transform=plt.gca().transAxes)
    
    plt.xlabel('True Values')
    plt.ylabel('Predicted Values')
    plt.title(title)
    plt.legend(loc='lower right', bbox_to_anchor=(1, 0))  # Adjust legend location to avoid overlap with R² text
    plt.grid(True)
    
    # Ensuring the plot is on the same scale
    plt.xlim([min_val, max_val])
    plt.ylim([min_val, max_val])
    
    # Save the figure
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(f'{output_dir}/{title.replace(" ", "_")}.png', bbox_inches='tight')
    plt.close()

# Main function to process all folds and plot results
def process_and_plot(site, super_category, folds_dir, output_dir):
    true_values_fixed_only = []
    predicted_values_fixed_only = []
    true_values_with_random = []
    predicted_values_with_random = []
    
    for fold in range(1, 6):
        file_path = f'{folds_dir}/fold_{fold}/{site}_{super_category}_fold_{fold}_model_predictions.csv'
        if os.path.exists(file_path):
            data = pd.read_csv(file_path)
            true_values_fixed_only.append(data[f'{super_category}_counts'].values)
            predicted_values_fixed_only.append(data['predicted_counts_fixed_only'].values)
            true_values_with_random.append(data[f'{super_category}_counts'].values)
            predicted_values_with_random.append(data['predicted_counts_with_random'].values)
    
    if true_values_fixed_only and predicted_values_fixed_only:
        # Plot for fixed-only effects
        plot_true_vs_predicted(site, super_category, true_values_fixed_only, predicted_values_fixed_only, 
                               f'True vs Predicted (Fixed Effects Only) - {super_category.capitalize()}', 
                               f'{output_dir}/{site}/{super_category}')
        
    if true_values_with_random and predicted_values_with_random:
        # Plot for fixed and random effects
        plot_true_vs_predicted(site, super_category, true_values_with_random, predicted_values_with_random, 
                               f'True vs Predicted (Fixed and Random Effects) - {super_category.capitalize()}', 
                               f'{output_dir}/{site}/{super_category}')

# Example usage:
output_directory = './results/time_series'
folds_directory = './'

# Assuming you have site names and super categories
sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']

for site in tqdm(sites):
    for super_category in super_categories:
        process_and_plot(site, super_category, folds_directory, output_directory)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:07<00:00,  6.73s/it]


In [12]:
# import pandas as pd
# import numpy as np
# import os
# import pymannkendall as mk
# import matplotlib.pyplot as plt
# from tqdm import tqdm

# # Adjusted function to generate the correct x-tick positions
# def generate_monthly_ticks(weekly_counts_df):
#     """
#     Generate the x-tick positions and labels for the first week of each month for each year.
    
#     Parameters:
#     weekly_counts_df (DataFrame): DataFrame containing 'year', 'week', and aggregated counts.
    
#     Returns:
#     tick_positions (list): List of x-axis positions for the first week of each month.
#     tick_labels (list): List of labels for the x-axis (e.g., 'Jan 2019', 'Feb 2019', etc.).
#     """
#     # Create mapping of approximate week numbers for the first week of each month
#     month_starts = {
#         'Jan': 1, 'Feb': 5, 'Mar': 9, 'Apr': 14, 'May': 18, 'Jun': 22,
#         'Jul': 27, 'Aug': 31, 'Sep': 36, 'Oct': 40, 'Nov': 45, 'Dec': 49
#     }
    
#     tick_positions = []
#     tick_labels = []
    
#     # Get the unique years in the data
#     years = sorted(weekly_counts_df['year'].unique())
    
#     for year in years:
#         for month, week in month_starts.items():
#             # Find the position of the first week of the month in the dataframe
#             pos = weekly_counts_df[(weekly_counts_df['year'] == year) & (weekly_counts_df['week'] >= week)].index.min()
            
#             if pd.notna(pos):
#                 tick_positions.append(pos)
#                 tick_labels.append(f'{month} {year}')
    
#     return tick_positions, tick_labels

# # Function to perform the Mann-Kendall test on weekly aggregated counts
# def perform_mk_test_on_weekly_counts(weekly_counts):
#     """
#     Perform the Mann-Kendall test on weekly counts.
    
#     Parameters:
#     weekly_counts (list): A list of weekly counts (aggregated by week and year).
    
#     Returns:
#     result: The Mann-Kendall test result.
#     """
#     return mk.original_test(weekly_counts)

# # Function to plot significant Mann-Kendall trends with monthly x-ticks
# def plot_significant_mk_trends(weeks, weekly_counts, result, title, site, super_category, weekly_counts_df, output_dir):
#     """
#     Plot the significant Mann-Kendall trends for weekly counts with monthly x-ticks.
    
#     Parameters:
#     weeks (list): List of week and year combinations.
#     weekly_counts (list): Weekly counts of the object category.
#     result (object): Result of the Mann-Kendall test.
#     title (str): Title of the plot.
#     site (str): Site identifier.
#     super_category (str): Super category identifier.
#     weekly_counts_df (DataFrame): DataFrame containing 'year', 'week', and aggregated counts for generating x-ticks.
#     output_dir (str): Directory to save the plot.
#     """
#     plt.figure(figsize=(12, 6))
    
#     # Plot the weekly counts
#     plt.plot(weeks, weekly_counts, marker='o', label='Weekly Counts')
    
#     # Add the trend line if there's a significant trend
#     slope = result.slope
#     intercept = np.mean(weekly_counts) - slope * np.mean(range(len(weekly_counts)))
#     trend_line = intercept + slope * np.arange(len(weekly_counts))
#     plt.plot(weeks, trend_line, 'r-', label=f'Trend Line (slope={slope:.4f}, p={result.p:.4f})', linewidth=2)
    
#     # Generate monthly ticks
#     tick_positions, tick_labels = generate_monthly_ticks(weekly_counts_df)
    
#     # Set the x-ticks to monthly labels
#     plt.xticks(ticks=tick_positions, labels=tick_labels, rotation=45)
    
#     plt.xlabel('Month-Year')
#     plt.ylabel('Predicted Counts (Fixed Only)')
#     plt.title(f'Significant MK Trend - {title} ({site} - {super_category})')
#     plt.legend()
#     plt.grid(True)
    
#     # Save the plot
#     os.makedirs(output_dir, exist_ok=True)
#     plt.savefig(f'{output_dir}/{title.replace(" ", "_")}_MK_trends.png', bbox_inches='tight')
#     plt.close()

# # Validation function for checking the correct order of time series data
# def validate_time_series_order(weekly_counts_df):
#     """
#     Validates if the data is ordered by year and week in a time series format.
    
#     Parameters:
#     weekly_counts_df (DataFrame): DataFrame containing 'year', 'week', and aggregated counts.
    
#     Returns:
#     bool: True if the data is ordered correctly, False otherwise.
#     """
#     # Check if the data is ordered by both 'year' and 'week'
#     ordered_df = weekly_counts_df.sort_values(by=['year', 'week']).reset_index(drop=True)
    
#     # Compare with the original dataframe
#     is_ordered = weekly_counts_df.equals(ordered_df)
    
#     if is_ordered:
#         print("The data is correctly ordered by year and week.")
#     else:
#         print("The data is NOT correctly ordered. Fixing the order...")
    
#     return is_ordered

# # Main function to process data, run Mann-Kendall test, and plot results
# def process_and_run_mk(site, super_category, folds_dir, output_dir):
#     # Dataframe to aggregate weekly predicted counts (fixed only) across folds
#     weekly_counts_df = pd.DataFrame()
    
#     for fold in range(1, 6):
#         file_path = f'{folds_dir}/fold_{fold}/{site}_{super_category}_fold_{fold}_model_predictions.csv'
#         if os.path.exists(file_path):
#             data = pd.read_csv(file_path)
            
#             # Group by week and year to get weekly predicted counts (fixed only)
#             weekly_counts = data.groupby(['year', 'week'])['predicted_counts_fixed_only'].sum().reset_index()
#             weekly_counts_df = pd.concat([weekly_counts_df, weekly_counts], ignore_index=True)
    
#     if not weekly_counts_df.empty:
#         # Validate the time series order
#         if not validate_time_series_order(weekly_counts_df):
#             weekly_counts_df = weekly_counts_df.sort_values(by=['year', 'week']).reset_index(drop=True)
        
#         # Generate weekly counts and weeks for plotting
#         weekly_counts_df['week_year'] = weekly_counts_df['year'].astype(str) + '-W' + weekly_counts_df['week'].astype(str)
#         weeks = weekly_counts_df['week_year'].tolist()
#         weekly_counts = weekly_counts_df['predicted_counts_fixed_only'].tolist()
        
#         # Perform the Mann-Kendall test
#         result = perform_mk_test_on_weekly_counts(weekly_counts)
        
#         # Bonferroni correction: p-value threshold divided by the number of super categories
#         corrected_alpha = 0.05 / len(super_categories)
        
#         # If the trend is statistically significant after Bonferroni correction, plot it
#         if result.p < corrected_alpha:
#             plot_significant_mk_trends(weeks, weekly_counts, result, 
#                                        f'MK Test for Weekly {super_category.capitalize()} Predicted Counts (Fixed Only)', 
#                                        site, super_category, weekly_counts_df, 
#                                        f'{output_dir}/{site}/{super_category}')
#         else:
#             print(f"No significant MK trends for {site} - {super_category}.")
    
# # Example usage:
# output_directory = './results/time_series'
# folds_directory = './'

# # Assuming you have site names and super categories
# sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH']
# super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal']

# for site in tqdm(sites):
#     for super_category in super_categories:
#         process_and_run_mk(site, super_category, folds_directory, output_directory)


  0%|                                                                                                                  | 0/10 [00:00<?, ?it/s]

The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 10%|██████████▌                                                                                               | 1/10 [00:02<00:22,  2.47s/it]

The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for ASH - large_vehicles.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 20%|█████████████████████▏                                                                                    | 2/10 [00:04<00:16,  2.05s/it]

The data is NOT correctly ordered. Fixing the order...
No significant MK trends for ASH - animal.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for EL - small_vehicles.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for EL - refuse.
The data is NOT correctly ordered. Fixing the order...


 30%|███████████████████████████████▊                                                                          | 3/10 [00:06<00:14,  2.02s/it]

The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 40%|██████████████████████████████████████████▍                                                               | 4/10 [00:08<00:13,  2.21s/it]

The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for LA - large_vehicles.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 50%|█████████████████████████████████████████████████████                                                     | 5/10 [00:10<00:09,  1.88s/it]

The data is NOT correctly ordered. Fixing the order...
No significant MK trends for LA - refuse.
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for LA - animal.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for N1W - small_vehicles.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 60%|███████████████████████████████████████████████████████████████▌                                          | 6/10 [00:11<00:07,  1.81s/it]

The data is NOT correctly ordered. Fixing the order...
No significant MK trends for N1W - refuse.
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for N1W - animal.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for NM - market.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 70%|██████████████████████████████████████████████████████████████████████████▏                               | 7/10 [00:13<00:05,  1.90s/it]

The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 80%|████████████████████████████████████████████████████████████████████████████████████▊                     | 8/10 [00:16<00:04,  2.12s/it]

The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


 90%|███████████████████████████████████████████████████████████████████████████████████████████████▍          | 9/10 [00:18<00:02,  2.20s/it]

The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for UGH - market.
The data is NOT correctly ordered. Fixing the order...
No significant MK trends for UGH - two_wheelers.
The data is NOT correctly ordered. Fixing the order...
The data is NOT correctly ordered. Fixing the order...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:20<00:00,  2.05s/it]


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from tqdm import tqdm
import seaborn as sns

# Function to calculate the mean coefficients across folds for a specific site and time variable
def calculate_mean_effects_across_folds(model_dfs, time_vars, site):
    estimates = [1]  # Add reference class (1 for multiplicative effects)
    for time_var in time_vars:
        fold_estimates = []
        for fold, model_df in model_dfs.items():
            df_filtered = model_df[model_df['term'] == time_var]
            if not df_filtered.empty:
                coef, _, _ = calculate_effects_and_ci(df_filtered.iloc[0])
                fold_estimates.append(coef)
        # Calculate mean across folds
        mean_estimate = np.mean(fold_estimates) if fold_estimates else np.nan
        estimates.append(mean_estimate)
    return estimates

# Function to create heatmap data for all categories
def create_heatmap_data_for_category(model_dfs, time_vars, sites):
    heatmap_data = []
    for site in sites:
        mean_estimates = calculate_mean_effects_across_folds(model_dfs, time_vars, site)
        heatmap_data.append(mean_estimates)
    return np.array(heatmap_data)

# Function to plot heatmap for all object categories in a single figure
def plot_heatmap_for_time_unit(data_dict, x_labels, time_unit, output_dir):
    fig, axes = plt.subplots(3, 3, figsize=(18, 12), constrained_layout=True)
    axes = axes.flatten()

    for idx, (category, heatmap_data) in enumerate(data_dict.items()):
        if heatmap_data is not None:
            ax = axes[idx]
            sns.heatmap(
                heatmap_data, 
                ax=ax, 
                cmap="plasma", 
                cbar=False, 
                linewidths=0.5, 
                linecolor='gray',
                xticklabels=x_labels, 
                yticklabels=heatmap_data.index
            )
            ax.set_title(category.capitalize())
        else:
            axes[idx].axis('off')  # Turn off any unused axes

    # Turn off any remaining empty subplots
    for empty_ax in axes[len(data_dict):]:
        empty_ax.axis('off')

    plt.suptitle(f"{time_unit.capitalize()} Heatmap for All Object Categories")
    plt.savefig(os.path.join(output_dir, f"{time_unit}_heatmap.png"))
    plt.close()

# Main function to iterate over the time units and plot heatmaps for all object categories
def plot_heatmaps_by_time_unit(sites, super_categories, time_units):
    output_dir = './results/time_series_by_time/'
    os.makedirs(output_dir, exist_ok=True)

    for site in tqdm(sites):
        for super_category in super_categories:
            model_dfs = {}
            
            # Loop to construct filepaths and read data for each fold
            for fold in range(1, 6):
                fold_dir = f'fold_{fold}'
                filepath = f'{fold_dir}/{site}_{super_category}_fold_{fold}_conditional_model_coefficients.csv'
                if os.path.exists(filepath):
                    model_dfs[fold] = pd.read_csv(filepath)
            
            if not model_dfs:
                continue  # Skip if no data found for this combination

            for time_unit, time_vars, x_labels in time_units:
                data_dict = {}
                
                # Ensure x_labels includes the reference class
                x_labels_mod = [f'Ref'] + x_labels
                
                # Prepare heatmap data for each object category
                for super_category in super_categories:
                    heatmap_data = create_heatmap_data_for_category(model_dfs, time_vars, sites)
                    data_dict[super_category] = pd.DataFrame(heatmap_data, index=sites, columns=x_labels_mod)

                # Plot the heatmaps for the time unit
                plot_heatmap_for_time_unit(data_dict, x_labels_mod, time_unit, output_dir)

# Time units setup: (time_unit, time_vars, x_labels)
time_units = [
    ('hour', [f'hour{i}' for i in range(1, 24)], [f'{i}' for i in range(1, 24)]),
    ('day', [f'day{i}' for i in range(2, 8)], ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']),
    ('week', [f'week{i}' for i in range(2, 54)], [f'{i}' for i in range(2, 54)]),
    ('year', [f'year{i}' for i in range(2020, 2025)], [f'{i}' for i in range(2020, 2025)])
]

# List of sites and super categories
sites = ['AD', 'ASH', 'EL', 'JT', 'LA', 'N1W', 'NM', 'TF', 'TMW', 'UGH'] 
super_categories = ['people', 'small_vehicles', 'large_vehicles', 'market', 'two_wheelers', 'refuse', 'animal'] 

# Plot heatmaps for each time unit
plot_heatmaps_by_time_unit(sites, super_categories, time_units)


  0%|                                                                                                                  | 0/10 [00:06<?, ?it/s]


ValueError: Shape of passed values is (10, 7), indices imply (10, 8)