In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Stacked-bar figure: for every site, the share of each vehicle category per
# hour of day. Relies on `site_type` (mapping of site id -> details, whose
# first element is used as the panel title) and `hourly_averages` (one row per
# site and timestamp, with 'site_id', 'datetime' and '<category>_counts'
# columns), both defined in earlier cells.

# Vehicle categories to stack; each is read from its '<name>_counts' column
vehicle_categories = ['car', 'trotro', 'truck', 'motorcycle', 'lorry', 'bus', 'taxi', 'van', 'bicycle']

# Define grid layout
nrows, ncols = 4, 3
fig, axes = plt.subplots(nrows, ncols, figsize=(15, 15))
fig.suptitle('Vehicle Proportions by Hour by Site', fontsize=16)

# Flatten the axes array for easy access
axes = axes.flatten()

# Define a color for each vehicle category
colors = [plt.cm.viridis(i/len(vehicle_categories)) for i in range(len(vehicle_categories))]

hours = range(24)
n_sites = len(site_type)
# Number of grid slots that actually receive a plot
n_panels = min(n_sites, nrows * ncols)
ax = None  # stays None when there are no sites, so the legend step can be skipped

# Plot data for each site
for i, (site, site_details) in enumerate(site_type.items()):
    # Sites beyond the grid capacity share the last panel (original behaviour)
    ax = axes[min(i, nrows * ncols - 1)]

    # Prepare data for plotting
    site_data = hourly_averages[hourly_averages['site_id'] == site]
    site_data = site_data.set_index('datetime')

    # Keep the '<category>_counts' columns that exist for this frame.
    # (The membership test has to be done on the suffixed name: the frame's
    # columns carry the '_counts' suffix, the category list does not.)
    count_columns = [cat + "_counts" for cat in vehicle_categories if cat + "_counts" in site_data.columns]
    vehicle_data_by_hour = site_data[count_columns]

    # Group by the hour and calculate the mean for each vehicle category
    vehicle_data_by_hour = vehicle_data_by_hour.groupby(site_data.index.hour).mean()

    # Normalize the data to sum to 1 across vehicle categories for each hour
    proportions = vehicle_data_by_hour.div(vehicle_data_by_hour.sum(axis=1), axis=0)

    # Running top of the stack, one entry per hour
    bar_bottom = np.zeros(24)

    # Plot stacked bars for each vehicle category
    for idx, vehicle_cat in enumerate(vehicle_categories):
        column = vehicle_cat + "_counts"
        if column in proportions.columns:
            # Missing hours (and hours whose total is zero) contribute nothing
            counts = proportions[column].reindex(hours).fillna(0).to_numpy(dtype=float)
        else:
            # Category not recorded at all: keep it in the legend with zero height
            counts = np.zeros(24)

        # Stack the bar for the current vehicle category
        ax.bar(hours, counts, bottom=bar_bottom, color=colors[idx], label=vehicle_cat, width=1)

        # Update bar_bottom for the next category
        bar_bottom += counts

    ax.set_title(site_details[0])
    # Bars are one unit wide and centred on the hour, so pad by half a bar to
    # avoid cutting the 0:00 and 23:00 bars in half.
    ax.set_xlim(-0.5, 23.5)
    ax.set_xticks(range(24))
    ax.set_xticklabels([f'{hour}:00' for hour in range(24)], rotation=90)
    ax.grid(True, which='major', axis='x', linestyle='-', linewidth=0.5)
    ax.grid(True, which='major', axis='y', linestyle='-', linewidth=0.5)

    # Show hour labels only on the lowest populated panel of each column,
    # i.e. when no other site will be drawn directly underneath this one.
    ax.tick_params(labelbottom=(i + ncols >= n_panels))

# Remove empty subplots
for j in range(n_sites, nrows * ncols):
    fig.delaxes(axes[j])

# Create a single legend at the bottom of the figure (every panel has the same entries)
if ax is not None:
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=len(vehicle_categories)//2 + 1, bbox_to_anchor=(0.5, -0.02), fontsize=10)

plt.tight_layout()
plt.subplots_adjust(top=0.92, bottom=0.15)  # Adjust bottom to make space for the legend

# The legend is anchored slightly below the figure edge; a tight bounding box
# keeps it inside the saved image.
fig.savefig('./results/diurnal/vehicle_proportions.png', bbox_inches='tight')
# plt.close(fig)  # Left disabled: closing here would stop the notebook from displaying the figure


In [None]:
# Weekly z-normalised trend, one figure per super-category and one panel per
# site. Relies on names from earlier cells: `fixed_object_data`, `count_cols`,
# `super_categories`, `site_type`, plus `plt` and `np`.

# Step 1: Work on a copy so the shared frame is not modified
object_data = fixed_object_data.copy()

# ISO week number and the ISO year it belongs to (computed once, used as a pair)
iso_calendar = object_data['datetime'].dt.isocalendar()
object_data['week_of_year'] = iso_calendar.week
object_data['year'] = iso_calendar.year

# Step 2: Sum the counts within each week for each camera at each site
weekly_counts = object_data.groupby(['site_id', 'camera_name', 'year', 'week_of_year'])[count_cols].sum().reset_index()

# Step 3: Average the per-camera weekly sums to get one value per site and week
weekly_averages = weekly_counts.groupby(['site_id', 'year', 'week_of_year'])[count_cols].mean().reset_index()

# Step 4: Mean and standard deviation of the weekly values for each site and year
mean_std_by_site_year = weekly_averages.groupby(['site_id', 'year'])[count_cols].agg(['mean', 'std'])

# Flatten the (column, statistic) multi-index into '<column>_mean' / '<column>_std'
mean_std_by_site_year.columns = ['_'.join(col).strip() for col in mean_std_by_site_year.columns.values]
mean_std_by_site_year = mean_std_by_site_year.reset_index()

# Define grid layout
nrows, ncols = 4, 3

# ISO years can have 53 weeks, so the axis has to reach week 53
week_ticks = np.arange(1, 54, 1)

# Create the plots for each super-category
for super_cat, categories in super_categories.items():
    # Column names for this super-category in the weekly and statistics tables
    count_columns = [cat + '_counts' for cat in categories]
    mean_columns = [col + '_mean' for col in count_columns]
    std_columns = [col + '_std' for col in count_columns]

    # Initialize the plot for this super-category
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 15))
    fig.suptitle(super_cat.capitalize().replace("_", " ") + ' - Weekly Trend by Site', fontsize=16)

    # Flatten the axes array for easy access and plotting
    axes = axes.flatten()

    # Remove the subplot that will be empty (bottom left)
    fig.delaxes(axes[-3])
    axes[-3] = None

    # Plot data for each site
    for i, (site, site_details) in enumerate(site_type.items()):
        # The first nine sites fill the top three rows; anything after that is
        # drawn on the bottom-centre panel (layout is tuned for ten sites).
        ax = axes[i] if i < 9 else axes[10]

        # Prepare data for plotting
        site_weekly_data = weekly_averages[weekly_averages['site_id'] == site]

        # Plot a z-normalized line for each year's data
        for year in site_weekly_data['year'].unique():
            year_data = site_weekly_data[site_weekly_data['year'] == year]
            stats_row = (mean_std_by_site_year['site_id'] == site) & (mean_std_by_site_year['year'] == year)
            mean_values = mean_std_by_site_year.loc[stats_row, mean_columns].to_numpy(dtype=float).squeeze()
            std_values = mean_std_by_site_year.loc[stats_row, std_columns].to_numpy(dtype=float).squeeze()

            # A category that never varies has std 0; treat its z-score as
            # undefined (NaN) instead of dividing by zero. NaNs are skipped by
            # the row-wise mean below.
            std_values = np.where(std_values == 0, np.nan, std_values)

            # Normalize each week's counts, then average the z-scores across categories
            z_scores = (year_data[count_columns] - mean_values) / std_values
            ax.plot(year_data['week_of_year'], z_scores.mean(axis=1), label=f'{year}')

        ax.set_title(site_details[0])
        ax.set_xlim(1, 53)
        # Only request a legend when something was plotted (avoids an empty-legend warning)
        if ax.get_legend_handles_labels()[0]:
            ax.legend()

        # Draw grid lines for every week
        ax.set_xticks(week_ticks)
        ax.grid(True, which='major', axis='x', linestyle='-', linewidth=0.5)
        ax.grid(True, which='major', axis='y', linestyle='-', linewidth=0.5)

        # Sites 6, 8 and 9 occupy the lowest panel of their column in this
        # layout, so they are the only ones that show week labels.
        ax.tick_params(labelbottom=(i in [6, 8, 9]))

    # Remove the last (empty) subplot from the grid
    fig.delaxes(axes[-1])
    axes[-1] = None

    # Adjust layout and save the figure
    plt.tight_layout()
    plt.subplots_adjust(top=0.92)
    fig.savefig(f'./results/{super_cat}_weekly_znormalized.png')
    # plt.close(fig)  # Left disabled: closing here would stop the notebook from displaying the figure


In [None]:
# Monthly share of each super-category per site: each month's count divided by
# the site's all-time total for that super-category. Relies on names from
# earlier cells: `fixed_object_data`, `super_categories`, `site_type`, `plt`, `pd`.
import calendar  # month abbreviations for the x-axis labels

# Step 1: Work on a copy so the shared frame is not modified
object_data = fixed_object_data.copy()

# ISO week number and ISO year (kept as a matching pair for weekly analyses)
iso_calendar = object_data['datetime'].dt.isocalendar()
object_data['week_of_year'] = iso_calendar.week
object_data['year'] = iso_calendar.year

# Sum counts for each supercategory
for super_cat, categories in super_categories.items():
    object_data[super_cat + '_counts'] = object_data[[cat + '_counts' for cat in categories]].sum(axis=1)

# Calendar month and calendar year. The ISO year must not be paired with the
# calendar month: around New Year the two disagree (e.g. 1 Jan can belong to
# the previous ISO year), which would move those days into the wrong year.
object_data['month_of_year'] = object_data['datetime'].dt.month
object_data['calendar_year'] = object_data['datetime'].dt.year

# Step 2: Group and sum the counts within each month for each site.
# The grouping key is renamed so the result keeps its 'year' column.
monthly_counts = object_data.groupby(['site_id', 'calendar_year', 'month_of_year']).agg(
    {super_cat + '_counts': 'sum' for super_cat in super_categories.keys()}
).reset_index().rename(columns={'calendar_year': 'year'})

# Calculate total counts per supercategory for each site across all time
total_counts_per_site = monthly_counts.groupby(['site_id']).agg(
    {super_cat + '_counts': 'sum' for super_cat in super_categories.keys()}
).reset_index()

# Attach the all-time totals to every monthly row ('<name>_counts_total')
monthly_shares = monthly_counts.merge(total_counts_per_site, on='site_id', suffixes=('', '_total'))

# Calculate the monthly share for each supercategory
for super_cat in super_categories.keys():
    monthly_shares[super_cat + '_share'] = monthly_shares[super_cat + '_counts'] / monthly_shares[super_cat + '_counts_total']

# Define grid layout for plotting
nrows, ncols = 4, 3
n_sites = len(site_type)
# Number of grid slots that actually receive a plot
n_panels = min(n_sites, nrows * ncols)

# Plotting
for super_cat in super_categories.keys():
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 15))
    fig.suptitle(f'{super_cat.capitalize().replace("_", " ")} - Monthly Share by Site', fontsize=16)

    axes = axes.flatten()
    share_column = super_cat + '_share'

    for i, (site, site_details) in enumerate(site_type.items()):
        # Sites beyond the grid capacity share the last panel (original behaviour)
        ax = axes[min(i, nrows * ncols - 1)]

        # Largest share seen in each plotted year, used to scale the y axis
        all_max_values = []

        for year in monthly_shares['year'].unique():
            yearly_data = monthly_shares[(monthly_shares['site_id'] == site) & (monthly_shares['year'] == year)]

            # Skip years with no rows or only undefined shares (zero total)
            if not yearly_data[share_column].isnull().all():
                ax.plot(yearly_data['month_of_year'], yearly_data[share_column], label=f'Year {year}')
                max_value = yearly_data[share_column].max()
                if pd.notnull(max_value):
                    all_max_values.append(max_value)

        # Set y-limit based on the max value found across all years
        if all_max_values:
            ax.set_ylim(0, max(all_max_values) * 1.1)

        ax.set_title(f'{site} - {super_cat}')
        ax.set_xlim(1, 12)
        ax.set_xticks(range(1, 13))
        ax.set_xticklabels([calendar.month_abbr[m] for m in range(1, 13)])
        # Only request a legend when something was plotted (avoids an empty-legend warning)
        if ax.get_legend_handles_labels()[0]:
            ax.legend()
        ax.grid(True)

        # Show month labels only on the lowest populated panel of each column,
        # i.e. when no other site will be drawn directly underneath this one.
        ax.tick_params(labelbottom=(i + ncols >= n_panels))

    # Remove empty subplots
    for j in range(n_sites, nrows * ncols):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.subplots_adjust(top=0.92)
    fig.savefig(f'./results/monthly_plots/{super_cat}_monthly_share.png')
    # plt.close(fig)  # Left disabled: closing here would stop the notebook from displaying the figure

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Weekly share of each super-category per site and year: each week's count
# divided by that site's total for the same ISO year. Relies on names from
# earlier cells: `fixed_object_data`, `super_categories`, `site_type`.

# Step 1: Work on a copy so the shared frame is not modified
object_data = fixed_object_data.copy()

# ISO week number and the ISO year it belongs to (computed once, used as a pair)
iso_calendar = object_data['datetime'].dt.isocalendar()
object_data['week_of_year'] = iso_calendar.week
object_data['year'] = iso_calendar.year

# Sum counts for each supercategory
for super_cat, categories in super_categories.items():
    object_data[super_cat + '_counts'] = object_data[[cat + '_counts' for cat in categories]].sum(axis=1)

# Step 2: Sum the counts within each week for each site (all cameras pooled)
weekly_counts = object_data.groupby(['site_id', 'year', 'week_of_year']).agg(
    {super_cat + '_counts': 'sum' for super_cat in super_categories.keys()}
).reset_index()

# Calculate total counts per supercategory for each site and year
total_counts_per_year_site = weekly_counts.groupby(['site_id', 'year']).agg(
    {super_cat + '_counts': 'sum' for super_cat in super_categories.keys()}
).reset_index()

# Calculate the weekly share for each supercategory as a portion of the total per year, per site
for super_cat in super_categories.keys():
    # Attach the yearly total as '<name>_counts_yearly_total'
    weekly_counts = weekly_counts.merge(
        total_counts_per_year_site[['site_id', 'year', super_cat + '_counts']],
        on=['site_id', 'year'],
        suffixes=('', '_yearly_total')
    )

    # Calculate the proportion (NaN when the yearly total is zero)
    weekly_counts[super_cat + '_share'] = weekly_counts[super_cat + '_counts'] / weekly_counts[super_cat + '_counts_yearly_total']

# Drop weeks where any super-category share is undefined
weekly_counts = weekly_counts.dropna(subset=[super_cat + '_share' for super_cat in super_categories.keys()])

# Define grid layout for plotting
nrows, ncols = 4, 3
n_sites = len(site_type)
# Number of grid slots that actually receive a plot
n_panels = min(n_sites, nrows * ncols)

# Plotting
for super_cat in super_categories.keys():
    fig, axes = plt.subplots(nrows, ncols, figsize=(15, 15))
    fig.suptitle(f'{super_cat.capitalize().replace("_", " ")} - Weekly Share by Site and Year', fontsize=16)

    axes = axes.flatten()

    for i, (site, site_details) in enumerate(site_type.items()):
        # Sites beyond the grid capacity share the last panel (original behaviour)
        ax = axes[min(i, nrows * ncols - 1)]

        for year in weekly_counts['year'].unique():
            yearly_data = weekly_counts[(weekly_counts['site_id'] == site) & (weekly_counts['year'] == year)]
            if not yearly_data.empty:  # Check if there is data for the year
                ax.plot(yearly_data['week_of_year'], yearly_data[super_cat + '_share'], label=f'Year {year}')

        ax.set_title(f'{site} - {super_cat}')
        ax.set_xlim(1, 53)
        # Fixed y-range shared by every panel; weekly shares above 0.1 are cut off.
        # Data-driven alternative: weekly_counts[super_cat + '_share'].max() * 1.1
        ax.set_ylim(0, 0.1)
        # Only request a legend when something was plotted (avoids an empty-legend warning)
        if ax.get_legend_handles_labels()[0]:
            ax.legend()
        ax.grid(True)

        # Show week labels only on the lowest populated panel of each column,
        # i.e. when no other site will be drawn directly underneath this one.
        ax.tick_params(labelbottom=(i + ncols >= n_panels))

    # Remove empty subplots
    for j in range(n_sites, nrows * ncols):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.subplots_adjust(top=0.92)
    fig.savefig(f'./results/{super_cat}_weekly_share_by_year.png')
    # plt.close(fig)  # Left disabled: closing here would stop the notebook from displaying the figure

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

# Lower-triangle heatmap of Pearson correlations between super-category
# counts, annotated with the coefficient and its p-value. Relies on
# `fixed_object_data` and `site_cams` from earlier cells.

# Define the correlation categories
corr_categories = ['people', 'two_wheelers', 'small_vehicles', 'large_vehicles', 'market', 'refuse', 'animal']
n_categories = len(corr_categories)
count_columns = [col + '_counts' for col in corr_categories]

# Concatenate data from all sites and cameras
# NOTE(review): this filters on a 'camera' column while the weekly cell groups
# by 'camera_name' - confirm both columns exist in fixed_object_data.
multi_frame = []
for site in site_cams:
    for cam in site_cams[site]:
        multi_frame.append(fixed_object_data.loc[(fixed_object_data['site_id'] == site) & (fixed_object_data['camera'] == cam), count_columns])

corr_frame = pd.concat(multi_frame)

# Rename columns to remove '_counts'
corr_frame.columns = [col.replace('_counts', '') for col in corr_frame.columns]

# Use one float view for both the coefficients and the p-values so the two
# statistics are computed from the same numbers.
corr_values = corr_frame.astype(float)

# Calculate correlation coefficients (missing values are excluded pairwise)
method = 'pearson'
corr_matrix = corr_values.corr(method=method)

# Calculate p-values on the same pairwise-complete rows that .corr() uses
p_matrix = pd.DataFrame(np.nan, columns=corr_categories, index=corr_categories, dtype=float)
for i in range(n_categories):
    for j in range(i, n_categories):
        pair = corr_values[[corr_categories[i], corr_categories[j]]].dropna()
        if len(pair) < 2:
            # pearsonr needs at least two observations
            corr, pval = np.nan, np.nan
        else:
            corr, pval = pearsonr(pair.iloc[:, 0], pair.iloc[:, 1])
        p_matrix.iloc[i, j] = pval
        p_matrix.iloc[j, i] = pval

# Create labels with correlation coefficients and p-values.
# A p-value that would print as 0.00000 is shown as <0.00001 instead; this is
# decided per cell while formatting, because each label also contains the
# coefficient and can never equal the bare p-value text.
labels = []
for r_value, p_value in zip(corr_matrix.values.flatten(), p_matrix.values.flatten()):
    p_text = "{:.5f}".format(p_value)
    if p_text == "0.00000":
        p_text = "<0.00001"
    labels.append("{:.3f}\n({})".format(r_value, p_text))
labels = np.asarray(labels).reshape(n_categories, n_categories)

# Create a mask for the upper triangle (the diagonal stays visible)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

# Create the heatmap
fig, ax = plt.subplots(figsize=(10, 10))
cmap = 'seismic'
corr_plot = sns.heatmap(corr_matrix, annot=labels, fmt='', cmap=plt.get_cmap(cmap), vmin=-1, vmax=1, cbar=False, ax=ax, mask=mask)

# Set x and y labels
tick_labels = [obj.replace('market', 'market-related').capitalize().replace("_", " ") for obj in corr_categories]
ax.set_xticklabels(tick_labels, rotation="vertical")
ax.set_yticklabels(tick_labels, rotation="horizontal")

# Extend the y-limits by half a cell on each side.
# NOTE(review): this looks like the workaround for matplotlib releases that cut
# the first and last heatmap rows in half; on other versions it only adds a
# margin - confirm it is still wanted.
b, t = ax.get_ylim()
b += 0.5
t -= 0.5
ax.set_ylim(b, t)

plt.tight_layout()

# Save the plot before showing it, while the figure is guaranteed to be open
fig_type = "images_supercat_final"
fig.savefig(f'results/{fig_type}_corr_{method}.png', format='png', bbox_inches='tight', pad_inches=0.0)

plt.show()