# Clean Attendance files

The goal is to clean the attendance files in order to extract information. We complete the information of the main kitchen file (kitchen_ids_cluster) and also build a new Excel file, attendances_cleaned.xlsx, that merges all the monthly attendance information.


## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Code

### Clean attendance files month by month
Each month is processed separately because the source files come in different formats.

In [None]:
# Load the reference table of kitchens (ids and clusters)
kitchen_data = pd.read_excel(
    io='../output/kitchen_ids_cluster.xlsx',
)

In [None]:
# First focus on attendance of July (single file, one column per day)
## Open the file
july_attendance = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet July 2024.xlsx')
## Keep the declared number of beneficiaries of every kitchen present in the file
july_benef_data = july_attendance[['Code', 'Number of Beneficiaries ']]

## Now clean information in the July file
# Drop the descriptive columns so that only 'Code' and the daily columns remain
july_attendance = july_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries '], axis=1)
# Melt the DataFrame to have 'Date' as a column (one row per kitchen and day)
july_attendance_melted = july_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')

# Convert the 'Date' column to datetime format
july_attendance_melted['Date'] = pd.to_datetime(july_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
july_attendance_melted['Year'] = july_attendance_melted['Date'].dt.year
july_attendance_melted['Month'] = july_attendance_melted['Date'].dt.month

# Reorder columns. The explicit copy makes final_july an independent frame,
# so the assignment below does not raise a SettingWithCopyWarning.
final_july = july_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']].copy()

# Ensure 'nb_meals' is numeric, coercing errors to NaN
final_july['nb_meals'] = pd.to_numeric(final_july['nb_meals'], errors='coerce')

## Only one value per day in this file, so the monthly mean of nb_meals
## is used as the observed number of beneficiaries
merged_data = final_july.groupby(['Code'])['nb_meals'].mean().reset_index()

## Join the observed mean ('benef') with the declared figure ('est_benef')
july_benef_data = july_benef_data.merge(merged_data, on='Code', how='left')
july_benef_data = july_benef_data.rename(columns={"nb_meals": "benef", "Number of Beneficiaries ": "est_benef"})

## Add month and year
july_benef_data['Month'] = 7
july_benef_data['Year'] = 2024

In [None]:
# Second focus on attendance of August (single file, one column per day)
## Open the file
aug_attendance = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet AUG 2024.xlsx')

## Keep the declared number of beneficiaries of every kitchen present in the file
aug_benef_data = aug_attendance[['Code', 'Number of Beneficiaries ']]

## Now clean information in the August file
# Drop the descriptive columns so that only 'Code' and the daily columns remain
aug_attendance = aug_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries '], axis=1)
# Melt the DataFrame to have 'Date' as a column (one row per kitchen and day)
aug_attendance_melted = aug_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')

# Convert the 'Date' column to datetime format
aug_attendance_melted['Date'] = pd.to_datetime(aug_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
aug_attendance_melted['Year'] = aug_attendance_melted['Date'].dt.year
aug_attendance_melted['Month'] = aug_attendance_melted['Date'].dt.month

# Reorder columns. The explicit copy makes final_aug an independent frame,
# so the assignment below does not raise a SettingWithCopyWarning.
final_aug = aug_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']].copy()

# Ensure 'nb_meals' is numeric, coercing errors to NaN
final_aug['nb_meals'] = pd.to_numeric(final_aug['nb_meals'], errors='coerce')

## Only one value per day in this file, so the monthly mean of nb_meals
## is used as the observed number of beneficiaries
merged_data = final_aug.groupby(['Code'])['nb_meals'].mean().reset_index()

## Join the observed mean ('benef') with the declared figure ('est_benef')
aug_benef_data = aug_benef_data.merge(merged_data, on='Code', how='left')
aug_benef_data = aug_benef_data.rename(columns={"nb_meals": "benef", "Number of Beneficiaries ": "est_benef"})

## Add month and year
aug_benef_data['Month'] = 8
aug_benef_data['Year'] = 2024

In [None]:
# Third focus on attendance of September - 2 files
## Open the files
sep_attendance_1 = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet Sep 2024.xlsx')
sep_attendance_2 = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sep.xlsx')

## Some days have been entered twice in the first file
# Ensure column names are strings
sep_attendance_1.columns = sep_attendance_1.columns.map(str)
sep_benef_data = pd.DataFrame(sep_attendance_1['Code'])

# Identify the date columns (neither descriptive nor "Unnamed") and the header-less "Unnamed" columns
descriptive_columns = ['No.', 'Code', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ']
date_columns = [col for col in sep_attendance_1.columns
                if 'Unnamed' not in col and not any(name in col for name in descriptive_columns)]
unnamed_columns = [col for col in sep_attendance_1.columns if 'Unnamed:' in col]

# Pair date columns with "Unnamed" columns by position.
# NOTE(review): this assumes the i-th "Unnamed" column holds the second entry of the
# i-th date column; zip() silently ignores unpaired columns - confirm against the workbook.
column_pairs = list(zip(date_columns, unnamed_columns))

# Convert relevant columns to numeric (non-numeric entries become NaN)
for date_col, unnamed_col in column_pairs:
    sep_attendance_1[date_col] = pd.to_numeric(sep_attendance_1[date_col], errors='coerce')
    sep_attendance_1[unnamed_col] = pd.to_numeric(sep_attendance_1[unnamed_col], errors='coerce')

# Combine the two entries of each day: max -> nb_meals, mean -> beneficiaries
for date_col, unnamed_col in column_pairs:
    both_entries = sep_attendance_1[[date_col, unnamed_col]]
    # The mean must be taken from the original entries, i.e. before the date
    # column is overwritten with the max (otherwise it becomes mean(max, second entry)).
    sep_benef_data[date_col] = both_entries.mean(axis=1)
    sep_attendance_1[date_col] = both_entries.max(axis=1)

# Drop the "Unnamed" columns now that they are folded into the date columns
sep_attendance_1 = sep_attendance_1.drop(columns=unnamed_columns)
## Second file: remove the header-less columns
sep_attendance_2 = sep_attendance_2[sep_attendance_2.columns.drop(list(sep_attendance_2.filter(regex='Unnamed:')))]
## Keep only the rows that carry a row number in both files
sep_attendance_2 = sep_attendance_2[sep_attendance_2['No'].notna()]
sep_attendance_1 = sep_attendance_1[sep_attendance_1['No.'].notna()]
## Concat both
sep_attendance = pd.concat([sep_attendance_1, sep_attendance_2], axis=0, ignore_index=True)
# Only 'Code' and the daily columns of the second file may enter the beneficiary table:
# descriptive numeric columns (row number, declared beneficiaries) would otherwise be
# melted below and averaged as if they were daily attendance values.
sep_benef_data = pd.concat(
    [sep_benef_data,
     sep_attendance_2.drop(columns=['No', 'No.', 'State', 'Locality', 'District / Area', 'Area',
                                    'Number of Beneficiaries '], errors='ignore')],
    axis=0, ignore_index=True)

## Now clean information in the September files
## Extract the declared number of beneficiaries
est_benef = sep_attendance[['Code', 'Number of Beneficiaries ']]
# Drop the descriptive columns so that only 'Code' and the daily columns remain
sep_attendance = sep_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ', 'No', 'Area'], axis=1)
# Melt the DataFrames to have 'Date' as a column
sep_attendance_melted = sep_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')
sep_benef_data = sep_benef_data.melt(id_vars=['Code'], var_name='Date', value_name='benef')

# Convert the 'Date' column to datetime format
sep_attendance_melted['Date'] = pd.to_datetime(sep_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
sep_attendance_melted['Year'] = sep_attendance_melted['Date'].dt.year
sep_attendance_melted['Month'] = sep_attendance_melted['Date'].dt.month

# Reorder columns
final_sep = sep_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']]

# Ensure 'benef' is numeric, coercing errors to NaN
sep_benef_data['benef'] = pd.to_numeric(sep_benef_data['benef'], errors='coerce')

## Monthly mean of the daily values per kitchen
sep_benef_data = sep_benef_data[['Code', 'benef']].groupby(['Code']).mean().reset_index()

## Merge to have the estimated benef
sep_benef_data = sep_benef_data.merge(est_benef, on='Code', how='left')
sep_benef_data = sep_benef_data.rename(columns={"Number of Beneficiaries ": "est_benef"})

## Add month and year
sep_benef_data['Month'] = 9
sep_benef_data['Year'] = 2024

In [None]:
# Fourth focus on attendance of October - 2 files
## Open the files
oct_attendance_1 = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet Oct 2024.xlsx')
oct_attendance_2 = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Oct.xlsx')

## Some days have been entered twice in the first file
# Ensure column names are strings
oct_attendance_1.columns = oct_attendance_1.columns.map(str)
oct_benef_data = pd.DataFrame(oct_attendance_1['Code'])

# Identify the date columns (neither descriptive nor "Unnamed") and the header-less "Unnamed" columns
descriptive_columns = ['No.', 'Code', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ']
date_columns = [col for col in oct_attendance_1.columns
                if 'Unnamed' not in col and not any(name in col for name in descriptive_columns)]
unnamed_columns = [col for col in oct_attendance_1.columns if 'Unnamed:' in col]

# Pair date columns with "Unnamed" columns by position.
# NOTE(review): this assumes the i-th "Unnamed" column holds the second entry of the
# i-th date column; zip() silently ignores unpaired columns - confirm against the workbook.
column_pairs = list(zip(date_columns, unnamed_columns))

# Convert relevant columns to numeric (non-numeric entries become NaN)
for date_col, unnamed_col in column_pairs:
    oct_attendance_1[date_col] = pd.to_numeric(oct_attendance_1[date_col], errors='coerce')
    oct_attendance_1[unnamed_col] = pd.to_numeric(oct_attendance_1[unnamed_col], errors='coerce')

# Combine the two entries of each day: max -> nb_meals, mean -> beneficiaries
for date_col, unnamed_col in column_pairs:
    both_entries = oct_attendance_1[[date_col, unnamed_col]]
    # The mean must be taken from the original entries, i.e. before the date
    # column is overwritten with the max (otherwise it becomes mean(max, second entry)).
    oct_benef_data[date_col] = both_entries.mean(axis=1)
    oct_attendance_1[date_col] = both_entries.max(axis=1)

# Drop the "Unnamed" columns now that they are folded into the date columns
oct_attendance_1 = oct_attendance_1.drop(columns=unnamed_columns)
## Second file: remove the header-less columns
oct_attendance_2 = oct_attendance_2[oct_attendance_2.columns.drop(list(oct_attendance_2.filter(regex='Unnamed:')))]
## Keep only the rows that carry a row number in both files
oct_attendance_2 = oct_attendance_2[oct_attendance_2['No'].notna()]
oct_attendance_1 = oct_attendance_1[oct_attendance_1['No.'].notna()]
## Concat both
oct_attendance = pd.concat([oct_attendance_1, oct_attendance_2], axis=0, ignore_index=True)
# Only 'Code' and the daily columns of the second file may enter the beneficiary table:
# descriptive numeric columns (row number, declared beneficiaries) would otherwise be
# melted below and averaged as if they were daily attendance values.
oct_benef_data = pd.concat(
    [oct_benef_data,
     oct_attendance_2.drop(columns=['No', 'No.', 'State', 'Locality', 'District / Area', 'Area',
                                    'Number of Beneficiaries '], errors='ignore')],
    axis=0, ignore_index=True)

## Now clean information in the October files
## Extract the declared number of beneficiaries
est_benef = oct_attendance[['Code', 'Number of Beneficiaries ']]
# Drop the descriptive columns so that only 'Code' and the daily columns remain
oct_attendance = oct_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ', 'No', 'Area'], axis=1)
# Melt the DataFrames to have 'Date' as a column
oct_attendance_melted = oct_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')
oct_benef_data = oct_benef_data.melt(id_vars=['Code'], var_name='Date', value_name='benef')

# Convert the 'Date' column to datetime format
oct_attendance_melted['Date'] = pd.to_datetime(oct_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
oct_attendance_melted['Year'] = oct_attendance_melted['Date'].dt.year
oct_attendance_melted['Month'] = oct_attendance_melted['Date'].dt.month

# Reorder columns
final_oct = oct_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']]

# Ensure 'benef' is numeric, coercing errors to NaN
oct_benef_data['benef'] = pd.to_numeric(oct_benef_data['benef'], errors='coerce')

## Monthly mean of the daily values per kitchen
oct_benef_data = oct_benef_data[['Code', 'benef']].groupby(['Code']).mean().reset_index()

## Merge to have the estimated benef
oct_benef_data = oct_benef_data.merge(est_benef, on='Code', how='left')
oct_benef_data = oct_benef_data.rename(columns={"Number of Beneficiaries ": "est_benef"})

## Add month and year
oct_benef_data['Month'] = 10
oct_benef_data['Year'] = 2024

In [None]:
# Fifth focus on attendance of November - 2 files
## Open the files
nov_attendance_1 = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet Nov 2024.xlsx')
nov_attendance_2 = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Nov.xlsx')

## Some days have been entered twice in the first file
# Ensure column names are strings
nov_attendance_1.columns = nov_attendance_1.columns.map(str)
nov_benef_data = pd.DataFrame(nov_attendance_1['Code'])

# Identify the date columns (neither descriptive nor "Unnamed") and the header-less "Unnamed" columns.
# 'Localit' is listed as well: the drop further down shows that one of the November files
# spells the column that way, and it must never be mistaken for a date column.
descriptive_columns = ['No.', 'Code', 'State', 'Localit', 'Locality', 'District / Area', 'Number of Beneficiaries ']
date_columns = [col for col in nov_attendance_1.columns
                if 'Unnamed' not in col and not any(name in col for name in descriptive_columns)]
unnamed_columns = [col for col in nov_attendance_1.columns if 'Unnamed:' in col]

# Pair date columns with "Unnamed" columns by position.
# NOTE(review): this assumes the i-th "Unnamed" column holds the second entry of the
# i-th date column; zip() silently ignores unpaired columns - confirm against the workbook.
column_pairs = list(zip(date_columns, unnamed_columns))

# Convert relevant columns to numeric (non-numeric entries become NaN)
for date_col, unnamed_col in column_pairs:
    nov_attendance_1[date_col] = pd.to_numeric(nov_attendance_1[date_col], errors='coerce')
    nov_attendance_1[unnamed_col] = pd.to_numeric(nov_attendance_1[unnamed_col], errors='coerce')

# Combine the two entries of each day: max -> nb_meals, mean -> beneficiaries
for date_col, unnamed_col in column_pairs:
    both_entries = nov_attendance_1[[date_col, unnamed_col]]
    # The mean must be taken from the original entries, i.e. before the date
    # column is overwritten with the max (otherwise it becomes mean(max, second entry)).
    nov_benef_data[date_col] = both_entries.mean(axis=1)
    nov_attendance_1[date_col] = both_entries.max(axis=1)

# Drop the "Unnamed" columns now that they are folded into the date columns
nov_attendance_1 = nov_attendance_1.drop(columns=unnamed_columns)
## Second file: remove the header-less columns
nov_attendance_2 = nov_attendance_2[nov_attendance_2.columns.drop(list(nov_attendance_2.filter(regex='Unnamed:')))]
## Keep only the rows that carry a row number in both files
nov_attendance_2 = nov_attendance_2[nov_attendance_2['No'].notna()]
# Explicit copy: the 'Code' column of this filtered frame is rewritten just below
nov_attendance_1 = nov_attendance_1[nov_attendance_1['No.'].notna()].copy()

## Strip the stray "S" characters and the spaces from the kitchen codes.
## The .str accessor leaves missing codes as NaN (str(code) would turn them into the
## literal 'nan', which then survives every later notna() filter as a fake kitchen).
nov_attendance_1['Code'] = (nov_attendance_1['Code']
                            .str.replace("S", "", regex=False)
                            .str.replace(" ", "", regex=False))
nov_benef_data['Code'] = (nov_benef_data['Code']
                          .str.replace("S", "", regex=False)
                          .str.replace(" ", "", regex=False))

## Concat both
nov_attendance = pd.concat([nov_attendance_1, nov_attendance_2], axis=0, ignore_index=True)
# Only 'Code' and the daily columns of the second file may enter the beneficiary table:
# descriptive numeric columns (row number, declared beneficiaries) would otherwise be
# melted below and averaged as if they were daily attendance values.
nov_benef_data = pd.concat(
    [nov_benef_data,
     nov_attendance_2.drop(columns=['No', 'No.', 'State', 'Localit', 'Locality', 'District / Area', 'Area',
                                    'Number of Beneficiaries '], errors='ignore')],
    axis=0, ignore_index=True)

## Extract the declared number of beneficiaries
est_benef = nov_attendance[['Code', 'Number of Beneficiaries ']]

# Drop the descriptive columns so that only 'Code' and the daily columns remain
nov_attendance = nov_attendance.drop(['No.', 'State', 'Localit', 'Locality', 'District / Area', 'Number of Beneficiaries ', 'No', 'Area'], axis=1)
# Melt the DataFrames to have 'Date' as a column
nov_attendance_melted = nov_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')
nov_benef_data = nov_benef_data.melt(id_vars=['Code'], var_name='Date', value_name='benef')

# Convert the 'Date' column to datetime format
nov_attendance_melted['Date'] = pd.to_datetime(nov_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
nov_attendance_melted['Year'] = nov_attendance_melted['Date'].dt.year
nov_attendance_melted['Month'] = nov_attendance_melted['Date'].dt.month

# Reorder columns
final_nov = nov_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']]


# Ensure 'benef' is numeric, coercing errors to NaN
nov_benef_data['benef'] = pd.to_numeric(nov_benef_data['benef'], errors='coerce')

## Monthly mean of the daily values per kitchen
nov_benef_data = nov_benef_data[['Code', 'benef']].groupby(['Code']).mean().reset_index()

## Merge to have the estimated benef
nov_benef_data = nov_benef_data.merge(est_benef, on='Code', how='left')
nov_benef_data = nov_benef_data.rename(columns={"Number of Beneficiaries ": "est_benef"})

## Add month and year
nov_benef_data['Month'] = 11
nov_benef_data['Year'] = 2024

In [None]:
# Sixth focus on attendance of December - 1 file
## Open the file
dec_attendance = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet Dec 2024.xlsx')

# Ensure column names are strings
dec_attendance.columns = dec_attendance.columns.map(str)
dec_benef_data = pd.DataFrame(dec_attendance['Code'])

# Identify the date columns (neither descriptive nor "Unnamed") and the header-less "Unnamed" columns
descriptive_columns = ['No.', 'Code', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ']
date_columns = [col for col in dec_attendance.columns
                if 'Unnamed' not in col and not any(name in col for name in descriptive_columns)]
unnamed_columns = [col for col in dec_attendance.columns if 'Unnamed:' in col]

# Pair date columns with "Unnamed" columns by position.
# NOTE(review): this assumes the i-th "Unnamed" column holds the second entry of the
# i-th date column; zip() silently ignores unpaired columns - confirm against the workbook.
column_pairs = list(zip(date_columns, unnamed_columns))

# Convert relevant columns to numeric (non-numeric entries become NaN)
for date_col, unnamed_col in column_pairs:
    dec_attendance[date_col] = pd.to_numeric(dec_attendance[date_col], errors='coerce')
    dec_attendance[unnamed_col] = pd.to_numeric(dec_attendance[unnamed_col], errors='coerce')

# Combine the two entries of each day: max -> nb_meals, mean -> beneficiaries
for date_col, unnamed_col in column_pairs:
    both_entries = dec_attendance[[date_col, unnamed_col]]
    # The mean must be taken from the original entries, i.e. before the date
    # column is overwritten with the max (otherwise it becomes mean(max, second entry)).
    dec_benef_data[date_col] = both_entries.mean(axis=1)
    dec_attendance[date_col] = both_entries.max(axis=1)

# Drop the "Unnamed" columns now that they are folded into the date columns
dec_attendance = dec_attendance.drop(columns=unnamed_columns)

## Now clean information in the December file
## Extract the declared number of beneficiaries
est_benef = dec_attendance[['Code', 'Number of Beneficiaries ']]
# Drop the descriptive columns so that only 'Code' and the daily columns remain
dec_attendance = dec_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries '], axis=1)
# Melt the DataFrames to have 'Date' as a column
dec_attendance_melted = dec_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')
dec_benef_data = dec_benef_data.melt(id_vars=['Code'], var_name='Date', value_name='benef')

# Convert the 'Date' column to datetime format
dec_attendance_melted['Date'] = pd.to_datetime(dec_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
dec_attendance_melted['Year'] = dec_attendance_melted['Date'].dt.year
dec_attendance_melted['Month'] = dec_attendance_melted['Date'].dt.month

# Reorder columns
final_dec = dec_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']]

# Ensure 'benef' is numeric, coercing errors to NaN
dec_benef_data['benef'] = pd.to_numeric(dec_benef_data['benef'], errors='coerce')

## Monthly mean of the daily values per kitchen
dec_benef_data = dec_benef_data[['Code', 'benef']].groupby(['Code']).mean().reset_index()

## Merge to have the estimated benef
dec_benef_data = dec_benef_data.merge(est_benef, on='Code', how='left')
dec_benef_data = dec_benef_data.rename(columns={"Number of Beneficiaries ": "est_benef"})

## Add month and year
dec_benef_data['Month'] = 12
dec_benef_data['Year'] = 2024

In [None]:
# Seventh focus on attendance of January - 1 file
## Open the file
jan_attendance = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet Jan 2025.xlsx')

# Ensure column names are strings
jan_attendance.columns = jan_attendance.columns.map(str)
jan_benef_data = pd.DataFrame(jan_attendance['Code'])

# Identify the date columns (neither descriptive nor "Unnamed") and the header-less "Unnamed" columns
descriptive_columns = ['No.', 'Code', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ']
date_columns = [col for col in jan_attendance.columns
                if 'Unnamed' not in col and not any(name in col for name in descriptive_columns)]
unnamed_columns = [col for col in jan_attendance.columns if 'Unnamed:' in col]

# Pair date columns with "Unnamed" columns by position.
# NOTE(review): this assumes the i-th "Unnamed" column holds the second entry of the
# i-th date column; zip() silently ignores unpaired columns - confirm against the workbook.
column_pairs = list(zip(date_columns, unnamed_columns))

# Convert relevant columns to numeric (non-numeric entries become NaN)
for date_col, unnamed_col in column_pairs:
    jan_attendance[date_col] = pd.to_numeric(jan_attendance[date_col], errors='coerce')
    jan_attendance[unnamed_col] = pd.to_numeric(jan_attendance[unnamed_col], errors='coerce')

# Combine the two entries of each day: max -> nb_meals, mean -> beneficiaries
for date_col, unnamed_col in column_pairs:
    both_entries = jan_attendance[[date_col, unnamed_col]]
    # The mean must be taken from the original entries, i.e. before the date
    # column is overwritten with the max (otherwise it becomes mean(max, second entry)).
    jan_benef_data[date_col] = both_entries.mean(axis=1)
    jan_attendance[date_col] = both_entries.max(axis=1)

# Drop the "Unnamed" columns now that they are folded into the date columns
jan_attendance = jan_attendance.drop(columns=unnamed_columns)

## Now clean information in the January file
## Extract the declared number of beneficiaries
est_benef = jan_attendance[['Code', 'Number of Beneficiaries ']]
# Drop the descriptive columns so that only 'Code' and the daily columns remain
jan_attendance = jan_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries '], axis=1)
# Melt the DataFrames to have 'Date' as a column
jan_attendance_melted = jan_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')
jan_benef_data = jan_benef_data.melt(id_vars=['Code'], var_name='Date', value_name='benef')

# Convert the 'Date' column to datetime format
jan_attendance_melted['Date'] = pd.to_datetime(jan_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
jan_attendance_melted['Year'] = jan_attendance_melted['Date'].dt.year
jan_attendance_melted['Month'] = jan_attendance_melted['Date'].dt.month

# Reorder columns
final_jan = jan_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']]

# Ensure 'benef' is numeric, coercing errors to NaN
jan_benef_data['benef'] = pd.to_numeric(jan_benef_data['benef'], errors='coerce')

## Monthly mean of the daily values per kitchen
jan_benef_data = jan_benef_data[['Code', 'benef']].groupby(['Code']).mean().reset_index()

## Merge to have the estimated benef
jan_benef_data = jan_benef_data.merge(est_benef, on='Code', how='left')
jan_benef_data = jan_benef_data.rename(columns={"Number of Beneficiaries ": "est_benef"})

## Add month and year
jan_benef_data['Month'] = 1
jan_benef_data['Year'] = 2025

In [None]:
# Eighth focus on attendance of February - 1 file
## Open the file
fev_attendance = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet Feb 2025.xlsx')

# Ensure column names are strings
fev_attendance.columns = fev_attendance.columns.map(str)
fev_benef_data = pd.DataFrame(fev_attendance['Code'])

# Identify the date columns (neither descriptive nor "Unnamed") and the header-less "Unnamed" columns
descriptive_columns = ['No.', 'Code', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ']
date_columns = [col for col in fev_attendance.columns
                if 'Unnamed' not in col and not any(name in col for name in descriptive_columns)]
unnamed_columns = [col for col in fev_attendance.columns if 'Unnamed:' in col]

# Pair date columns with "Unnamed" columns by position.
# NOTE(review): this assumes the i-th "Unnamed" column holds the second entry of the
# i-th date column; zip() silently ignores unpaired columns - confirm against the workbook.
column_pairs = list(zip(date_columns, unnamed_columns))

# Convert relevant columns to numeric (non-numeric entries become NaN)
for date_col, unnamed_col in column_pairs:
    fev_attendance[date_col] = pd.to_numeric(fev_attendance[date_col], errors='coerce')
    fev_attendance[unnamed_col] = pd.to_numeric(fev_attendance[unnamed_col], errors='coerce')

# Combine the two entries of each day: max -> nb_meals, mean -> beneficiaries
for date_col, unnamed_col in column_pairs:
    both_entries = fev_attendance[[date_col, unnamed_col]]
    # The mean must be taken from the original entries, i.e. before the date
    # column is overwritten with the max (otherwise it becomes mean(max, second entry)).
    fev_benef_data[date_col] = both_entries.mean(axis=1)
    fev_attendance[date_col] = both_entries.max(axis=1)

# Drop the "Unnamed" columns now that they are folded into the date columns
fev_attendance = fev_attendance.drop(columns=unnamed_columns)

## Now clean information in the February file
## Extract the declared number of beneficiaries
est_benef = fev_attendance[['Code', 'Number of Beneficiaries ']]
# Drop the descriptive columns so that only 'Code' and the daily columns remain
fev_attendance = fev_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries '], axis=1)
# Melt the DataFrames to have 'Date' as a column
fev_attendance_melted = fev_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')
fev_benef_data = fev_benef_data.melt(id_vars=['Code'], var_name='Date', value_name='benef')

# Convert the 'Date' column to datetime format
fev_attendance_melted['Date'] = pd.to_datetime(fev_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
fev_attendance_melted['Year'] = fev_attendance_melted['Date'].dt.year
fev_attendance_melted['Month'] = fev_attendance_melted['Date'].dt.month

# Reorder columns
final_fev = fev_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']]

# Ensure 'benef' is numeric, coercing errors to NaN
fev_benef_data['benef'] = pd.to_numeric(fev_benef_data['benef'], errors='coerce')

## Monthly mean of the daily values per kitchen
fev_benef_data = fev_benef_data[['Code', 'benef']].groupby(['Code']).mean().reset_index()

## Merge to have the estimated benef
fev_benef_data = fev_benef_data.merge(est_benef, on='Code', how='left')
fev_benef_data = fev_benef_data.rename(columns={"Number of Beneficiaries ": "est_benef"})

## Add month and year
fev_benef_data['Month'] = 2
fev_benef_data['Year'] = 2025

In [None]:
# Ninth focus on attendance of March - 1 file
## Open the file
mar_attendance = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Daily Ops Jul-Mar/Attendance Sheet Mar2025.xlsx')

# Ensure column names are strings
mar_attendance.columns = mar_attendance.columns.map(str)
mar_benef_data = pd.DataFrame(mar_attendance['Code'])

# Identify the date columns (neither descriptive nor "Unnamed") and the header-less "Unnamed" columns
descriptive_columns = ['No.', 'Code', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries ']
date_columns = [col for col in mar_attendance.columns
                if 'Unnamed' not in col and not any(name in col for name in descriptive_columns)]
unnamed_columns = [col for col in mar_attendance.columns if 'Unnamed:' in col]

# Pair date columns with "Unnamed" columns by position.
# NOTE(review): this assumes the i-th "Unnamed" column holds the second entry of the
# i-th date column; zip() silently ignores unpaired columns - confirm against the workbook.
column_pairs = list(zip(date_columns, unnamed_columns))

# Convert relevant columns to numeric (non-numeric entries become NaN)
for date_col, unnamed_col in column_pairs:
    mar_attendance[date_col] = pd.to_numeric(mar_attendance[date_col], errors='coerce')
    mar_attendance[unnamed_col] = pd.to_numeric(mar_attendance[unnamed_col], errors='coerce')

# Combine the two entries of each day: max -> nb_meals, mean -> beneficiaries
for date_col, unnamed_col in column_pairs:
    both_entries = mar_attendance[[date_col, unnamed_col]]
    # The mean must be taken from the original entries, i.e. before the date
    # column is overwritten with the max (otherwise it becomes mean(max, second entry)).
    mar_benef_data[date_col] = both_entries.mean(axis=1)
    mar_attendance[date_col] = both_entries.max(axis=1)

# Drop the "Unnamed" columns now that they are folded into the date columns
mar_attendance = mar_attendance.drop(columns=unnamed_columns)

## Now clean information in the March file
## Extract the declared number of beneficiaries
est_benef = mar_attendance[['Code', 'Number of Beneficiaries ']]
# Drop the descriptive columns so that only 'Code' and the daily columns remain
mar_attendance = mar_attendance.drop(['No.', 'State', 'Locality', 'District / Area', 'Number of Beneficiaries '], axis=1)
# Melt the DataFrames to have 'Date' as a column
mar_attendance_melted = mar_attendance.melt(id_vars=['Code'], var_name='Date', value_name='nb_meals')
mar_benef_data = mar_benef_data.melt(id_vars=['Code'], var_name='Date', value_name='benef')

# Convert the 'Date' column to datetime format
mar_attendance_melted['Date'] = pd.to_datetime(mar_attendance_melted['Date'])

# Extract Year and Month from the 'Date' column
mar_attendance_melted['Year'] = mar_attendance_melted['Date'].dt.year
mar_attendance_melted['Month'] = mar_attendance_melted['Date'].dt.month

# Reorder columns
final_mar = mar_attendance_melted[['Code', 'Date', 'Month', 'Year', 'nb_meals']]

# Ensure 'benef' is numeric, coercing errors to NaN
mar_benef_data['benef'] = pd.to_numeric(mar_benef_data['benef'], errors='coerce')

## Monthly mean of the daily values per kitchen
mar_benef_data = mar_benef_data[['Code', 'benef']].groupby(['Code']).mean().reset_index()

## Merge to have the estimated benef
mar_benef_data = mar_benef_data.merge(est_benef, on='Code', how='left')
mar_benef_data = mar_benef_data.rename(columns={"Number of Beneficiaries ": "est_benef"})

## Add month and year
mar_benef_data['Month'] = 3
mar_benef_data['Year'] = 2025

# Quick size check of the long-format March table
print(len(final_mar))

In [None]:
## Stack the nine monthly long-format tables into a single attendance table
final_data = (
    pd.concat(
        [final_july, final_aug, final_sep,
         final_oct, final_nov, final_dec,
         final_jan, final_fev, final_mar],
        axis=0,
        ignore_index=True,
    )
    .rename(columns={"Code": "kitchen_code"})
    .drop_duplicates()
)
# Rows without a kitchen code cannot be attributed to any kitchen
has_code = final_data['kitchen_code'].notna()
final_data = final_data[has_code]
# Codes written with the "/MA/" segment are rewritten with "/JA/"
final_data['kitchen_code'] = final_data['kitchen_code'].map(lambda code: code.replace("/MA/", "/JA/"))
final_data.to_excel('../output/attendances_cleaned.xlsx', index=False)

In [None]:
## Stack the nine monthly beneficiary tables into a single table
final_data = (
    pd.concat(
        [july_benef_data, aug_benef_data, sep_benef_data,
         oct_benef_data, nov_benef_data, dec_benef_data,
         jan_benef_data, fev_benef_data, mar_benef_data],
        axis=0,
        ignore_index=True,
    )
    .rename(columns={"Code": "kitchen_code"})
    .drop_duplicates()
)
# Rows without a kitchen code cannot be attributed to any kitchen
has_code = final_data['kitchen_code'].notna()
final_data = final_data[has_code]
# Codes written with the "/MA/" segment are rewritten with "/JA/"
final_data['kitchen_code'] = final_data['kitchen_code'].map(lambda code: code.replace("/MA/", "/JA/"))

## Kitchens not in service (not opened yet): KH/JA/184 to 237, KH/JA/135 to 143 and KH/JA/148
kitchen_to_be_removed = [
    f'KH/JA/{number}'
    for number in [*range(184, 238), *range(135, 144), 148]
]

# Exclude those kitchens before exporting
final_data = final_data[~final_data['kitchen_code'].isin(kitchen_to_be_removed)]
final_data.to_excel('../output/benef_over_time.xlsx', index=False)

## Clean the incidents document

In [None]:
## Load the inputs of the incident-based completion
# Incident report (filled in manually)
df_incident = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Incidents/incidence_report_summary.xlsx')
# Kitchen ids / clusters
kitchen_ids_cluster = pd.read_excel('../output/kitchen_ids_cluster.xlsx')
# Cleaned attendances exported earlier in this notebook
df_attendances = pd.read_excel('../output/attendances_cleaned.xlsx')

# Keep only the incidents during which the kitchen operations were put on hold
df_incident = df_incident[df_incident['operations_hold'].eq('yes')]

## Goal of the next step: when the attendance of a kitchen is empty between the two dates
## of a hold, the gap can be filled with 0 because no food deliveries were available


In [None]:
# Ensure date columns are in datetime format
df_attendances['Date'] = pd.to_datetime(df_attendances['Date'])
# Work on an independent copy: df_incident may be a filtered slice of the raw report,
# and assigning columns on a slice raises a SettingWithCopyWarning
df_incident = df_incident.copy()
df_incident['incidence_date'] = pd.to_datetime(df_incident['incidence_date'])
df_incident['end_date_hold'] = pd.to_datetime(df_incident['end_date_hold'])

# Attach the incidents to every attendance row of the same kitchen
# (a kitchen with several incidents yields several rows per day; they are de-duplicated below)
df_merged = df_attendances.merge(df_incident, on='kitchen_code', how='left')

# Check if the date is within the hold period (comparisons with a missing date are False)
df_merged['on_hold'] = (df_merged['Date'] >= df_merged['incidence_date']) & \
                       (df_merged['Date'] <= df_merged['end_date_hold'])

# Set 'nb_meals' to 0 when it is empty (NaN) and the day falls within a hold period
empty_during_hold = df_merged['nb_meals'].isna() & df_merged['on_hold']
df_merged.loc[empty_during_hold, 'nb_meals'] = 0

# Keep the attendance columns only. Rows without a kitchen code are removed *before* the
# code is rewritten (a missing code has no string methods), and the explicit copy avoids
# assigning on a slice of df_merged.
df_result = (
    df_merged[['kitchen_code', 'Date', 'Month', 'Year', 'nb_meals']]
    .dropna(subset=['kitchen_code'])
    .copy()
)
df_result['kitchen_code'] = df_result['kitchen_code'].str.replace("/MA/", "/JA/", regex=False)

# Replace non-breaking spaces left in the values before converting them to float
df_result['nb_meals'] = df_result['nb_meals'].replace('\xa0', '0', regex=True).astype(float)
# Sort by 'nb_meals' so that rows with a value come before the NaN rows
df_result = df_result.sort_values(by=['nb_meals'], na_position='last')

## Correct the number of meals of kitchen KH/JA/159: values after 2024-12-01 are divided by 5
## NOTE(review): the reason for the factor 5 is not documented in this notebook - confirm
df_result.loc[(df_result['Date'] > '2024-12-01') & (df_result['kitchen_code'] == 'KH/JA/159'), 'nb_meals'] /= 5

# Drop duplicates, keeping the first occurrence (a row with a value, thanks to the sort above)
df_result = df_result.drop_duplicates(subset=['kitchen_code', 'Date'], keep='first')
## Complete the grid only for kitchens with at least one dated row
unique_kitchen = df_result['kitchen_code'].unique()
unique_date = pd.date_range('2024-07-01', '2025-04-01', freq='D')
complete_grid = pd.MultiIndex.from_product(
    [unique_date, unique_kitchen],
    names=["date", "kitchen_code"]
).to_frame(index=False)

complete_grid['Year'] = complete_grid['date'].dt.year
complete_grid['Month'] = complete_grid['date'].dt.month

# Left join: every (date, kitchen) pair of the grid is kept, with NaN where nothing was recorded
df_result = pd.merge(complete_grid, df_result, left_on=['date', 'kitchen_code', 'Year', 'Month'], right_on=['Date', 'kitchen_code', 'Year', 'Month'], how='left')

df_result = df_result[['date', 'kitchen_code', 'Year', 'Month', 'nb_meals']]

## Kitchens not in service: KH/JA/184 to 237, KH/JA/135 to 143 and KH/JA/148
kitchen_to_be_removed = [
    f'KH/JA/{number}'
    for number in [*range(184, 238), *range(135, 144), 148]
]

# Exclude those kitchens before exporting
df_result = df_result[~df_result['kitchen_code'].isin(kitchen_to_be_removed)]

df_result.to_excel('../output/attendances_cleaned_completed.xlsx', index=False)

## Attendance Imputations due to closed kitchens

In [None]:
# Read the completed attendance grid
df_attand = pd.read_excel('../output/attendances_cleaned_completed.xlsx')
## Select only Jebel Awlia and Mayo area
# na=False: a row with a missing kitchen_code is excluded instead of making
# the boolean mask contain NaN (which raises when used for indexing)
df_attand = df_attand.loc[df_attand['kitchen_code'].str.contains('KH/JA|KH/MA', na=False)]

## Impute data
# Step 1: Sort by kitchen_code and date so that fills follow temporal order
df_attand = df_attand.sort_values(by=['kitchen_code', 'date'])

# Step 2: Hide explicit zeros so they are neither propagated nor used as a
# fill source. mask() always yields NaN here, whereas Series.replace(0, None)
# is pandas-version dependent (older releases pad-fill the zero with the
# previous value, which corrupts the backward fill).
is_zero = df_attand['nb_meals'] == 0
reported = df_attand['nb_meals'].mask(is_zero)
by_kitchen = reported.groupby(df_attand['kitchen_code'])

# Step 3: Within each kitchen, take the last known value; where there is none
# (gaps at the start of the series), fall back to the next known value
filled = by_kitchen.ffill().fillna(by_kitchen.bfill())

# Step 4: Preserve explicit zeros in the estimate
df_attand['est_benef'] = filled.mask(is_zero, 0)

# Save the data
df_attand.to_excel('../output/attendances_completed_imputed.xlsx', index=False)

## Calculate monthly attendances per kitchen for comparison with one another

In [None]:
from itertools import product

# Load the imputed daily attendances
df_attand = pd.read_excel('../output/attendances_completed_imputed.xlsx')

# Keep the KH/JA and KH/MA kitchens, ordered by kitchen and then by date
df_attand = (
    df_attand
    .loc[df_attand['kitchen_code'].str.contains('KH/JA|KH/MA')]
    .sort_values(by=['kitchen_code', 'date'])
)

# Make sure 'date' is a datetime, then derive a monthly period label from it
df_attand['date'] = pd.to_datetime(df_attand['date'])
df_attand['Month-Year'] = df_attand['date'].dt.to_period('M').astype(str)

# Mean of est_benef for each kitchen and month
monthly_data = (
    df_attand
    .groupby(['kitchen_code', 'Month-Year'])['est_benef']
    .mean()
    .reset_index()
)

# Export the per-kitchen monthly table
monthly_data.to_excel('../output/attendances_comparison_kitchens.xlsx', index=False)

## Visualization

In [None]:
# Reload the imputed daily attendances written to '../output/attendances_completed_imputed.xlsx'
df_attand = pd.read_excel('../output/attendances_completed_imputed.xlsx')

In [None]:
### Beneficiary over time
# Daily mean over all rows of the reported (nb_meals) and imputed (est_benef)
# values. Zeros are turned into NaN first so they are left out of the mean.
nb_meals_time = (
    df_attand[['nb_meals', 'est_benef']]
    .replace(0, np.nan)
    .groupby(df_attand['date'])
    .mean()
    .reset_index()
)

plt.figure(figsize=(12, 6))
# The line needs a label: plt.legend() below only lists labelled artists and
# would otherwise draw an empty legend and emit a warning
plt.plot(nb_meals_time['date'].to_numpy(), nb_meals_time['nb_meals'].to_numpy(), label='Simple data cleaning')
#plt.plot(nb_meals_time['date'].to_numpy(), nb_meals_time['est_benef'].to_numpy(), label = 'Data imputed')
plt.legend(title ='Type of cleaning')
plt.xlabel('Date')
plt.ylabel('Number of Meal served per day')
plt.grid(axis='x')
plt.xticks(rotation=45)
# Save the figure to a PNG file
plt.savefig('../visualization/nb_meals_trends_imputation.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# One small panel per kitchen: imputed series (line, 'x') and reported values ('o')
kitchen_codes = df_attand['kitchen_code'].unique()

# Grid size: fixed number of columns, as many rows as needed
n_kitchens = len(kitchen_codes)
n_cols = 4  # Number of columns for the grid
n_rows = -(-n_kitchens // n_cols)  # Ceiling division for rows

# Create the subplots (shared x-axis, independent y-axes)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows), sharey=False, sharex=True)

# Flatten the axes array for easier iteration
axes = axes.flatten()

# Plot data for each kitchen_code.
# The per-kitchen subset is named kitchen_rows so that it does not overwrite
# the kitchen_data table loaded at the top of the notebook.
for ax, kitchen in zip(axes, kitchen_codes):
    kitchen_rows = df_attand[df_attand['kitchen_code'] == kitchen]
    day_values = kitchen_rows['date'].to_numpy()
    ax.plot(day_values, kitchen_rows['est_benef'].to_numpy(), marker='x', label=kitchen)
    ax.plot(day_values, kitchen_rows['nb_meals'].to_numpy(), marker='o', label=kitchen, linestyle='None')

    ax.set_title(kitchen)
    ax.set_xlabel('Date')
    ax.set_ylabel('nb_meals')
    ax.tick_params(axis='x', rotation=45)

# Remove the unused panels of the last row, if any
for unused_ax in axes[n_kitchens:]:
    fig.delaxes(unused_ax)

# Adjust layout
plt.tight_layout()

# Save the figure to a PNG file
plt.savefig('../visualization/kitchen_nb_meals_trends.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Classify every kitchen-day: 0 meals -> 'Closed', any other reported value
# -> 'Opened', missing value -> 'No Info' (conditions are checked in order)
meals = df_attand['nb_meals']
df_attand['category'] = np.select(
    [meals == 0, meals.notna()],
    ['Closed', 'Opened'],
    default='No Info',
)

# Aggregate counts per category per date
daily_counts = df_attand.groupby(['date', 'category']).size().unstack(fill_value=0)

# Ensure stacking order: Opened (green) → Closed (red) → No Info (white).
# reindex() adds an all-zero column for a category that never occurs, where
# plain column selection would raise a KeyError.
stack_order = ['Opened', 'Closed', 'No Info']
daily_counts = daily_counts.reindex(columns=stack_order, fill_value=0)

# Define colors for each category
category_colors = {'Opened': 'darkgreen', 'Closed': 'red', 'No Info': 'white'}

# Increase bar width and adjust x-axis labels
fig, ax = plt.subplots(figsize=(12, 4))

bars = daily_counts.plot(kind='bar', stacked=True, width=1.2,  # Thicker bars
                         color=[category_colors[col] for col in stack_order],
                         ax=ax, edgecolor='white')

# Horizontal reference line at the total number of kitchens available.
# NOTE(review): the previous comment said 111 while the code uses 56 — confirm the intended value.
total_kitchens = 56
ax.axhline(y=total_kitchens, color='black', linestyle='dashed', linewidth=1.5, label='Maximum available kitchen \n in Jebel Awlia')

# Set x-axis ticks to every 10th date
ax.set_xticks(range(0, len(daily_counts), 10))
ax.set_xticklabels(daily_counts.index[::10].strftime('%Y-%m-%d'), rotation=45, fontsize=9)

# Labels and Formatting
ax.set_xlabel('Date', fontsize=10)
ax.set_ylabel('Number of Kitchens', fontsize=10)
#ax.set_title('Daily Kitchen Status in Jebel Awlia', fontsize=12)
ax.legend(title='Kitchen Status', loc='upper right', fontsize=10)

plt.yticks(fontsize=9)
plt.grid(axis='y', linestyle='dashed', alpha=0.5)

plt.tight_layout()
plt.savefig('../visualization/kitchen_status_barplot.png', dpi=300)
plt.show()


In [None]:
# Marker colour per kitchen-day: red for 0 meals, dark green for any other
# reported value, white for a missing value (conditions are checked in order)
meals = df_attand['nb_meals']
df_attand['color'] = np.select(
    [meals == 0, meals.notna()],
    ['red', 'darkgreen'],
    default='white',
)

# Prepare the plot
fig, ax = plt.subplots(figsize=(9, 7))

# One row of markers per kitchen_code.
# The per-kitchen subset is named kitchen_rows so that it does not overwrite
# the kitchen_data table loaded at the top of the notebook.
for kitchen in df_attand['kitchen_code'].unique():
    kitchen_rows = df_attand[df_attand['kitchen_code'] == kitchen]
    ax.scatter(
        kitchen_rows['date'],
        [kitchen] * len(kitchen_rows),
        c=kitchen_rows['color'],
        s=10,
        edgecolor='white'
    )

# Set x-axis ticks to every 5th date
all_dates = pd.date_range(df_attand['date'].min(), df_attand['date'].max(), freq='D')
reduced_dates = all_dates[::5]  # Select every 5th date

ax.set_xlabel('Date', fontsize=9)
ax.set_ylabel('Kitchen Code (of existing kitchens in Jebel Awlia)', fontsize=9)
ax.set_yticks(df_attand['kitchen_code'].unique())
#ax.set_title('Information Availability by Kitchen and Date', fontsize=14)
ax.set_xticks(reduced_dates)
ax.set_xticklabels(reduced_dates.strftime('%Y-%m-%d'), rotation=45)
ax.tick_params(axis='x', labelsize=8)
ax.tick_params(axis='y', labelsize=8, rotation=30)
plt.tight_layout()
plt.savefig('../visualization/attendance_jebel_awlia.png', dpi=300)
# Show the plot
plt.show()

## Visualization: beneficiaries over time + estimated beneficiaries

In [None]:
# Load the beneficiaries table and discard the rows that have no kitchen code
df_benef = pd.read_excel('../output/benef_over_time.xlsx').dropna(subset=['kitchen_code'])
# Keep only the kitchens whose code contains KH/JA or KH/MA
df_benef_JA = df_benef[df_benef['kitchen_code'].str.contains('KH/JA|KH/MA')]

In [None]:
# Load the beneficiaries table and drop the rows without a kitchen code
df_benef = pd.read_excel('../output/benef_over_time.xlsx')
df_benef = df_benef.dropna(subset=['kitchen_code'])
## Select only JA and MA
# .copy() gives an independent frame, so the column added below is not written
# to a slice of df_benef (avoids pandas' SettingWithCopy warning)
df_benef_JA = df_benef.loc[df_benef['kitchen_code'].str.contains('KH/JA|KH/MA')].copy()

## Create data:
# 'Month_Half' holds the first day of the month built from the Year and Month columns
df_benef_JA['Month_Half'] = pd.to_datetime(df_benef_JA['Year'].astype(str) +
                                           '-' + df_benef_JA['Month'].astype(str) + '-01', format='%Y-%m-%d')

# One row per month: mean benef, mean est_benef and number of distinct kitchens
df_benef_JA = df_benef_JA.groupby('Month_Half').agg({
    'benef': 'mean',
    'est_benef': 'mean',
    'kitchen_code': 'nunique'  # Count unique kitchens
}).reset_index()

In [None]:
# Load the imputed daily attendances
df_attand = pd.read_excel('../output/attendances_completed_imputed.xlsx')

# First day of each row's month, assembled from the 'Year' and 'Month' columns
start_dates = pd.to_datetime(
    df_attand['Year'].astype(str) + '-' + df_attand['Month'].astype(str) + '-01',
    format='%Y-%m-%d',
)

# 'Half' is 1 for days 1-15 of the month and 2 for the remaining days
df_attand['Half'] = np.where(df_attand['date'].dt.day <= 15, 1, 2)
half_months = df_attand['Half']

# Start of the half-month: the month start, shifted by 15 days for the second half
dates = start_dates + pd.to_timedelta((half_months - 1) * 15, unit='D')

# Month number -> three-letter month abbreviation
months_map = dict(zip(
    range(1, 13),
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
))
df_attand['Month_str'] = df_attand['Month'].map(months_map)

# Half-month column (same values as computed in `dates`)
df_attand['Month_Half'] = dates

# Mean, standard deviation and row count of est_benef per half-month
benef_time_stats = (
    df_attand
    .groupby(['Month', 'Month_Half'])['est_benef']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

# Interval bounds: mean +/- 1.96 * std / sqrt(count)
ci_half_width = 1.96 * (benef_time_stats['std'] / np.sqrt(benef_time_stats['count']))
benef_time_stats['ci_upper'] = benef_time_stats['mean'] + ci_half_width
benef_time_stats['ci_lower'] = benef_time_stats['mean'] - ci_half_width

# Chronological order
benef_time_stats = benef_time_stats.sort_values('Month_Half')

In [None]:
# Plot: half-month mean of est_benef with its interval band and the monthly
# series from df_benef_JA on the left axis; number of distinct kitchens per
# month on a secondary right axis
fig, ax1 = plt.subplots(figsize=(9, 4))

# Half-month mean and its interval band
ax1.plot(benef_time_stats['Month_Half'].to_numpy(), benef_time_stats['mean'].to_numpy(),
         label='Actual avg. daily people served per kitchen', color='cornflowerblue', linestyle='--', marker='x')
ax1.fill_between(benef_time_stats['Month_Half'].to_numpy(), benef_time_stats['ci_lower'].to_numpy(), benef_time_stats['ci_upper'].to_numpy(),
                 color='cornflowerblue', alpha=0.2, label='95% confidence interval')

# Monthly mean of est_benef from df_benef_JA
ax1.plot(df_benef_JA['Month_Half'].to_numpy(), df_benef_JA['est_benef'].to_numpy(),
         label='Expected daily people served per kitchen', marker='s')

ax1.set_xlabel('Month')
ax1.set_ylabel('Daily benefeciaries served per kitchen')
ax1.grid(axis='x')
# Rotate the date labels on ax1 explicitly: once twinx() has been called,
# plt.xticks() acts on the twin axes, whose x-axis is hidden, so the visible
# labels would stay unrotated
ax1.tick_params(axis='x', rotation=45)

# Secondary axis for the number of distinct kitchens per month
ax2 = ax1.twinx()
ax2.plot(df_benef_JA['Month_Half'].to_numpy(), df_benef_JA['kitchen_code'].to_numpy(),
         label='Number of kitchens providing data on \n daily number of expected attendances', color='gray',
         linestyle='dashed', marker='^')
ax2.set_ylabel('Number of Reporting Kitchens')

# Add legends (one per axis, placed side by side above the plot)
ax1.legend(loc='center', bbox_to_anchor=(0.2, 1.15))
ax2.legend(loc='center', bbox_to_anchor=(0.8, 1.15))

#plt.title("Meals Served & Kitchens Open Over Time")
plt.savefig('../visualization/benef_over_time.jpg', dpi=700, bbox_inches='tight')
plt.show()
