Please try to extract: 

•	Date purchased
•	Type of food 
•	Quantities (in Kg or L) 
•	Unit price  
•	Total cost 
•	Number of intended beneficiaries*
•	Date range distributed* 


In [None]:
from openpyxl import load_workbook
import pandas as pd
import numpy as np
import datetime
import math
import re
import pathlib
from dateutil import parser
import pdfplumber
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
import calendar

## Functions

In [None]:
## Helpers used to extract the kitchen ID and the kitchen name
def has_numbers(inputString):
    """Tell whether `inputString` contains at least one digit character."""
    for character in inputString:
        if character.isdigit():
            return True
    return False

def replace_slash_inside_parentheses(text):
    """Replace every '/' located between a pair of parentheses with '-'.

    Text outside parentheses is returned unchanged.
    """
    def _dash_group(found):
        inner = found.group(1)
        return '(' + inner.replace('/', '-') + ')'

    pattern = re.compile(r'\(([^)]*)\)')
    return pattern.sub(_dash_group, text)
    
def extract_kitchen_ID_Name(form, kitchen_ids_cluster):
    """
    Extract the kitchen name and the list of kitchen IDs from the header table of an invoice.

    Parameters:
    ----------
    form: pandas.DataFrame
        Header table of the invoice (used through `.apply`, `.iterrows` and `.loc`).
        The cells labelled "Kitchen ID" and "Kitchen Name" are searched, and the value
        is read from the cell immediately to the right of the first match.

    kitchen_ids_cluster: pandas.DataFrame
        Table with a 'kitchen_code' column; only IDs present in that column are returned.

    Return:
    ---------
    kitchen_name: str
        Kitchen name with spaces removed, or the first built ID when the name cell is None.

    ID: list
        Kitchen IDs built from the form that also appear in kitchen_ids_cluster['kitchen_code'].
        `[np.nan]` is returned when the kitchen ID cell is None.
    """
    # -  First : Extract Kitchen ID
    # Boolean frame marking every cell that contains the words "Kitchen ID" (case-insensitive)
    matches = form.apply(lambda x: x.astype(str).str.contains(r'\bKitchen ID\b', case=False, na=False))
    # (row label, column position) of every matching cell
    locations = [(row_idx, col_idx) for row_idx, col in matches.iterrows() for col_idx, match in enumerate(col) if match]
    # Value sits in the cell to the right of the first match.
    # NOTE(review): raises IndexError when no cell matches or when the match is in the last column.
    kitchen_ID = form.loc[locations[0][0]].iloc[locations[0][1] + 1]
    
    # -  Second : Extract Kitchen Name
    # Same search for the words "Kitchen Name"
    matches = form.apply(lambda x: x.astype(str).str.contains(r'\bKitchen Name\b', case=False, na=False))
    # (row label, column position) of every matching cell
    locations = [(row_idx, col_idx) for row_idx, col in matches.iterrows() for col_idx, match in enumerate(col) if match]
    kitchen_name = form.loc[locations[0][0]].iloc[locations[0][1] + 1]
    
    # When the ID cell has no digit but the name cell has digits and a '/',
    # the name cell is used as the ID instead.
    if kitchen_ID is not None:
        if has_numbers(kitchen_ID) == False:
            if kitchen_name is not None:
                if has_numbers(kitchen_name):
                    if '/' in kitchen_name:
                        kitchen_ID = kitchen_name
    else:
        return kitchen_name, [np.nan]

    # Slashes inside parentheses become '-' so that the split on '/' below
    # keeps a parenthesised group in one piece
    kitchen_ID = replace_slash_inside_parentheses(kitchen_ID)

    if kitchen_ID is not None:
        split_ID = kitchen_ID.split('/')
    else:
        # NOTE(review): unreachable - a None kitchen_ID already returned above
        return kitchen_name, [np.nan]
    # Drop duplicated segments while keeping their order
    split_ID = list(dict.fromkeys(split_ID))
        
    # Fewer than three segments: no kitchen number, keep only the two-part prefix
    # NOTE(review): split_ID[1] raises IndexError when there is a single segment
    if len(split_ID) < 3:
        ID = [split_ID[0] + '/' + split_ID[1]]
    ## Same - but the third segment is empty
    elif len(split_ID[2]) == 0:
        ID = [split_ID[0] + '/' + split_ID[1]]
    ## In all the other cases 
    else:
        # Only the third segment is read; any further segment is ignored
        kitchen_real_ID = split_ID[2]
        # Comma-separated list of numbers
        if ',' in kitchen_real_ID:
            ## One ID per number around the commas (int() normalises e.g. leading zeros)
            ID = [split_ID[0] + '/' + split_ID[1] +'/' + str(int(x)) for x in kitchen_real_ID.replace('(', '').replace(')', '').split(',')]
        # Dash-separated numbers
        elif '-' in kitchen_real_ID:
            ## Cut the different numbers to list all the codes
            kitchen_real_ID_cut = re.split('-|,', kitchen_real_ID.replace('(', '').replace(')', ''))
            # Remove empty pieces
            kitchen_real_ID_cut =list(filter(None, kitchen_real_ID_cut))
            kitchen_ids_int = [int(item) for item in kitchen_real_ID_cut]

            # Build the ID list as the full range between the min and max of those ints
            ID = [f"{split_ID[0]}/{split_ID[1]}/{item}" 
              for item in range(min(kitchen_ids_int), max(kitchen_ids_int) + 1)]
            # If we have two values - list all the kitchens between the two values
            #ID = [split_ID[0] + '/' + split_ID[1] +'/' + str(item) for item in range(int(kitchen_real_ID_cut[0]), int(kitchen_real_ID_cut[1])+1)]
            if(len(kitchen_real_ID_cut) > 2):
                # More than two values: the list built above is replaced by the range
                # between the first two values plus each remaining value taken as-is
                # (remaining values are not converted with int()).
                kitchen_real_ID_cut = re.split('-|,', kitchen_real_ID.replace('(', '').replace(')', ','))
                kitchen_real_ID_cut =list(filter(None, kitchen_real_ID_cut))
                ID = [split_ID[0] + '/' + split_ID[1] +'/' + str(item) for item in range(int(kitchen_real_ID_cut[0]), int(kitchen_real_ID_cut[1])+1)]
                ID.append([split_ID[0] + '/' + split_ID[1] +'/' + str(item) for item in kitchen_real_ID_cut[2:]])
            # Flatten the nested list appended above
            ID = [item for sublist in ID for item in (sublist if isinstance(sublist, list) else [sublist])]
        # Single code
        else:
            ID = [split_ID[0] + '/' + split_ID[1] +'/' + str(int(kitchen_real_ID))]
    
    # Name without spaces; fall back to the first ID when the name cell is None
    if kitchen_name != None:
        kitchen_name = kitchen_name.replace(' ', '')
    else:
        kitchen_name = str(ID[0])

    # Known data issue: JA codes 178/179/180 actually designate kitchens 78/79/80
    ID = np.array(ID)
    ID[ID == 'KH/JA/178'] = 'KH/JA/78'
    ID[ID == 'KH/JA/179'] = 'KH/JA/79'
    ID[ID == 'KH/JA/180'] = 'KH/JA/80'

    ## Keep only the IDs that exist in the kitchen/cluster table
    ID = [id_ for id_ in ID if id_ in kitchen_ids_cluster['kitchen_code'].unique()]

    return kitchen_name, ID

In [None]:
def extract_date(form):
    """
    Extract the earliest date written next to a "Date" label in the invoice header.

    Parameters:
    ----------
    form: pandas.DataFrame
        Header table of the invoice. Every cell containing the word "Date"
        (case-insensitive) is located and the cell immediately to its right is
        read; that cell may hold several comma-separated dates.

    Returns:
    ---------
    date: datetime.datetime
        The earliest date that could be parsed. The formats tried, in order, are
        "%d/%m/%Y", "%d %B %Y" and "%m/%d/%Y".

    Raises:
    ---------
    ValueError
        When no cell next to a "Date" label holds a parsable date.
    """
    date_formats = ("%d/%m/%Y", "%d %B %Y", "%m/%d/%Y")

    # (row label, column position) of every cell containing the word "Date"
    matches = form.apply(lambda x: x.astype(str).str.contains(r'\bDate\b', case=False, na=False))
    locations = [(row_idx, col_idx) for row_idx, col in matches.iterrows() for col_idx, match in enumerate(col) if match]

    candidates = []
    for row_idx, col_idx in locations:
        row = form.loc[row_idx]
        # A label sitting in the last column has no value cell to its right
        if col_idx + 1 >= len(row):
            continue
        cell = row.iloc[col_idx + 1]
        # Empty cells (None / NaN) carry no date
        if not isinstance(cell, str):
            continue
        # strptime rejects surrounding whitespace, so "a, b" must be stripped after the split
        candidates.extend(part.strip() for part in cell.split(','))

    valid_dates = []
    for date_str in candidates:
        for date_format in date_formats:
            try:
                valid_dates.append(datetime.strptime(date_str, date_format))
            except ValueError:
                # Try the next format; entries matching none of them are ignored
                continue
            break

    if not valid_dates:
        raise ValueError("No parsable date found next to a 'Date' label in the form")
    return min(valid_dates)

In [None]:
def extract_food_text_information(form, food_information, potential_name, name_excel_form, kitchen_ids, date, kitchen_ids_cluster):
    """
    Extract the food lines of an invoice table, convert them to calories and save them to Excel.

    Parameters:
    ----------
    form : pandas.DataFrame
        Food table of the invoice. The rows between the first and the last cell
        containing the word "total" (case-insensitive) are used; the first of these
        rows becomes the header. The columns 'No.', 'Unit', 'QTY', 'Unit Price' and
        'Total' are required, plus 'Description' (an empty header is accepted for it).

    food_information : pandas.DataFrame
        Food reference table with at least 'food_name', 'possible_name' and
        'calories_per_kg' (plus the columns read by `food_to_calories`).

    potential_name : array-like of str
        Every accepted spelling of a food (e.g. bean - beans).

    name_excel_form : str
        Base file name ('....xlsx') used to build the output file name.

    kitchen_ids : list of str
        Kitchen IDs linked to the form; forwarded to `food_to_calories`.

    date : datetime
        Date of the invoice; forwarded to `food_to_calories`.

    kitchen_ids_cluster : pandas.DataFrame
        Kitchen/cluster table; forwarded to `food_to_calories`.

    Returns:
    --------
    final_data: pandas.DataFrame
        Food lines with quantities and nutritional values, as returned by `food_to_calories`.

    final_name: str
        Path of the Excel file that was written ('<stem>_<n>.xlsx', where n is the
        first unused number for that stem in the output folder).

    sum_calories: float
        Number of calories provided by the form/invoice.
    """
    output_dir = "../output/form_clearance/"
    english_words = re.compile(r'^[a-zA-Z\s]+$')

    def _english_tokens(cell):
        # A cell may hold several '/'-separated labels; keep the ones made only of
        # Latin letters and spaces, normalised to lower_snake_case
        return [word.lower().strip(' ').replace(" ", '_') for word in cell.split('/') if english_words.match(word)]

    # Locate every cell containing the word "total": the first matching row is used
    # as the header and the last matching row closes the table
    matches = form.apply(lambda x: x.astype(str).str.contains(r'\bTOTAL\b', case=False, na=False))
    locations_total = [(row_idx, col_idx) for row_idx, col in matches.iterrows() for col_idx, match in enumerate(col) if match]

    form_extract = form.iloc[locations_total[0][0]:locations_total[-1][0], :]
    form_extract = form_extract.dropna(axis=1, how='all')

    # Promote the first row to column names, without surrounding spaces
    form_extract.columns = form_extract.iloc[0]
    form_extract = form_extract.drop(form_extract.index[0])
    form_extract.columns = np.array([x.strip(' ') for x in form_extract.columns])

    # Some tables have an empty header above the description column
    if 'Description' not in form_extract.columns:
        form_extract = form_extract.rename(columns={'': 'Description'})
    form_extract = form_extract.dropna(subset=['Description'])

    form_extract['Description'] = [_english_tokens(x) for x in form_extract['Description']]

    # Drop the lines without a line number
    form_extract.loc[form_extract['No.'] == '', 'No.'] = np.nan
    form_extract = form_extract.dropna(subset=['No.'])
    # Drop the lines without any English label, then keep the first label.
    # .copy() so the assignments below never target a view of the filtered frame
    form_extract = form_extract[form_extract['Description'].apply(len) > 0].copy()
    form_extract['Description'] = [x[0] for x in form_extract['Description']]

    # Same cleaning for the unit column; units without an English label become NaN
    unit_tokens = [_english_tokens(x) for x in form_extract['Unit'].fillna('0')]
    form_extract['Unit'] = [x[0] if len(x) != 0 else np.nan for x in unit_tokens]

    # Remove lines when the quantity is not known
    form_extract = form_extract.dropna(subset=['QTY'])

    # Only keep lines whose description is a known food name (e.g. not transportation)
    form_extract = form_extract[form_extract['Description'].isin(potential_name)].copy()

    # Map alternative spellings to the canonical food name
    canonical_names = set(food_information['food_name'])

    def _canonical(name):
        if name in canonical_names:
            return name
        return food_information.loc[food_information['possible_name'] == name, 'food_name'].iloc[0]

    form_extract['Description'] = [_canonical(x) for x in form_extract['Description']]

    # Attach the reference values (calories per kg, ...) to each invoice line
    merge_data = pd.merge(food_information, form_extract, how='right', left_on='food_name', right_on='Description')

    # Select and rename the important columns
    final_data = merge_data[['Description', 'Unit', 'QTY', 'Unit Price', 'Total', 'calories_per_kg']].copy()
    final_data.columns = ['food_name', 'unit', 'quantity', 'unit_price', 'total_price', 'calories_per_kg']

    # Empty quantities are treated as missing before the calorie computation
    final_data['quantity'] = final_data['quantity'].replace('', np.nan)
    final_data, sum_calories = food_to_calories(final_data, food_information, kitchen_ids, date, kitchen_ids_cluster)

    # Find the numbers already used for this stem. '(?:\.xlsx)+' also recognises
    # files that were previously written with a doubled '.xlsx.xlsx' suffix.
    stem = name_excel_form.replace('.xlsx', '')
    numbered_file = re.compile(re.escape(stem) + r'_(\d+)(?:\.xlsx)+')
    output_path = pathlib.Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    used_numbers = []
    for existing in output_path.glob('*.xlsx'):
        found = numbered_file.fullmatch(existing.name)
        if found:
            used_numbers.append(int(found.group(1)))
    next_number = max(used_numbers) + 1 if used_numbers else 0

    # Write to exactly the name that is returned to the caller
    final_name = output_dir + stem + '_' + str(next_number) + '.xlsx'
    final_data.to_excel(final_name, index=False)

    return final_data, final_name, sum_calories

In [None]:
## Return name file where information will be saved
def define_name_excel_sheet(ID, kitchen_name, date):
    """
    Define the file names used to save the documents of one form/survey.

    Parameters:
    ----------
    ID: list
        List of all the kitchen ids included in the form/survey.

    kitchen_name: str or None
        Kitchen name of the survey/form; '/' is replaced by '_' in the file names.

    date: datetime
        Date of the survey/form; only its month is used (English month name).
        It is not read when `kitchen_name` is None.

    Returns:
    ---------
    name_excel_form: str
        'extract_food_kitchen_<label>_<Month>.xlsx', or
        'extract_food_kitchen_not_defined.xlsx' (no month) when `kitchen_name` is None.

    name_excel_picture: str
        'extract_food_images_kitchen_<label>.xlsx'.

    <label> is the kitchen name when `ID` is not empty, otherwise 'not_defined'.
    """
    if kitchen_name is None:
        return ('extract_food_kitchen_not_defined.xlsx',
                'extract_food_images_kitchen_not_defined.xlsx')

    # Without any recognised kitchen ID the name is not trusted
    label = kitchen_name.replace('/', '_') if len(ID) != 0 else 'not_defined'
    month = calendar.month_name[date.month]

    name_excel_form = 'extract_food_kitchen_' + label + '_' + month + '.xlsx'
    name_excel_picture = 'extract_food_images_kitchen_' + label + '.xlsx'
    return name_excel_form, name_excel_picture

In [None]:
def find_subfolder(folder_name):
    """
    List the deepest sub-folders found below `folder_name`.

    Parameters:
    ----------
    folder_name: str or Path
        Root folder to explore.

    Returns:
    ---------
    subfolders: list of Path
        Candidate folders that are not a parent of another candidate. Candidates
        are `folder_name` itself and every directory located at least two levels
        below it; paths containing '.ipynb_' are ignored.
    """
    path = Path(folder_name)
    subfolders = []
    # For every directory below the root, collect its child directories.
    # NOTE(review): rglob('*') never yields `path` itself, so the direct children
    # of `path` are not candidates; a leaf folder directly under the root is
    # therefore only covered when the root itself is returned.
    for folder in path.rglob('*'):
        if folder.is_dir():
            subfolders.extend(child for child in folder.iterdir() if child.is_dir())

    # The root is a candidate too (it is the only result when nothing deeper exists).
    # It comes from the argument, so the function works for any folder.
    subfolders.append(path)

    # Ignore notebook checkpoint folders
    subfolders = [sub for sub in subfolders if '.ipynb_' not in str(sub)]

    # Keep only the folders that are not a parent of another listed folder
    return [
        candidate for candidate in subfolders
        if not any(other != candidate and candidate in other.parents for other in subfolders)
    ]

In [None]:
## Link both data and calculate calories available in the different kitchens
def food_to_calories(food_data, food_information, kitchen_ids, date, kitchen_ids_cluster, price_table=None):
    """
    Translate the food lines of an invoice/form into calories, proteins and lipids.

    Parameters:
    ----------
    food_data: pandas.DataFrame
        Invoice lines with 'food_name', 'quantity', 'total_price' and 'calories_per_kg'.

    food_information: pandas.DataFrame
        Reference table with 'food_name', 'kg per unit', 'protein' and 'lipid'.

    kitchen_ids: list of str
        Kitchen IDs of the invoice; the first one gives the locality used for prices.

    date: datetime
        Invoice date, used to pick the closest known price.

    kitchen_ids_cluster: pandas.DataFrame
        Table with 'kitchen_code' and 'kitchen_locality'.

    price_table: pandas.DataFrame, optional
        Price history with 'kitchen_locality', 'type_of_food', 'date' and 'price'.
        Defaults to the module-level `price_item` table; it is only read when a
        quantity is missing.

    Returns:
    ---------
    food_data_res: pandas.DataFrame
        `food_data` with numeric 'quantity' plus 'total_calories', 'protein' and 'lipid'.
        A missing quantity is estimated as round(total_price / unit price) when the
        kitchen locality is known, and as 0 when that locality has no price for the food.

    sum_calories: float
        Sum of 'total_calories' (NaN lines are skipped by the sum).
    """
    ## Merge both data
    food_data_res = pd.merge(food_data, food_information[['food_name', 'kg per unit', 'protein', 'lipid']], on='food_name', how='left')
    food_data_res['quantity'] = (
        food_data_res['quantity']
        .astype(str)  # Ensure it's a string first
        .str.replace(" ", "", regex=False)  # Remove spaces
        .replace("-", np.nan)  # A lone hyphen means "not reported"
        .astype(float)  # Convert to float
    )

    ## Estimate the missing quantities from the total price and a known unit price
    missing_mask = food_data_res['quantity'].isna()
    if missing_mask.any() and len(kitchen_ids) != 0:
        localities = kitchen_ids_cluster.loc[kitchen_ids_cluster['kitchen_code'] == kitchen_ids[0], 'kitchen_locality']
        if len(localities) != 0:
            locality = localities.iloc[0]
            if price_table is None:
                # Resolved lazily: the global table is only needed on this path
                price_table = price_item
            complete_quantity = []
            for _, row in food_data_res.loc[missing_mask].iterrows():
                unit_price = _estimate_unit_price(price_table, row['food_name'], locality, date)
                if unit_price is None:
                    # No price at all for this food in the locality
                    complete_quantity.append(0.0)
                    continue
                total_price = _parse_amount(row['total_price'])
                if pd.isna(unit_price) or unit_price <= 0:
                    # A quantity cannot be derived from an unknown or zero price
                    complete_quantity.append(np.nan)
                    continue
                complete_quantity.append(float(np.round(total_price / unit_price, 0)))

            food_data_res.loc[missing_mask, 'quantity'] = complete_quantity

    # Calculate total calories / proteins / lipids per food line
    quantity_kg = food_data_res['quantity'].astype(float) * food_data_res['kg per unit'].astype(float)
    food_data_res['total_calories'] = quantity_kg * food_data_res['calories_per_kg'].astype(float)
    food_data_res['protein'] = quantity_kg * food_data_res['protein'].astype(float)
    food_data_res['lipid'] = quantity_kg * food_data_res['lipid'].astype(float)
    # Sum total calories
    sum_calories = food_data_res['total_calories'].sum()
    return food_data_res, sum_calories


def _parse_amount(value):
    """Convert an amount such as '1 250,000' (spaces and commas as separators) to float; NaN when not numeric."""
    try:
        return float(str(value).replace(" ", "").replace(",", ""))
    except ValueError:
        return np.nan


def _estimate_unit_price(prices, food_name, locality, target_date):
    """Return the best known unit price of a food, or None when the locality has no price row for it.

    Order of preference: the price of the locality at the closest date; else the
    mean price over all localities on the closest date; else the mean price of
    the food over the whole table. The result may still be NaN.
    """
    food_prices = prices.loc[prices['type_of_food'] == food_name].reset_index(drop=True)
    local_prices = food_prices.loc[food_prices['kitchen_locality'] == locality].reset_index(drop=True)
    if len(local_prices) == 0:
        return None

    # After reset_index the label returned by idxmin is unambiguous for .loc
    closest = (local_prices['date'] - target_date).abs().idxmin()
    price = local_prices.loc[closest, 'price']

    if pd.isna(price):
        # Fall back to the other localities, on the date closest to the invoice
        closest = (food_prices['date'] - target_date).abs().idxmin()
        same_day = food_prices.loc[food_prices['date'] == food_prices.loc[closest, 'date']]
        price = same_day['price'].mean()

    if pd.isna(price):
        # Last resort: average price of the food over the whole period
        price = food_prices['price'].mean()
    return price

In [None]:
## Function to swap day and month, as some dates use the US format and others the UK format
def adjust_and_swap(date):
    """Swap day and month for dates falling before July 2024.

    Dates from July onwards, or from another year, are returned unchanged.
    When the swapped date is not a valid calendar date, `pd.NaT` is returned.
    """
    try:
        before_july_2024 = date.month < 7 and date.year == 2024
        if not before_july_2024:
            return date
        # The day becomes the month and the month becomes the day
        return date.replace(day=date.month, month=date.day)
    except ValueError:
        # The swap produced an impossible date (e.g. a month above 12)
        return pd.NaT
    
# Kitchen codes KH/JA/120 to KH/JA/134 (15 kitchens)
mayo_kitchen = ['KH/JA/' + str(number) for number in range(120, 135)]

## PART 1: Clean, extract information and save the different survey in the right folder

In [None]:
### Locate where the invoices are saved - and find all the subfolders
folder_name = '../../1. Data available/ISHTM_Hadhreen/Invoice_Forms_All_JA/'
sub_folders = find_subfolder(folder_name)

### Read ids_cluster
kitchen_ids_cluster = pd.read_excel('../output/kitchen_ids_cluster.xlsx')

## Read item prices (read by food_to_calories when a quantity is missing)
price_item = pd.read_excel('../output/clean_item_price_time.xlsx')
# Convert the 'date' column to datetime format
price_item['date'] = pd.to_datetime(price_item['date'], format='%d/%m/%Y')

## Read the food-to-calories document and normalise the names to lower_snake_case
food_information = pd.read_excel('../../1. Data available/ISHTM_Hadhreen/Food to Calories/food_names_and_calories.xlsx')
food_information['possible_name'] = [x.lower().strip(' ').replace(" ", '_') if isinstance(x, str) else np.nan for x in food_information['possible_name']]
food_information['food_name'] = np.array([x.lower().strip(' ').replace(" ", '_') for x in food_information['food_name']])
# Every accepted spelling: alternative names followed by canonical names
potential_name = np.append(np.array(food_information['possible_name'].dropna()), np.array(food_information['food_name']))

metadata_columns = ['kitchen_name', 'kitchen_ids', 'nb_kitchen', 'date', 'excel_food_saved', 'total_cost',
                    'nb_type_food', 'total_calories_available', 'g_proteins', 'g_lipids', 'old_file_name']
file_path = Path("../output/form_clearance/metadata.xlsx")

for sub in sub_folders:
    information = []
    for file_name in sub.rglob('*.pdf'):
        print(file_name)
        # Collect every table of every page of the PDF
        with pdfplumber.open(file_name) as pdf:
            all_data = []
            for page in pdf.pages:
                table = page.extract_tables()
                if table:
                    all_data.extend(table)

        # Without tables there is nothing to extract: skip the file instead of
        # reusing the tables of the previous PDF
        if len(all_data) == 0:
            print('nulle')
            continue
        # Every layout handled below reads at least two tables
        if len(all_data) < 2:
            print('skipped (only one table found):', file_name)
            continue

        first_cell = all_data[0][0][0]
        if isinstance(first_cell, str) and '\n' in first_cell:
            # The first table is a multi-line banner: header and food tables follow it
            form_name = all_data[1]
            form_food = all_data[2]
        elif 'Team Leader' in all_data[1][0]:
            # Header and food lines share the first table: the food part runs from
            # the 'No.' row to the first empty or 'TOTAL' row
            form_name = all_data[0]
            start_index = None
            end_index = None

            for i, row in enumerate(all_data[0]):
                if start_index is None and row[0] == 'No.':
                    start_index = i
                elif start_index is not None and (row[0] is None or row[0] == 'TOTAL'):
                    end_index = i
                    break

            if start_index is None:
                print('skipped (no "No." row found):', file_name)
                continue
            if end_index is None:
                # No closing row: keep everything down to the end of the table
                end_index = len(all_data[0]) - 1

            form_food = all_data[0][start_index:(end_index + 1)]

        else:
            form_name = all_data[0]
            form_food = all_data[1]

        # Convert the extracted tables into DataFrames (default integer columns)
        form_name = pd.DataFrame(form_name)
        form_food = pd.DataFrame(form_food)

        ## Extract information - Name where Kitchen is + kitchens involved
        kitchen_name, kitchen_ids = extract_kitchen_ID_Name(form_name, kitchen_ids_cluster)

        ## Extract date
        date = extract_date(form_name)

        # Return name where information will be saved
        name_excel_form, name_excel_picture = define_name_excel_sheet(kitchen_ids, kitchen_name, date)

        ## Extract and save information (food from text)
        food_data, food_excel_name, sum_calories = extract_food_text_information(form_food, food_information, potential_name,
                                                                                 name_excel_form, kitchen_ids, date, kitchen_ids_cluster)
        food_total = [float(x.replace(" ", "").replace(",", "")) for x in food_data['total_price']]
        total_cost = sum(food_total)
        different_type_of_food = np.unique(food_data['food_name'])
        proteins = sum(food_data['protein'])
        lipids = sum(food_data['lipid'])

        ## Save information from the survey
        information.append([kitchen_name, kitchen_ids, len(kitchen_ids), date, food_excel_name, total_cost,
                            len(different_type_of_food), sum_calories, proteins, lipids, file_name])

    # A folder without any usable PDF adds nothing to the metadata
    if not information:
        continue

    all_data = pd.DataFrame(information, columns=metadata_columns)
    # Append to the metadata written for the previous folders / runs
    if file_path.exists():
        metadata = pd.read_excel(file_path)
        results = pd.concat([metadata, all_data], ignore_index=True)
        results.to_excel(file_path, index=False)
    else:
        all_data.to_excel(file_path, index=False)

## PART 2: Clean the metadata to compute the number of days between two invoices

In [None]:
## Read the metadata written by PART 1
metadata_invoice = pd.read_excel('../output/form_clearance/metadata.xlsx')
# Swap day/month for the dates before July 2024 (see adjust_and_swap); impossible swaps become NaT
metadata_invoice['date'] = [adjust_and_swap(date) for date in metadata_invoice['date']]
# Some kitchens have the wrong names: 'KH/MA' stands for the 15 kitchens of mayo_kitchen
metadata_invoice.loc[metadata_invoice['kitchen_name'] == 'KH/MA', 'kitchen_ids'] = str(mayo_kitchen)
metadata_invoice.loc[metadata_invoice['kitchen_name'] == 'KH/MA', 'nb_kitchen'] = 15
metadata_invoice.loc[metadata_invoice['kitchen_name'] == 'KH/JA', 'kitchen_ids'] = "['KH/JA/159']"
# Codes written with '/MA/' are rewritten as '/JA/'
metadata_invoice['kitchen_ids'] = [code.replace("/MA/","/JA/") for code in metadata_invoice['kitchen_ids']]
## Remove the rows without any known kitchen id
print('Before cleaning:', len(metadata_invoice))
metadata_invoice = metadata_invoice.loc[metadata_invoice['kitchen_ids'] != '[]']
metadata_invoice = metadata_invoice.loc[metadata_invoice['kitchen_ids'] != '[nan]']
print('After cleaning:', len(metadata_invoice))

## A number of kitchens equal to 0 is counted as 1
metadata_invoice.loc[metadata_invoice['nb_kitchen'] == 0, 'nb_kitchen'] = 1

# Remove the invoices whose number of calories is 0
metadata_invoice = metadata_invoice.loc[metadata_invoice['total_calories_available'] != 0]
print('After cleaning:', len(metadata_invoice))

## Also remove the invoices reporting 1,000,000 calories or fewer (considered under-reported)
metadata_invoice = metadata_invoice.loc[metadata_invoice['total_calories_available'] > 1000000]

## First expand the data so that each row holds a single kitchen instead of a cluster
# Step 1: Convert the 'kitchen_ids' column from its string form back to a list
# NOTE(review): eval executes whatever text is stored in the Excel cell - only safe
# while metadata.xlsx is produced by this notebook; confirm before using other files
metadata_invoice['kitchen_ids'] = metadata_invoice['kitchen_ids'].apply(eval)  # Using eval to convert string to list
# Keep the full list of the cluster next to each exploded row
metadata_invoice['kitchen_cluster'] = metadata_invoice['kitchen_ids']
# Step 2: Use explode to split each ID into separate rows
metadata_invoice = metadata_invoice.explode('kitchen_ids')

# Step 3: Reset index for a clean DataFrame
metadata_invoice = metadata_invoice.reset_index(drop=True)

## Then calculate the gap between two invoices
# Sort data by kitchen id and date
metadata_invoice = metadata_invoice.sort_values(by=['kitchen_ids', 'date']).reset_index(drop=True)

# Days until the next invoice of the same kitchen (NaN for the last invoice of a kitchen)
metadata_invoice['days_to_next'] = metadata_invoice.groupby('kitchen_ids')['date'].diff(-1).dt.days.abs()

# Missing gaps are filled with, and longer gaps capped at, 7 days (min scenario) or 14 days (max scenario)
metadata_invoice['min_days_to_next'] = metadata_invoice['days_to_next'].fillna(7).clip(upper=7)
metadata_invoice['max_days_to_next'] = metadata_invoice['days_to_next'].fillna(14).clip(upper=14)
# Gaps shorter than 7 days are replaced by 5 days in both scenarios
metadata_invoice.loc[metadata_invoice['min_days_to_next'] < 7, 'min_days_to_next'] = 5
metadata_invoice.loc[metadata_invoice['max_days_to_next'] < 7, 'max_days_to_next'] = 5

# Create start_date and end_date columns
metadata_invoice['start_date'] = metadata_invoice['date']

## When multiple invoices share the same date and kitchen code, they all get the largest gap
metadata_invoice['min_days_to_next'] = metadata_invoice.groupby(['kitchen_ids', 'date'])['min_days_to_next'].transform('max')
metadata_invoice['max_days_to_next'] = metadata_invoice.groupby(['kitchen_ids', 'date'])['max_days_to_next'].transform('max')

# Calculate end_date - min: each invoice covers at most 7 days - max: the food is spread over up to 14 days
metadata_invoice['min_end_date'] = metadata_invoice['start_date'] + pd.to_timedelta(metadata_invoice['min_days_to_next'], unit='D')
metadata_invoice['max_end_date'] = metadata_invoice['start_date'] + pd.to_timedelta(metadata_invoice['max_days_to_next'], unit='D')

## Calories available per day and per invoice row: the shortest period gives the
## highest daily value and the longest period the lowest
metadata_invoice['max_total_calories_per_days'] = round(metadata_invoice['total_calories_available'] / metadata_invoice['min_days_to_next'], 3)
metadata_invoice['min_total_calories_per_days'] = round(metadata_invoice['total_calories_available'] / metadata_invoice['max_days_to_next'], 3)

## Save the excel file
metadata_invoice.to_excel('../output/cleaned_metadata.xlsx', index=False)

In [None]:
# Number of distinct values in the 'kitchen_ids' column of metadata_invoice
print('Number of different kitchens: ', len(metadata_invoice['kitchen_ids'].unique()))

## Visualization

In [None]:
plt.figure(figsize=(12, 8))

# Keep only the invoices with a known gap to the next invoice, shorter than 121 days.
# .copy() so the column assignments below never target a view of the original frame.
# NOTE(review): metadata_invoice itself is reassigned, so the following cells only
# see these filtered rows - confirm this is intended.
metadata_invoice = metadata_invoice.loc[metadata_invoice['days_to_next'] < 121].copy()
#metadata_invoice = metadata_invoice.loc[metadata_invoice['days_to_next'] != 0]

## When multiple invoices share the same date and kitchen code, they all get the largest gap
metadata_invoice['days_to_next'] = metadata_invoice.groupby(['kitchen_ids', 'date'])['days_to_next'].transform('max')
# A gap of 0 days doesn't make sense, especially for divisions
metadata_invoice.loc[metadata_invoice['days_to_next'] == 0, 'days_to_next'] = 1

# Calculate end_date
metadata_invoice['end_date'] = metadata_invoice['start_date'] + pd.to_timedelta(metadata_invoice['days_to_next'], unit='D')

kitchens = metadata_invoice['kitchen_ids'].unique()
# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib releases removed
colors = plt.get_cmap('Set2', len(kitchens))

# Plot each kitchen's active period: one horizontal segment per invoice
for i, kitchen in enumerate(kitchens):
    kitchen_data = metadata_invoice[metadata_invoice['kitchen_ids'] == kitchen]
    for _, row in kitchen_data.iterrows():
        plt.plot([row['start_date'], row['end_date']], [i, i], marker='o', color=colors(i), label=kitchen if i == 0 else None)

plt.yticks(range(len(kitchens)), kitchens, fontsize=7, rotation=20)
plt.xticks(rotation=45)
#plt.title('Invoice Coverage Per Kitchen (Gantt Chart)')
plt.xlabel('Date')
plt.ylabel('Kitchen Code', fontsize=12)
plt.grid(axis='x')
plt.savefig('../visualization/kitchen_invoices_availability_gantt_without_imputation.png', dpi=300)
#plt.legend(title='Kitchen Code', loc='upper right')
plt.show()

In [None]:
plt.figure(figsize=(12, 8))

kitchens = metadata_invoice['kitchen_ids'].unique()
# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib releases removed
colors = plt.get_cmap('Set2', len(kitchens))

# Plot each kitchen's active period: 'o' segments end at min_end_date,
# 'x' segments end at max_end_date
for i, kitchen in enumerate(kitchens):
    kitchen_data = metadata_invoice[metadata_invoice['kitchen_ids'] == kitchen]
    for _, row in kitchen_data.iterrows():
        plt.plot([row['start_date'], row['min_end_date']], [i, i], marker='o', color=colors(i), label=kitchen if i == 0 else None)
        plt.plot([row['start_date'], row['max_end_date']], [i, i], marker='x', color=colors(i), label=kitchen if i == 0 else None)

plt.yticks(range(len(kitchens)), kitchens, fontsize=7, rotation=20)
plt.xticks(rotation=45)
#plt.title('Invoice Coverage Per Kitchen (Gantt Chart)')
plt.xlabel('Date')
plt.ylabel('Kitchen Code', fontsize=12)
plt.grid(axis='x')
plt.savefig('../visualization/kitchen_invoices_availability_gantt.png', dpi=300)
#plt.legend(title='Kitchen Code', loc='upper right')
plt.show()

In [None]:
# Mean of the lower daily-calorie estimate for each kitchen code
mean_per_kitchen = metadata_invoice.groupby('kitchen_ids')['min_total_calories_per_days'].mean()
calories_kitchen = mean_per_kitchen.reset_index()

# One bar per kitchen
plt.figure(figsize=(12, 6))
sns.barplot(
    data=calories_kitchen,
    x='kitchen_ids',
    y='min_total_calories_per_days',
    color='lightsteelblue',
)
plt.xlabel('Kitchen Code')
plt.ylabel('Average Calories Per Day')
plt.xticks(rotation=45, fontsize=8)
plt.savefig('../visualization/kitchen_average_calories.png', dpi=300)
plt.show()

In [None]:
## Calories over time: daily totals of the lower and upper estimates, per invoice date
grouped_by_date = metadata_invoice.groupby('date')
calories_time = grouped_by_date['min_total_calories_per_days'].sum().reset_index()
calories_time_max = grouped_by_date['max_total_calories_per_days'].sum().reset_index()

# Put both estimates in one table so that they share the same x positions
calories_combined = pd.merge(calories_time, calories_time_max, on='date')

# Width of each bar and dates to display
bar_width = 0.4
dates = calories_combined['date']

plt.figure(figsize=(14, 10))

# One integer position per date; the two bars sit on either side of it
x = range(len(dates))
half_width = bar_width/2
left_positions = [pos - half_width for pos in x]
right_positions = [pos + half_width for pos in x]

plt.bar(left_positions, calories_combined['min_total_calories_per_days'],
        width=bar_width, label='Min Calories', color='orange')
plt.bar(right_positions, calories_combined['max_total_calories_per_days'],
        width=bar_width, label='Max Calories', color='lightsteelblue')

# Label only every 10th date on the x-axis
tick_positions = range(0, len(dates), 10)
tick_labels = dates.iloc[tick_positions].dt.strftime('%Y-%m-%d')
plt.xticks(tick_positions, tick_labels, rotation=45)

# Labels and legend
plt.xlabel('Date')
plt.ylabel('Total Calories')
plt.legend()
plt.grid(axis='x')

# Save and show
plt.savefig('../visualization/kitchen_tot_cals_per_days.png', dpi=300)
plt.show()
