<h4> Dividing HRW data per month </h4>

In [None]:
import pandas as pd

# Load the cleaned recipe table, derive a calendar-month column from 'date',
# and write one CSV per month (recipes_month_1.csv ... recipes_month_12.csv).
df = pd.read_csv('clean_recipe_web.csv')

# errors='coerce' turns unparseable dates into NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Rows whose date is NaT get a NaN month, match none of the 1-12 filters
# below, and therefore land in no monthly file. Report them rather than
# letting them disappear silently.
undated_rows = int(df['date'].isna().sum())
if undated_rows:
    print(f"Warning: {undated_rows} row(s) have a missing or unparseable date "
          "and will not be written to any monthly file")

df['month'] = df['date'].dt.month

columns_to_drop = ['doc_url', 'h', 'w', 'identified_language', 'ocr_score', 'quant', 'tags']
df_cleaned = df.drop(columns=columns_to_drop)

# Relabel whatever column is first after the drop as the serial-number column.
df_cleaned.rename(columns={df_cleaned.columns[0]: 'Sr no'}, inplace=True)

# A file is written for every month, even when it has no rows.
for month in range(1, 13):
    df_month = df_cleaned[df_cleaned['month'] == month]
    df_month.to_csv(f'recipes_month_{month}.csv', index=False)

    print(f"Data for month {month} has been saved as 'recipes_month_{month}.csv'")


<h4> Seasonal Usage </h4>

In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# Cookbooks dataset: a single CSV.
cookbook_file = "cookbook_months.csv"
cookbook_df = pd.read_csv(cookbook_file)

# HRW dataset: twelve per-month CSVs stacked into one frame with a fresh index.
newspaper_files = [
    f"recipes_month_{month_number}.csv" for month_number in range(1, 13)
]
newspaper_df_list = list(map(pd.read_csv, newspaper_files))
newspaper_df = pd.concat(newspaper_df_list, ignore_index=True)

def clean_ingredients(ingredient_str):
    """Convert a comma-separated ingredient string into a set of names.

    Each comma-separated piece is stripped of surrounding whitespace and
    lower-cased. Pieces that are empty after stripping (from doubled or
    trailing commas, or a blank string) are discarded, so an empty name can
    never count as an ingredient shared between two recipes.

    Args:
        ingredient_str: Comma-separated ingredient names, or a missing value
            (``None`` / NaN).

    Returns:
        A set of cleaned ingredient names; an empty set for a missing value.
    """
    if pd.isna(ingredient_str):
        return set()
    cleaned = (ing.strip().lower() for ing in ingredient_str.split(","))
    return {ing for ing in cleaned if ing}

# Replace each raw ingredient string with a set of normalized ingredient names.
for recipe_frame in (cookbook_df, newspaper_df):
    recipe_frame["Ingredients"] = recipe_frame["Ingredients"].apply(clean_ingredients)

# For every cookbook month, the union of all ingredients listed that month.
cookbooks_ingredients_per_month = cookbook_df.groupby("month")["Ingredients"].apply(
    lambda ingredient_sets: set().union(*ingredient_sets)
)

alignment_results = []

for month, ideal_ingredients in cookbooks_ingredients_per_month.items():
    month_newspaper = newspaper_df[newspaper_df["month"] == month]
    total_recipes = len(month_newspaper)

    # A recipe counts as aligned when it shares MORE than one ingredient with
    # the cookbook set for the same month.
    # NOTE(review): confirm that "> 1" (rather than ">= 1") is the intended threshold.
    is_aligned = month_newspaper["Ingredients"].apply(
        lambda recipe_ingredients: len(recipe_ingredients & ideal_ingredients) > 1
    )
    matching_recipes = is_aligned.sum()

    # Months without any newspaper recipes are reported as 0 percent.
    if total_recipes > 0:
        alignment_percentage = (matching_recipes / total_recipes) * 100
    else:
        alignment_percentage = 0

    alignment_results.append(
        {"month": month, "alignment_percentage": alignment_percentage}
    )

alignment_df = pd.DataFrame(alignment_results)

print(alignment_df)