In [None]:
%pip install isodate

In [99]:
import isodate
# Function to convert ISO 8601 duration to human-readable format
def convert_iso_duration(iso_duration):
    # Handle non-string values (e.g., None or NaN)
    if not isinstance(iso_duration, str):
        return "unknown"
    
    try:
        # Parse the ISO 8601 duration string using isodate
        duration = isodate.parse_duration(iso_duration)
        
        # Extract total seconds and break them into days, hours, minutes, seconds
        total_seconds = int(duration.total_seconds())
        days, remainder = divmod(total_seconds, 86400)  # 1 day = 86400 seconds
        hours, remainder = divmod(remainder, 3600)  # 1 hour = 3600 seconds
        minutes, seconds = divmod(remainder, 60)  # 1 minute = 60 seconds

        # Build human-readable string
        readable = []
        if days > 0:
            readable.append(f"{days} day{'s' if days > 1 else ''}")
        if hours > 0:
            readable.append(f"{hours} hour{'s' if hours > 1 else ''}")
        if minutes > 0:
            readable.append(f"{minutes} minute{'s' if minutes > 1 else ''}")
        if seconds > 0:
            readable.append(f"{seconds} second{'s' if seconds > 1 else ''}")

        return ", ".join(readable)
    except Exception as e:
        # If there is an error (e.g., invalid format), return an error message
        return "unknown"
# Example usage
iso_duration = "PT24H"  # 24 hours
print(convert_iso_duration(None))  # Output: 24 hours

unknown


In [100]:
# Define a cleaning function
def clean_ingredients(ingredient_str):
    # Remove 'c (' and ')', and clean double quotes and spaces
    ingredient_str = ingredient_str.replace('c(', '').replace(')', '')  # Remove 'c (' and ')'
    ingredient_str = ingredient_str.replace('"', '')  # Remove double quotes
    return ingredient_str

In [101]:
#convert RecipeIngredientParts and RecipeIngredientQuantities to a list
import ast

def parse_list(r_string):
    try:
        if r_string == "character(0)":
            return []  # Handle character(0) as an empty list
        return ast.literal_eval(r_string.replace('c(', '[').replace(')', ']'))
    except Exception:
        return []  # Return None for invalid rows


In [102]:
from fractions import Fraction

# Function to convert a string to a float, handling fractions like "1/4"
def convert_to_float(value):
    try:
        # Try to parse the string as a fraction
        return float(Fraction(value))
    except ValueError:
        # If it fails (e.g., not a fraction or float), return 0.0
        return 0.0
    
# Convert Quantities from strings to floats (handling fractions and decimals)
#data_sup['Quantities'] = data_sup['Quantities'].apply(
#    lambda x: [convert_to_float(q) for q in x]  # Convert each string in the list
#)

1. Getting the Data

In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:.2f}'.format

# Load dataset
data = pd.read_csv('recipes.csv')
print("Dataset Loaded Successfully")


Dataset Loaded Successfully


2. Exploring the Data

In [104]:
# Overview of the dataset
print("Dataset Information:")
data.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   RecipeId                    522517 non-null  int64  
 1   Name                        522517 non-null  object 
 2   AuthorId                    522517 non-null  int64  
 3   AuthorName                  522517 non-null  object 
 4   CookTime                    439972 non-null  object 
 5   PrepTime                    522517 non-null  object 
 6   TotalTime                   522517 non-null  object 
 7   DatePublished               522517 non-null  object 
 8   Description                 522512 non-null  object 
 9   Images                      522516 non-null  object 
 10  RecipeCategory              521766 non-null  object 
 11  Keywords                    505280 non-null  object 
 12  RecipeIngredientQuantities  522514 non-null  object

In [105]:
# Shape and statistical description
print("\nDataset Shape:", data.shape)
print("\nStatistical Summary:")
print(data.describe(include='all'))


Dataset Shape: (522517, 28)

Statistical Summary:
        RecipeId          Name      AuthorId       AuthorName CookTime  \
count  522517.00        522517     522517.00           522517   439972   
unique       NaN        438188           NaN            56793      490   
top          NaN  Banana Bread           NaN  ratherbeswimmin    PT30M   
freq         NaN           186           NaN             7742    50715   
mean   271821.44           NaN   45725847.89              NaN      NaN   
std    155495.88           NaN  292971448.67              NaN      NaN   
min        38.00           NaN         27.00              NaN      NaN   
25%    137206.00           NaN      69474.00              NaN      NaN   
50%    271758.00           NaN     238937.00              NaN      NaN   
75%    406145.00           NaN     565828.00              NaN      NaN   
max    541383.00           NaN 2002886148.00              NaN      NaN   

       PrepTime TotalTime         DatePublished  \
count    

In [None]:
# Check for missing values
missing_values = data.isnull().sum()
missing_percentage = (missing_values / len(data)) * 100
print("\nMissing Values Count and Percentage:")
print(missing_percentage)

In [None]:
data.head(10)

3. Data Cleaning and Preprocessing

In [106]:
# Feature extraction
selected_columns = [
    'RecipeId', 'Name', 'CookTime','RecipeServings','RecipeCategory','RecipeIngredientQuantities','RecipeIngredientParts', 'AggregatedRating','Calories', 'FatContent', 'SaturatedFatContent',
    'CholesterolContent', 'SodiumContent', 'CarbohydrateContent','FiberContent', 'SugarContent', 'ProteinContent', 'RecipeInstructions'
]
data_extracted = data[selected_columns]
print("\nData Extracted Completed. Preview:")
data_extracted.shape


Data Extracted Completed. Preview:


(522517, 18)

In [107]:
data_extracted.isnull().sum()

RecipeId                           0
Name                               0
CookTime                       82545
RecipeServings                182911
RecipeCategory                   751
RecipeIngredientQuantities         3
RecipeIngredientParts              0
AggregatedRating              253223
Calories                           0
FatContent                         0
SaturatedFatContent                0
CholesterolContent                 0
SodiumContent                      0
CarbohydrateContent                0
FiberContent                       0
SugarContent                       0
ProteinContent                     0
RecipeInstructions                 0
dtype: int64

In [117]:
# Handling missing values
data_cleaned = data_extracted.copy() #data_cleaned.dropna(inplace=True)
data_cleaned.CookTime = data_cleaned.CookTime.apply(convert_iso_duration)
data_cleaned.CookTime.head(20)

0          1 day
1     25 minutes
2      5 minutes
3     20 minutes
4     30 minutes
5        2 hours
6      3 minutes
7     50 minutes
8        unknown
9      9 minutes
10       unknown
11    30 minutes
12    50 minutes
13    25 minutes
14       unknown
15    45 minutes
16    50 minutes
17       2 hours
18        1 hour
19       unknown
Name: CookTime, dtype: object

In [118]:
data_cleaned = data_cleaned.dropna(subset=["RecipeCategory"])

In [119]:
data_cleaned = data_cleaned.dropna(subset=["RecipeIngredientQuantities"])
data_cleaned['RecipeIngredientParts'] = data_cleaned['RecipeIngredientParts'].apply(clean_ingredients)


In [120]:
data_cleaned["AggregatedRating"] = data_cleaned["AggregatedRating"].fillna(data_cleaned["AggregatedRating"].mean())

In [121]:
# Apply parsing safely
data_cleaned["RecipeInstructions"] = data_cleaned["RecipeInstructions"].apply(parse_list)
data_cleaned["RecipeIngredientQuantities"] = data_cleaned["RecipeIngredientQuantities"].apply(parse_list)


In [122]:
data_cleaned["RecipeServings"] = pd.to_numeric(data_cleaned["RecipeServings"], errors='coerce')

In [123]:
data_RecipeServings_nan = data_cleaned.loc[data_cleaned['RecipeServings'].isnull() == True]
data_RecipeServings_nan.head()


Unnamed: 0,RecipeId,Name,CookTime,RecipeServings,RecipeCategory,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeInstructions
8,46,A Jad - Cucumber Pickle,unknown,,Vegetable,"[1/2, 5, 2, 1, 1, 1]","rice vinegar, haeo",5.0,4.3,0.0,0.0,0.0,0.7,1.1,0.2,0.2,0.1,"[Slice the cucumber in four lengthwise, then s..."
9,47,Butter Pecan Cookies,9 minutes,,Dessert,"[3/4, 1/2, 1, 1, 1, 2, 1]","butter, brown sugar, granulated sugar, vanilla...",4.0,69.0,5.6,1.4,6.3,15.0,4.5,0.6,1.6,0.8,"[Preheat oven to 350 degrees., Cream butter in..."
12,50,Biscotti Di Prato,50 minutes,,Dessert,"[3 3/4, 2, 1, 1/4, 4, 1, 1/2, 1 2/3, 1, 1]","flour, sugar, baking powder, salt, eggs, vanil...",4.5,89.4,2.6,0.3,16.6,23.5,14.5,0.8,7.3,2.3,"[EGG WASH 1 Egg, large 1 tsp Water In the bowl..."
15,53,Jimmy G's Carrot Cake,45 minutes,,Dessert,[],"all-purpose flour, sugar, baking powder, bakin...",3.0,372.9,10.6,3.2,62.0,190.0,66.2,3.4,40.9,6.6,[Grease and lightly flour 2 9x1 1/2-inch round...
23,61,Brownie Heart Cake,42 minutes,,Dessert,[],"brown sugar, butter, vanilla, eggs, all-purpos...",4.63,4713.8,286.5,144.2,1097.5,2157.8,509.9,29.0,392.5,71.7,[CAKE: Grease 5 cup heart shaped pan; dust wi...


In [124]:
# Nutritional maximum thresholds for a single meal (assuming 3 meals a day)
max_thresholds_per_meal = {
    'Calories': 2000 / 3,  # Daily caloric intake divided by 3 meals
    'FatContent': 70 / 3,  # Fat content per meal
    'SaturatedFatContent': 22 / 3,  # Saturated fat per meal
    'CholesterolContent': 300 / 3,  # Cholesterol per meal
    'SodiumContent': 2300 / 3,  # Sodium per meal
    'CarbohydrateContent': 325 / 3,  # Carbohydrates per meal
    'FiberContent': 25 / 3,  # Fiber per meal
    'SugarContent': 50 / 3,  # Sugar per meal
    'ProteinContent': 175 / 3  # Protein per meal
}

data_cleaned_copy = data_cleaned.copy()

# Appliquer les filtres en fonction des seuils nutritionnels
def clean_recipe(row):
    for column, maximum in max_thresholds_per_meal.items():
    
        if column in row and row[column] >= maximum:
            return None  # Si une valeur dépasse le seuil, on supprime cette ligne
    # Si toutes les valeurs sont inférieures au seuil, on met 'RecipeServings' à 1
    row['RecipeServings'] = 1
    return row

data_cleaned_copy[data_cleaned_copy['RecipeServings'].isnull()].apply(clean_recipe, axis=1)


# Supprimer les lignes où 'clean_recipe' a renvoyé None (lignes à supprimer)
data_cleaned_copy = data_cleaned_copy.dropna(subset=['RecipeServings'])


# Afficher les premières lignes du DataFrame nettoyé
data_cleaned_copy.head()


Unnamed: 0,RecipeId,Name,CookTime,RecipeServings,RecipeCategory,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1 day,4.0,Frozen Desserts,"[4, 1/4, 1, 1]","blueberries, granulated sugar, vanilla yogurt,...",4.5,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39,Biryani,25 minutes,6.0,Chicken Breast,[],"saffron, milk, hot green chili peppers, onions...",3.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,[Soak saffron in warm milk for 5 minutes and p...
2,40,Best Lemonade,5 minutes,4.0,Beverages,[],"sugar, lemons, rind of, lemon, zest of, fresh ...",4.5,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41,Carina's Tofu-Vegetable Kebabs,20 minutes,2.0,Soy/Tofu,"[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1/2, 1/...","extra firm tofu, eggplant, zucchini, mushrooms...",4.5,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,"[Drain the tofu, carefully squeezing out exces..."
4,42,Cabbage Soup,30 minutes,4.0,Vegetable,"[46, 4, 1, 2, 1]","plain tomato juice, cabbage, onion, carrots, c...",4.5,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,"[Mix everything together and bring to a boil.,..."


In [126]:
data_cleaned_copy.shape


(339121, 18)

In [127]:
data_cleaned_copy.isnull().sum()

RecipeId                      0
Name                          0
CookTime                      0
RecipeServings                0
RecipeCategory                0
RecipeIngredientQuantities    0
RecipeIngredientParts         0
AggregatedRating              0
Calories                      0
FatContent                    0
SaturatedFatContent           0
CholesterolContent            0
SodiumContent                 0
CarbohydrateContent           0
FiberContent                  0
SugarContent                  0
ProteinContent                0
RecipeInstructions            0
dtype: int64

In [128]:
# Afficher les premières lignes du DataFrame après division
data_cleaned_copy.head()

Unnamed: 0,RecipeId,Name,CookTime,RecipeServings,RecipeCategory,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeInstructions
0,38,Low-Fat Berry Blue Frozen Dessert,1 day,4.0,Frozen Desserts,"[4, 1/4, 1, 1]","blueberries, granulated sugar, vanilla yogurt,...",4.5,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39,Biryani,25 minutes,6.0,Chicken Breast,[],"saffron, milk, hot green chili peppers, onions...",3.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,[Soak saffron in warm milk for 5 minutes and p...
2,40,Best Lemonade,5 minutes,4.0,Beverages,[],"sugar, lemons, rind of, lemon, zest of, fresh ...",4.5,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41,Carina's Tofu-Vegetable Kebabs,20 minutes,2.0,Soy/Tofu,"[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1/2, 1/...","extra firm tofu, eggplant, zucchini, mushrooms...",4.5,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,"[Drain the tofu, carefully squeezing out exces..."
4,42,Cabbage Soup,30 minutes,4.0,Vegetable,"[46, 4, 1, 2, 1]","plain tomato juice, cabbage, onion, carrots, c...",4.5,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,"[Mix everything together and bring to a boil.,..."


In [129]:
print(data_cleaned_copy.shape)

(339121, 18)


4. Filtering Nutritional Information for Recommendations

In [135]:
# Nutritional maximum thresholds for a single meal (assuming 3 meals a day)
healthy_thresholds_per_meal = {
    'FatContent': 70 / 3,  # Fat content per meal
    'SaturatedFatContent': 22 / 3,  # Saturated fat per meal
    'CholesterolContent': 300 / 3,  # Cholesterol per meal
    'SodiumContent': 2300 / 3,  # Sodium per meal
    'CarbohydrateContent': 325 / 3,  # Carbohydrates per meal
    'FiberContent': 25 / 3,  # Fiber per meal
    'SugarContent': 50 / 3,  # Sugar per meal
    'ProteinContent': 175 / 3  # Protein per meal
}

# Nutritional thresholds for "Moderate" and "Unhealthy"
moderate_thresholds_per_meal = {
    'FatContent': 70 / 2,  # Fat content for moderate (between healthy and unhealthy)
    'SaturatedFatContent': 22 / 2,  # Saturated fat for moderate
    'CholesterolContent': 300 / 2,  # Cholesterol for moderate
    'SodiumContent': 2300 / 2,  # Sodium for moderate
    'CarbohydrateContent': 325 / 2,  # Carbohydrates for moderate
    'FiberContent': 25 / 2,  # Fiber for moderate
    'SugarContent': 50 / 2,  # Sugar for moderate
    'ProteinContent': 175 / 2  # Protein for moderate
}

# Create a copy of the original dataset to preserve the data_cleaned
data_prepared = data_cleaned_copy.copy()

# Initialize a new column for health status
data_prepared['HealthStatus'] = 'Healthy'  # Assume healthy by default

# Define a function to check health status based on nutritional content
def check_health_status(row, healthy_thresholds, moderate_thresholds):
    for column in healthy_thresholds:
        if column in row:
            # Check if the recipe exceeds the "Unhealthy" threshold
            if row[column] > moderate_thresholds[column]:
                return f"Unhealthy because of : {column}"
            # Check if the recipe is within the moderate range
            elif row[column] > healthy_thresholds[column]:
                return f"Moderate because of :{column}"

                
    return row['HealthStatus']  # Return 'Healthy' if within limits

# Apply the function to each row of the dataset
data_prepared['HealthStatus'] = data_prepared.apply(
    lambda row: check_health_status(row, healthy_thresholds_per_meal, moderate_thresholds_per_meal), axis=1
)

# Display the filtered data with health status
print("\nData Filtered Based on Nutritional Information (per meal):")
print(data_prepared.describe())  # Descriptive statistics for the filtered data
print(data_prepared.shape)  # The shape of the filtered data



Data Filtered Based on Nutritional Information (per meal):
       RecipeId  RecipeServings  AggregatedRating  Calories  FatContent  \
count 339121.00       339121.00         339121.00 339121.00   339121.00   
mean  279556.45            8.61              4.63    369.50       18.64   
std   155264.75          114.40              0.46    876.35       31.85   
min       38.00            1.00              1.00      0.00        0.00   
25%   147522.00            4.00              4.63    165.60        5.30   
50%   282033.00            6.00              4.63    289.60       12.60   
75%   414219.00            8.00              5.00    459.40       23.70   
max   541381.00        32767.00              5.00 434360.20     4701.10   

       SaturatedFatContent  CholesterolContent  SodiumContent  \
count            339121.00           339121.00      339121.00   
mean                  7.13               72.04         577.10   
std                  12.30              251.48        2670.85   
min 

In [136]:
# Example of viewing the health status column
print("\nHealth Status for Each Recipe (per meal):")
data_prepared.head()


Health Status for Each Recipe (per meal):


Unnamed: 0,RecipeId,Name,CookTime,RecipeServings,RecipeCategory,RecipeIngredientQuantities,RecipeIngredientParts,AggregatedRating,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeInstructions,HealthStatus
0,38,Low-Fat Berry Blue Frozen Dessert,1 day,4.0,Frozen Desserts,"[4, 1/4, 1, 1]","blueberries, granulated sugar, vanilla yogurt,...",4.5,170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2,"[Toss 2 cups berries with sugar., Let stand fo...",Unhealthy because of : SugarContent
1,39,Biryani,25 minutes,6.0,Chicken Breast,[],"saffron, milk, hot green chili peppers, onions...",3.0,1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,[Soak saffron in warm milk for 5 minutes and p...,Unhealthy because of : FatContent
2,40,Best Lemonade,5 minutes,4.0,Beverages,[],"sugar, lemons, rind of, lemon, zest of, fresh ...",4.5,311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3,"[Into a 1 quart Jar with tight fitting lid, pu...",Unhealthy because of : SugarContent
3,41,Carina's Tofu-Vegetable Kebabs,20 minutes,2.0,Soy/Tofu,"[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1/2, 1/...","extra firm tofu, eggplant, zucchini, mushrooms...",4.5,536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,"[Drain the tofu, carefully squeezing out exces...",Moderate because of :FatContent
4,42,Cabbage Soup,30 minutes,4.0,Vegetable,"[46, 4, 1, 2, 1]","plain tomato juice, cabbage, onion, carrots, c...",4.5,103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3,"[Mix everything together and bring to a boil.,...",Moderate because of :SodiumContent


In [137]:
data_prepared[data_prepared['HealthStatus'].str.startswith('U')].shape

(90467, 19)

Deployment

In [138]:
# Save data for deployment
data_prepared.to_csv('cleaned_recipes_.csv', index=False)
print("Data Saved for Deployment.")

Data Saved for Deployment.
