<a href="https://colab.research.google.com/github/zoraizmohammad/FlavorFlow/blob/main/data/dataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Processing**

**Data Insertion**

View Tasks here: https://github.com/zoraizmohammad/FlavorFlow/blob/main/weeklyTasks/week2.md#to-complete-step-1-data-collection-and-preparation

In [None]:
# Mount Google Drive (Colab-only helper) so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#RUN - Has all imports
import ast
import re

import nltk
import pandas as pd
from fuzzywuzzy import fuzz, process
from nltk.corpus import words, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from quantulum3 import parser

In [None]:
def view(file_path, num_columns):
  """Print the leading columns of a CSV file.

  Args:
    file_path: Location of the CSV file, handed straight to ``pd.read_csv``.
    num_columns: How many columns, counted from the left, to show.
  """
  frame = pd.read_csv(file_path)
  leading_columns = frame.iloc[:, :num_columns]
  print(leading_columns)

In [None]:
# Load the full raw recipe dataset from Google Drive.
raw_annotation = pd.read_csv('/content/drive/MyDrive/FlavorSync/Data/full_dataset.csv')
# The path depends on who is running the notebook; copy and paste yours into the call above. Reference:
# RP MZ: /content/drive/MyDrive/FlavorSync/Data/full_dataset.csv
# RP GM:
# RP RM:

In [None]:
# Preview the raw frame (the rendered table is truncated to its first and last rows).
raw_annotation

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."
...,...,...,...,...,...,...,...
2231137,2231137,Sunny's Fake Crepes,"[""1/2 cup chocolate hazelnut spread (recommend...","[""Spread hazelnut spread on 1 side of each tor...",www.foodnetwork.com/recipes/sunny-anderson/sun...,Recipes1M,"[""chocolate hazelnut spread"", ""tortillas"", ""bu..."
2231138,2231138,Devil Eggs,"[""1 dozen eggs"", ""1 paprika"", ""1 salt and pepp...","[""Boil eggs on medium for 30mins."", ""Then cool...",cookpad.com/us/recipes/355411-devil-eggs,Recipes1M,"[""eggs"", ""paprika"", ""salt"", ""choice"", ""miracle..."
2231139,2231139,Extremely Easy and Quick - Namul Daikon Salad,"[""150 grams Daikon radish"", ""1 tbsp Sesame oil...","[""Julienne the daikon and squeeze out the exce...",cookpad.com/us/recipes/153324-extremely-easy-a...,Recipes1M,"[""radish"", ""Sesame oil"", ""White sesame seeds"",..."
2231140,2231140,Pan-Roasted Pork Chops With Apple Fritters,"[""1 cup apple cider"", ""6 tablespoons sugar"", ""...","[""In a large bowl, mix the apple cider with 4 ...",cooking.nytimes.com/recipes/1015164,Recipes1M,"[""apple cider"", ""sugar"", ""kosher salt"", ""bay l..."


## **Data Cleaning & Export**

In [None]:
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/FlavorSync/Data/full_dataset.csv')

# Remove Duplicates
# Keep only the "Gathered" source, then drop repeated recipes. Duplicates are
# detected on the recipe content columns: the file also carries per-row counter
# columns ("Unnamed: 0", ...), so comparing whole rows would never find a match.
data = data[data['source'] == "Gathered"].drop_duplicates(
    subset=['title', 'ingredients', 'directions']
)

# Normalize Text
# Download necessary NLTK data. 'punkt' backs word_tokenize, which is called
# further down in this cell; without it the cell stops with a LookupError.
nltk.download('punkt')
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Create a lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Base ingredient names
base_ingredients = [
    "bell pepper", "chili", "cilantro", "eggplant", "zucchini", "potato", "tomato",
    "onion", "garlic", "carrot", "mushroom", "cabbage", "lettuce", "spinach"
]

# Ingredient mapping dictionary (starts as an identity mapping of the base names;
# create_ingredient_mapping adds fuzzy-matched variants to it)
ingredient_mapping = {ingredient: ingredient for ingredient in base_ingredients}

# Function to automate ingredient mapping using fuzzy matching
def create_ingredient_mapping(ingredient_list, base_ingredients, threshold=80):
    """Add fuzzy-matched entries to the module-level ``ingredient_mapping``.

    For every name in ``ingredient_list`` the closest entry of
    ``base_ingredients`` is looked up with ``process.extractOne``; when its
    score reaches ``threshold`` the name is mapped to that entry.

    Args:
        ingredient_list: Iterable of ingredient strings to map.
        base_ingredients: Canonical ingredient names to match against.
        threshold: Minimum similarity score (inclusive) for a match to be stored.

    Returns:
        None. ``ingredient_mapping`` is updated in place.
    """
    for ingredient in ingredient_list:
        # Find the best match from the base ingredients using fuzzy matching
        best = process.extractOne(ingredient, base_ingredients)
        if best is None:
            # extractOne yields None when there is nothing to choose from
            # (e.g. an empty base list); unpacking it would raise TypeError.
            continue
        # Index instead of unpacking so results with extra fields still work.
        match, score = best[0], best[1]
        if score >= threshold:  # If the similarity score is above the threshold
            ingredient_mapping[ingredient] = match

# Collect every distinct lowercase token that occurs in the 'ingredients' column
unique_ingredients = set()
for ingredients_text in data['ingredients']:
    unique_ingredients.update(word_tokenize(ingredients_text.lower()))

# Extend ingredient_mapping with fuzzy matches for those tokens
create_ingredient_mapping(unique_ingredients, base_ingredients)

# Function to normalize ingredients
def normalize_ingredients(text):
    """Lowercase, tokenize and lemmatize ``text``, then canonicalize known ingredients.

    Each lemma that is a key of ``ingredient_mapping`` is swapped for its mapped
    value; every other lemma is kept as is. The results are joined with single
    spaces.

    Args:
        text: Ingredient text to normalize.

    Returns:
        str: The space-joined normalized tokens.
    """
    # NOTE(review): the lookup is per token, so a multi-word key such as
    # "bell pepper" is only hit if the tokenizer emits it as one token - confirm.
    lemmas = (lemmatizer.lemmatize(word) for word in word_tokenize(text.lower()))
    return ' '.join(ingredient_mapping.get(lemma, lemma) for lemma in lemmas)

# Apply normalization to the 'ingredients' column (each cell is replaced by its normalized string)
data['ingredients'] = data['ingredients'].apply(normalize_ingredients)

# Spell-Check and Correct Typos
# Vocabulary from the NLTK 'words' corpus; held in a set for the membership test in correct_spelling
english_words = set(words.words())

def correct_spelling(text):
    """Tokenize ``text`` and re-join the tokens with single spaces.

    Args:
        text: Text to process.

    Returns:
        str: The tokens of ``text`` separated by single spaces.
    """
    checked_tokens = []
    for word in word_tokenize(text):
        if word in english_words:
            checked_tokens.append(word)
        else:
            # NOTE(review): unknown words are appended unchanged, so no spelling
            # correction actually takes place yet; a replacement lookup would go here.
            checked_tokens.append(word)
    return ' '.join(checked_tokens)

# Apply spell-check to the recipe instructions. The dataset stores them in the
# 'directions' column (it has no 'steps' column, so reading data['steps'] raised
# KeyError); the result is written to a new 'steps' column, which is the name the
# later cells read.
data['steps'] = data['directions'].apply(correct_spelling)

# Standardize Units
def standardize_units(text):
    """Rewrite the quantities inside ``text`` in a uniform ``<value> <unit>`` form.

    Every quantity reported by ``parser.parse`` is replaced at its own position;
    all other text (ingredient names, separators) is copied through unchanged.
    Previously only the parsed quantities were returned, which dropped the
    ingredient names that the later tokenization and tagging cells read.

    Args:
        text: Ingredient text to standardize.

    Returns:
        str: ``text`` with each recognized quantity span rewritten.
    """
    pieces = []
    cursor = 0
    # Walk the spans left to right so the text between them can be copied as is.
    for quantity in sorted(parser.parse(text), key=lambda q: q.span[0]):
        start, end = quantity.span
        if start < cursor:
            # Overlaps a span that was already rewritten; keep the first one.
            continue
        value = format(quantity.value, 'g')
        unit_name = quantity.unit.name
        pieces.append(text[cursor:start])
        # NOTE(review): bare numbers are expected to carry the unit name
        # "dimensionless" - confirm against the installed quantulum3 version.
        pieces.append(value if unit_name == 'dimensionless' else f"{value} {unit_name}")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)

# Apply unit standardization (overwrites the 'ingredients' column with the function's output)
data['ingredients'] = data['ingredients'].apply(standardize_units)

# Save the cleaned data (index=False leaves the DataFrame index out of the file);
# the tokenization section reloads this CSV
data.to_csv("/content/drive/MyDrive/FlavorSync/Data/cleaned_data.csv", index=False)

# Show the first rows as a quick check of the result
data.head()


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


# **Tokenize Ingredients & Steps**

In [None]:
# Load the cleaned data written by the cleaning section
data = pd.read_csv("/content/drive/MyDrive/FlavorSync/Data/cleaned_data.csv")

# Ensure necessary NLTK data is downloaded ('punkt' is the tokenizer resource word_tokenize loads)
nltk.download('punkt')

# Step 1: Ingredients Tokenization
# Measurement words accepted as a unit (compared lowercase, without a trailing period).
_KNOWN_UNITS = frozenset({
    "c", "cup", "cups",
    "tsp", "tsps", "teaspoon", "teaspoons",
    "tbsp", "tbsps", "tbs", "tablespoon", "tablespoons",
    "oz", "ounce", "ounces",
    "lb", "lbs", "pound", "pounds",
    "g", "gram", "grams", "kg", "kilogram", "kilograms",
    "ml", "milliliter", "milliliters", "l", "liter", "liters", "litre", "litres",
    "pt", "pint", "pints", "qt", "quart", "quarts", "gal", "gallon", "gallons",
    "pkg", "package", "packages", "can", "cans", "jar", "jars",
    "stick", "sticks", "pinch", "dash", "clove", "cloves", "slice", "slices",
})

# Optional leading amount (mixed number, fraction, or integer/decimal), then the rest.
_LEADING_QUANTITY = re.compile(
    r'(?P<quantity>\d+\s+\d+\s*/\s*\d+|\d+\s*/\s*\d+|\d*\.?\d+)?\s*(?P<rest>.*)',
    re.DOTALL,
)


def _quantity_as_decimal_text(raw):
    """Turn "1/2" or "1 1/2" into decimal text; plain numbers are returned untouched."""
    if '/' not in raw:
        return raw
    head, denominator = raw.rsplit('/', 1)
    parts = head.split()
    whole = int(parts[0]) if len(parts) == 2 else 0
    if int(denominator) == 0:
        return raw  # not a usable fraction; keep the text as written
    return format(whole + int(parts[-1]) / int(denominator), 'g')


def tokenize_ingredient(ingredient):
    """Split one ingredient phrase into quantity, unit and ingredient name.

    Differences from the earlier single-regex version:
      * surrounding whitespace is ignored, so pieces produced by splitting on
        ',' (which start with a space) still yield their quantity;
      * fractions and mixed numbers ("1/2", "1 1/2") are read as one quantity
        and returned as decimal text so they can be converted to numbers;
      * a word is reported as the unit only if it is a known measurement word,
        so descriptors such as "large" stay part of the ingredient name.

    Args:
        ingredient: Text describing a single ingredient, e.g. "2 cups flour".

    Returns:
        dict: ``{'quantity': str or None, 'unit': str or None, 'ingredient': str}``.
        The unit is lowercased with any trailing period removed.
    """
    text = ingredient.strip()
    if not text:
        return {'quantity': None, 'unit': None, 'ingredient': ingredient}

    match = _LEADING_QUANTITY.match(text)
    quantity = match.group('quantity')
    rest = match.group('rest').strip()
    if not rest:
        # Nothing follows the number: treat the whole text as the name so that
        # 'ingredient' is never empty.
        return {'quantity': None, 'unit': None, 'ingredient': text}
    if quantity is not None:
        quantity = _quantity_as_decimal_text(quantity)

    unit = None
    head_and_tail = rest.split(None, 1)
    if len(head_and_tail) == 2:
        candidate = head_and_tail[0].rstrip('.').lower()
        if candidate in _KNOWN_UNITS:
            unit, rest = candidate, head_and_tail[1]
    return {'quantity': quantity, 'unit': unit, 'ingredient': rest}

# Apply the tokenization to the 'ingredients' column.
# Cells that are empty in cleaned_data.csv are read back as NaN (a float), which has
# no .split(); such rows get an empty ingredient list instead of raising.
data['ingredients_tokenized'] = data['ingredients'].apply(
    lambda x: [tokenize_ingredient(ing) for ing in x.split(',')] if isinstance(x, str) else []
)

# Step 2: Steps Tokenization
# Define common cooking actions
cooking_actions = [
    "chop", "saute", "bake", "fry", "boil", "mix", "stir", "grill", "roast",
    "slice", "dice", "mince", "blend", "whisk", "season", "cook", "serve"
]

def tokenize_step(step):
    """Split a step description into known cooking actions and the remaining tokens.

    Args:
        step: Text of the recipe step(s).

    Returns:
        dict: ``'actions'`` lists each entry of ``cooking_actions`` that occurs as a
        token, in ``cooking_actions`` order; ``'related_ingredients'`` lists every
        token that is not a cooking action, in text order.
    """
    words_in_step = word_tokenize(step.lower())

    # NOTE(review): only exact tokens count, so inflected forms ("chopped") are not
    # picked up, and the order of appearance in the text is not kept - confirm intent.
    found_actions = []
    for verb in cooking_actions:
        if verb in words_in_step:
            found_actions.append(verb)

    remaining_tokens = []
    for word in words_in_step:
        if word not in cooking_actions:
            remaining_tokens.append(word)

    return {'actions': found_actions, 'related_ingredients': remaining_tokens}

# Apply the tokenization to the 'steps' column (one dict of actions / remaining tokens per row)
data['steps_tokenized'] = data['steps'].apply(tokenize_step)

# Save the tokenized data; the list/dict columns are written in their string form,
# which is why the next section converts them back after reloading
data.to_csv("/content/drive/MyDrive/FlavorSync/Data/tokenized_data.csv", index=False)

# Show the first rows as a quick check of the result
data.head()

# **Markov Chain State Modeling**

In [None]:
# Load the tokenized data
data = pd.read_csv("/content/drive/MyDrive/FlavorSync/Data/tokenized_data.csv")

# Step 1: Assign State IDs to Ingredients
# Collect every distinct ingredient name across all recipes
all_ingredients = set()
for ingredient_list in data['ingredients_tokenized']:
    # The CSV stores each list in its string form. ast.literal_eval only parses
    # literals, so file contents are never executed as code (which eval would do).
    ingredient_list = ast.literal_eval(ingredient_list)
    for item in ingredient_list:
        if item['ingredient']:
            all_ingredients.add(item['ingredient'])

# Create a mapping of ingredients to state IDs (IDs start at 1, assigned in sorted order)
ingredient_states = {ingredient: idx for idx, ingredient in enumerate(sorted(all_ingredients), start=1)}

# Step 2: Assign State IDs to Cooking Actions
# Extract all unique cooking actions from the steps
all_actions = set()
for step in data['steps_tokenized']:
    # Parse the stored dict with ast.literal_eval (literals only) instead of eval,
    # so file contents are never executed as code.
    step_data = ast.literal_eval(step)
    all_actions.update(step_data['actions'])

# Create a mapping of actions to state IDs (IDs start at 1, assigned in sorted order)
action_states = {action: idx for idx, action in enumerate(sorted(all_actions), start=1)}

# Step 3: Create Mappings for Ingredients and Steps
# Assign state IDs to each ingredient in the 'ingredients_tokenized' column
def assign_ingredient_states(ingredient_list):
    """Translate one stored ingredient list into its list of state IDs.

    Args:
        ingredient_list: String form of a list of ingredient dicts, as read back
            from the CSV.

    Returns:
        list: The ``ingredient_states`` ID of every entry whose 'ingredient' value
        is truthy, in list order.

    Raises:
        ValueError or SyntaxError: If the text is not a plain Python literal.
        KeyError: If an ingredient has no entry in ``ingredient_states``.
    """
    # literal_eval accepts literals only, so the cell text cannot run arbitrary code.
    parsed_items = ast.literal_eval(ingredient_list)
    return [ingredient_states[item['ingredient']] for item in parsed_items if item['ingredient']]

# Add a column holding, for each row, the list of ingredient state IDs
data['ingredient_states'] = data['ingredients_tokenized'].apply(assign_ingredient_states)

# Assign state IDs to each action in the 'steps_tokenized' column
def assign_action_states(step_data):
    """Translate one stored step dict into its list of action state IDs.

    Args:
        step_data: String form of a dict with an 'actions' list, as read back
            from the CSV.

    Returns:
        list: The ``action_states`` ID of every action, in list order.

    Raises:
        ValueError or SyntaxError: If the text is not a plain Python literal.
        KeyError: If an action has no entry in ``action_states``.
    """
    # literal_eval accepts literals only, so the cell text cannot run arbitrary code.
    parsed_step = ast.literal_eval(step_data)
    return [action_states[action] for action in parsed_step['actions']]

# Add a column holding, for each row, the list of action state IDs
data['action_states'] = data['steps_tokenized'].apply(assign_action_states)

# Save the data with state IDs
data.to_csv("/content/drive/MyDrive/FlavorSync/Data/states_data.csv", index=False)

# Display the state mappings for reference
# NOTE(review): this renders both complete dictionaries; the output may be very
# large - consider showing only their sizes or a small sample.
ingredient_states, action_states


# **Annotate Data with Cuisines and Dietary Tags**

In [None]:
# Load the data with state IDs
data = pd.read_csv("/content/drive/MyDrive/FlavorSync/Data/states_data.csv")

# Expanded keyword-based rules for cuisine tagging
# Maps a cuisine label to the keywords that trigger it. tag_cuisine returns the first
# cuisine (in the order written here) that has a matching keyword, so a keyword listed
# under several cuisines (e.g. "basil", "hummus") resolves to the earliest one.
cuisine_keywords = {
    "Italian": ["spaghetti", "pasta", "marinara", "mozzarella", "risotto", "parmesan", "basil", "gnocchi", "focaccia"],
    "Mexican": ["taco", "quesadilla", "guacamole", "salsa", "enchilada", "jalapeno", "chipotle", "mole", "tamale"],
    "Indian": ["curry", "masala", "paneer", "tandoori", "naan", "dal", "biryani", "samosa", "chutney", "ghee"],
    "Chinese": ["soy sauce", "noodles", "dumpling", "fried rice", "tofu", "szechuan", "hoisin", "wonton", "dim sum"],
    "French": ["baguette", "croissant", "ratatouille", "brie", "crepes", "coq au vin", "bouillabaisse", "quiche"],
    "Japanese": ["sushi", "sashimi", "miso", "udon", "ramen", "teriyaki", "wasabi", "tempura", "matcha", "yakitori"],
    "Thai": ["coconut milk", "lemongrass", "curry paste", "pad thai", "tom yum", "fish sauce", "basil", "sticky rice"],
    "Greek": ["feta", "tzatziki", "gyro", "souvlaki", "moussaka", "olives", "spanakopita", "dolma", "baklava"],
    "Middle Eastern": ["hummus", "falafel", "shawarma", "tahini", "sumac", "pita", "za'atar", "labneh", "tabbouleh"],
    "Spanish": ["paella", "tapas", "chorizo", "gazpacho", "saffron", "manchego", "tortilla", "jamón", "patatas bravas"],
    "Korean": ["kimchi", "bulgogi", "gochujang", "bibimbap", "kimbap", "doenjang", "samgyeopsal", "japchae", "soju"],
    "Vietnamese": ["pho", "banh mi", "spring roll", "nuoc cham", "fish sauce", "lemongrass", "rice paper", "vermicelli"],
    "Caribbean": ["jerk", "plantain", "callaloo", "ackee", "curry goat", "rum", "coconut", "conch", "pigeon peas"],
    "Ethiopian": ["injera", "doro wat", "berbere", "shiro", "kitfo", "niter kibbeh", "teff", "lentils", "collard greens"],
    "Moroccan": ["couscous", "tagine", "harissa", "ras el hanout", "preserved lemon", "mint tea", "saffron", "dates"],
    "Turkish": ["kebab", "baklava", "lokum", "borek", "simit", "manti", "hummus", "dolma", "raki", "yogurt"],
    "Brazilian": ["feijoada", "pão de queijo", "brigadeiro", "açai", "chimichurri", "tapioca", "farofa", "guarana"],
    "Filipino": ["adobo", "sinigang", "lumpia", "halo-halo", "lechon", "pandesal", "tocino", "kare-kare", "balut"],
    "Russian": ["borscht", "pelmeni", "blini", "kvass", "beetroot", "smetana", "pirozhki", "caviar", "vodka"],
    "German": ["bratwurst", "sauerkraut", "pretzel", "schnitzel", "spätzle", "strudel", "bier", "mustard"],
    "African": ["jollof rice", "fufu", "egusi", "koki", "bunny chow", "sosatie", "couscous", "baobab", "grilled fish"],
    "American": ["burger", "hot dog", "BBQ", "mac and cheese", "fried chicken", "apple pie", "pancakes", "cornbread"],
    "British": ["fish and chips", "shepherd's pie", "yorkshire pudding", "scones", "bangers and mash", "custard"],
    "Australian": ["vegemite", "lamington", "barbie", "pavlova", "meat pie", "tim tam", "kangaroo", "anzac biscuit"]
}

# Expanded keyword-based rules for dietary tagging
# Maps a dietary label to the keywords that trigger it. tag_dietary collects every
# label that has a matching keyword, so one recipe can receive several tags.
# NOTE(review): a keyword being present does not by itself show that a recipe meets
# the diet (e.g. "cheese" triggers "Vegetarian" whatever else is listed) - confirm
# that presence-based tagging is what is wanted.
dietary_keywords = {
    "Vegan": ["tofu", "tempeh", "seitan", "plant-based", "vegan", "nutritional yeast", "jackfruit", "lentils"],
    "Vegetarian": ["cheese", "egg", "paneer", "vegetarian", "yogurt", "butter", "honey", "milk"],
    "Gluten-Free": ["gluten-free", "quinoa", "rice", "cornmeal", "almond flour", "buckwheat", "sorghum", "tapioca"],
    "Keto": ["avocado", "bacon", "cheese", "almond flour", "low-carb", "butter", "cream", "olive oil", "zucchini"],
    "Paleo": ["grass-fed", "wild-caught", "almond flour", "coconut", "honey", "sweet potato", "ghee", "bone broth"],
    "Low-FODMAP": ["zucchini", "carrot", "banana", "potato", "quinoa", "cucumber", "spinach", "blueberries"],
    "Pescatarian": ["salmon", "tuna", "shrimp", "mackerel", "fish", "seafood", "sardines", "trout"],
    "Nut-Free": ["nut-free", "seed", "sunflower butter", "sesame", "pumpkin seeds", "nut allergy"],
    "Dairy-Free": ["dairy-free", "almond milk", "coconut milk", "oat milk", "lactose-free", "soy milk"],
    "Soy-Free": ["soy-free", "coconut aminos", "sunflower oil", "ghee", "olive oil", "fish oil"],
    "Whole30": ["whole30", "cauliflower rice", "coconut oil", "avocado oil", "zoodles", "compliant", "dates"],
    "Halal": ["halal", "zabiha", "permissible", "no pork", "no alcohol"],
    "Kosher": ["kosher", "pareve", "no shellfish", "no pork", "kosher salt", "matzo"]
}

# Function to tag cuisine based on keywords
def tag_cuisine(ingredients_text):
    """Return the first cuisine whose keywords occur in the ingredient text.

    Keywords are matched case-insensitively as whole words or phrases (an optional
    plural "s"/"es" is allowed), so "rum" no longer matches inside "crumbs".
    Cuisines are tried in ``cuisine_keywords`` order and the first hit wins.

    Args:
        ingredients_text: Ingredient text of one recipe. Non-string values
            (e.g. NaN from an empty CSV cell) are treated as having no match.

    Returns:
        str: The matching cuisine label, or "Other" when nothing matches.
    """
    if not isinstance(ingredients_text, str):
        return "Other"
    text = ingredients_text.lower()  # lowercase once instead of once per keyword
    for cuisine, keywords in cuisine_keywords.items():
        for keyword in keywords:
            needle = keyword.lower()
            # Cheap substring test first; the regex only confirms word boundaries.
            if needle in text and re.search(
                r'(?<!\w)' + re.escape(needle) + r'(?:e?s)?(?!\w)', text
            ):
                return cuisine
    return "Other"

# Function to tag dietary information based on keywords
def tag_dietary(ingredients_text):
    """Return every dietary tag whose keywords occur in the ingredient text.

    Keywords are matched case-insensitively as whole words or phrases (an optional
    plural "s"/"es" is allowed), so "egg" no longer matches inside "eggplant".

    Args:
        ingredients_text: Ingredient text of one recipe. Non-string values
            (e.g. NaN from an empty CSV cell) are treated as having no match.

    Returns:
        list: Matching diet labels in ``dietary_keywords`` order, or ``["None"]``
        when nothing matches.
    """
    if not isinstance(ingredients_text, str):
        return ["None"]
    text = ingredients_text.lower()  # lowercase once instead of once per keyword
    tags = []
    for diet, keywords in dietary_keywords.items():
        for keyword in keywords:
            needle = keyword.lower()
            # Cheap substring test first; the regex only confirms word boundaries.
            if needle in text and re.search(
                r'(?<!\w)' + re.escape(needle) + r'(?:e?s)?(?!\w)', text
            ):
                tags.append(diet)
                break  # one hit is enough for this diet
    return tags if tags else ["None"]

# Apply the tagging functions to the data
data['cuisine'] = data['ingredients'].apply(tag_cuisine)
data['dietary_tags'] = data['ingredients'].apply(tag_dietary)

# Manual Verification: Extract a sample for verification.
# A fixed random_state returns the same rows on every run, and capping n at
# len(data) keeps sample() from raising when fewer than 10 rows are available.
sample_data = data.sample(n=min(10, len(data)), random_state=42)  # Adjust n as needed for verification

# Save the annotated data
data.to_csv("/content/drive/MyDrive/FlavorSync/Data/annotated_data.csv", index=False)

# Display the sample data for manual verification
sample_data


# **DataFrame Creation**

Data Structure:
- recipe_id: Unique identifier for each recipe.
- ingredient: The name of the ingredient used in the recipe.
- quantity: The amount of the ingredient, converted to a standard format.
- unit: The unit of measurement for the ingredient.
- step: List of actions associated with the recipe (e.g., chop, mix, bake).
- cuisine: The cuisine classification of the recipe (e.g., Italian, Chinese).
- nutrition: Placeholder for nutritional information (if available).
- tags: List of dietary tags associated with the recipe (e.g., Vegan, Gluten-Free).

Processing Assumptions:
- Quantities are converted to numeric values and missing quantities are set to 0.
- Units are standardized to a consistent format for modeling.
- Nutrition information is currently unavailable and marked as 'unknown'.
- Missing values in ingredients, units, or nutrition fields are handled appropriately.

In [None]:

# Load the annotated data
data = pd.read_csv("/content/drive/MyDrive/FlavorSync/Data/annotated_data.csv")

# Step 1: Structure Data in a DataFrame
# Extract unique recipe IDs
data['recipe_id'] = data.index + 1  # Assign unique IDs based on the index

# Split ingredients into separate components: one output record per (recipe, ingredient)
structured_data = []

for _, row in data.iterrows():
    # The list/dict columns come back from the CSV as text. ast.literal_eval parses
    # literals only, so file contents are never executed as code (which eval would do).
    ingredients_list = ast.literal_eval(row['ingredients_tokenized'])  # list of dicts
    steps_list = ast.literal_eval(row['steps_tokenized'])  # dict
    # 'dietary_tags' is stored as text too; parse it so 'tags' holds a real list
    # rather than the string form of one.
    dietary_tags = ast.literal_eval(row['dietary_tags'])
    for ingredient in ingredients_list:
        structured_data.append({
            'recipe_id': row['recipe_id'],
            'ingredient': ingredient['ingredient'],
            'quantity': ingredient['quantity'],
            'unit': ingredient['unit'],
            'step': steps_list['actions'],
            'cuisine': row['cuisine'],
            'nutrition': None,  # Placeholder for nutrition info if available
            'tags': dietary_tags
        })

# Create a structured DataFrame
df = pd.DataFrame(structured_data)

# Step 2: Data Quality Checks
# Check for missing values
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

# Fill missing values or handle them appropriately.
# Quantities are converted to numbers first and filled afterwards: values that
# cannot be parsed become NaN during the conversion, so filling before it would
# leave those NaNs in place. Results are assigned back instead of using
# fillna(inplace=True) on a selected column, which pandas deprecates.
df['quantity'] = pd.to_numeric(df['quantity'], errors='coerce').fillna(0)  # 0 for missing quantities
df['unit'] = df['unit'].fillna("unknown")  # Placeholder for unknown units
df['nutrition'] = df['nutrition'].fillna("unknown")  # Placeholder for missing nutrition info

# Check for outliers in 'quantity'
outliers = df[(df['quantity'] < 0) | (df['quantity'] > 1000)]  # Example check for outliers
print("Outliers:\n", outliers)

# Standardize units (if not done earlier)
# Example: Converting 'tablespoon' to 'tbsp' or 'grams' to 'g'
df['unit'] = df['unit'].replace({
    'tablespoon': 'tbsp',
    'teaspoon': 'tsp',
    'kilogram': 'kg',
    'liter': 'l',
    'ounce': 'oz',
    # Add more conversions as needed
})

# Step 3: Save Data
# Export cleaned and structured data to a .csv file, next to the other pipeline
# outputs on Drive (the earlier "/mnt/data/..." target is not a directory this
# notebook mounts or creates, so the export could not be relied on).
df.to_csv("/content/drive/MyDrive/FlavorSync/Data/structured_data.csv", index=False)

# Display the first few rows of the structured DataFrame
df.head()
