In [None]:
from app.dotenv import base_dir, data_dir
import os, numpy as np
import pandas as pd; 

# Remove pandas' caps on displayed rows/columns and leave the width unset (None).
for display_option in ('display.width', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Echo the project directories imported from app.dotenv.
print('base_dir:', base_dir)
print('data_dir:', data_dir)
# print('dotenv params:', {os.getenv('PARAM1')})


Download and parse the dataset

In [None]:
from app.load_dataset import download_zip,parse_json,print_recipe


# Fetch the raw recipe archive only when no local copy is present.
zip_path = f'{data_dir}/recipes_raw.zip'
if not os.path.exists(zip_path):
    download_zip(data_dir, url='https://eightportions.com/recipes_raw.zip')
data = parse_json(zip_file_path=zip_path, load_all=False)

# Preview only the first recipe (nothing is printed when `data` is empty).
first_entry = next(iter(data.items()), None)
if first_entry is not None:
    recipe_id, recipe = first_entry
    recipe_series = pd.Series(recipe)
    # recipes_df = pd.DataFrame.from_dict(recipe, orient='index')
    print_recipe(recipe_series)



Standardize data: implement a better-structured data model that can accommodate the various attributes of recipes in the dataset above
1. Ensure that each recipe follows the same structure (represented with consistent fields)
2. Drop recipes without instructions
3. Flatten the ingredients list into separate keys (the number of columns is set by the maximum number of ingredients found in the data)
4. Convert into a dataframe (each key becomes a column, each recipe becomes a new row)

In [None]:
from app.preprocess import standardize

# Field names are taken from the recipe stored under key 0.
# NOTE(review): this assumes `data` has an entry keyed by 0 — confirm against parse_json.
recipe_fields = list(data[0].keys())
df, ingredients_columns = standardize(data.copy(), recipe_fields)

# Print the row labelled 0, then show the resulting columns as the cell output.
print_recipe(df.loc[0])
df.columns



Normalize text:
   1. clean_text:
       1. remove encoding characters like \xad, \u00ad, and separate punctuation
       2. remove words like 'ADVERTISEMENT' and other non-recipe-related terms
       3. convert to lower case
       4. remove leading/trailing whitespace
   2. split_number_and_word: split a token that fuses a word and a number (take1.2 -> take 1.2)
   3. split_un_words: split words starting with 'un' into 'un' and the rest of the word ('unsalted' becomes ['un', 'salted'])

In [None]:

from app.preprocess import normalize_text
# Normalize a copy of the frame, handing normalize_text all of its column labels.
columns_to_normalize = df.columns
df = normalize_text(df.copy(), columns_to_normalize)

# df = df[['0']]
print(df.columns)
# df.columns




Extract entities from the ingredient sentences:
1. Extract ingredients (2 teaspoons of salt -> salt is the ingredient)
2. Extract the units in the data and their transformation
   1. Example: teaspoon is a volume unit; all volume units are transformed into cups
3. Convert values written as text into numeric values (¼ -> 0.25)
4. Extract numeric values from the text (in "2 teaspoons" the value is 2)
5. Transform values (2 teaspoons are transformed into cups)
6. Rows with no units and values are seasoning and are ignored (such as: add salt and pepper to taste)

Output: for each ingredient, a column with amount and unit, before and after the conversion

Left to do:
1. When there's 1¼, the output is [1, 0.25] instead of 1.25
   1. Fix: when converting into numeric values, search for an adjacent value and add it (1 + 0.25)
2. When there is more than one value/unit in the output (example: 2 cans where each is 10 oz)
   1. Use proximity to find the relation to the ingredient (noun) and take the correct value

In [None]:
from app.extract_entities.amounts import extract_entities,measurement_units,nlp
# Pass a copy of the frame to extract_entities; its result replaces df.
df = extract_entities(df.copy(), ingredients_columns)



In [None]:
# Print one ingredient sentence together with its extracted ingredients and
# its converted amounts/units.
# NOTE(review): this reads column '1' at row label 1, while an earlier cell reads
# the first recipe via df.loc[0] — confirm these really are the "first"
# ingredient and recipe.
for label, column in (
    ('first ingridient of the first recepy:', '1'),
    ('ingredients:', '1_ingredients'),
    ('amounts and units after conversion', '1_converted'),
):
    print(label, df[column].loc[1])


Recreate the ingredients from the entities
1. Combine the entities into a single column holding the full recipe ingredients, formatted as sentences
2. The entities used are those after unit and amount conversion

In [None]:
from app.extract_entities.recreate import recreate_ingrediants_from_entities

# Rebuild the ingredient sentences from the extracted entities.
df = recreate_ingrediants_from_entities(df, ingredients_columns)

# Drop columns (not rows) that carry no useful data. Reassigning instead of
# mutating in place, together with errors='ignore', keeps this cell re-runnable:
# on a second run the columns are already gone and the drop is a no-op rather
# than a KeyError.
df = df.drop(columns=['picture_link', 'website'], errors='ignore')

print(df.columns)
df.head()



Similarity Calculation and Visualization


1. Prepare the data - stack the recipe ingredients, title, and instructions for each recipe (list of lists)
2. Similarity metrics - composed of 3 parts:
    1. Cosine similarity: measures the cosine of the angle between two vectors.
    2. Embeddings: vectors are representations of each recipe part (ingredients, instructions, titles), using sentence embeddings (via an SBERT model).
    3. Weighted similarity: combines the similarities of the different recipe parts with weights to give more importance to certain parts of the recipe.
3. Visualization: a heatmap is generated to visually compare similarities between multiple recipes based on the metrics above.

In [None]:
from app.similarity.compute import compute_similarity_matrix,prepare_data
from app.similarity.plot import plot_heatmap
# Prepare the input for the similarity computation by combining the ingredients,
# title and instructions of each recipe.
# Stored under its own name so that `data` (the parsed recipes that the
# standardize cell still reads) is not overwritten, which would break re-running
# that earlier cell.
recipe_parts = prepare_data(df)

# Compute the weighted similarity matrix across the prepared recipes.
weights = [0.5, 0.3, 0.2]  # For ingredients, title, and instructions
similarity_matrix = compute_similarity_matrix(recipe_parts, weights)

# Generate a 1000x1000 similarity matrix - for testing purposes
# similarity_matrix = generate_dummy_matrix(size=1000)
plot_heatmap(similarity_matrix, labels=None)

print("Similarity Matrix:")
print(similarity_matrix)

In [None]:
# import pandas as pd
# from collections import Counter

# # Function to get unique words sorted by frequency
# def get_unique_words(df, columns):
#     # Combine all non-null sentences from the given columns into a single string
#     text = ''
#     for column in columns:
#         text += ' '.join(df[column].loc[df[column].notnull()])
    
#     # Split the text into words
#     words = text.split()

#     # Remove digits from the list of words
#     def isDigit(x):
#         try:
#             float(x)
#             return True
#         except ValueError:
#             return False
#     words = [word for word in words if not isDigit(word) ]#word.isdigit()
    
#     # Count the frequency of each word using Counter
#     word_counts = Counter(words)
    
#     # Sort words by frequency in descending order
#     sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    
#     # Return a list of words sorted by frequency
#     return sorted_words
# unique_words = get_unique_words(df, columns = [i for i in df.columns if 'ingredients' in i])
# # unique_words = get_unique_words(df,[col for col in df.columns if 'ingredients' in col] )#['instructions','ingredients_1']


