## data.py

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd



def import_data():
    train = pd.read_json("/content/drive/MyDrive/Colab Notebooks/data/cuisine_data/train.json")
    test = pd.read_json("/content/drive/MyDrive/Colab Notebooks/data/cuisine_data/test.json")
    return pd.concat([train,test],axis=0)

def import_recipes_main():
    data_path_ar = "/content/drive/MyDrive/Colab Notebooks/data/recipes_data/recipes_raw_nosource_ar.json"
    data_path_epi = "/content/drive/MyDrive/Colab Notebooks/data/recipes_data/recipes_raw_nosource_epi.json"
    data_path_fn = "/content/drive/MyDrive/Colab Notebooks/data/recipes_data/recipes_raw_nosource_fn.json"

    data =  pd.concat([pd.read_json(data_path_ar, orient='index'), pd.read_json(data_path_epi, orient='index'), pd.read_json(data_path_fn, orient='index')])
    data = data.reset_index()
    data = data.drop(columns=['picture_link', 'index'])
    return data

In [3]:
cuisine = import_data()
cuisine.iloc[0:3,:]

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."


In [4]:
recipes = import_recipes_main()
recipes.head()

Unnamed: 0,title,ingredients,instructions
0,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves AD...","Place the chicken, butter, soup, and onion in ..."
1,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ..."
2,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar ADVERTISEMENT, 1/2...",Preheat oven to 350 degrees F (175 degrees C)....
3,Best Chocolate Chip Cookies,"[1 cup butter, softened ADVERTISEMENT, 1 cup w...",Preheat oven to 350 degrees F (175 degrees C)....
4,Homemade Mac and Cheese Casserole,[8 ounces whole wheat rotini pasta ADVERTISEME...,Preheat oven to 350 degrees F. Line a 2-quart ...


In [5]:
recipes.shape

(124647, 3)

## feature_engineering.py

In [6]:
import nltk
import re
import pandas as pd
from sklearn import feature_extraction, model_selection, pipeline, manifold, preprocessing

# from src.recommendation_engine.data import import_data

additional_stop_words = ["advertisement", "advertisements",
                         "cup", "cups",
                         "tablespoon", "tablespoons",
                         "teaspoon", "teaspoons",
                         "ounce", "ounces",
                         "salt",
                         "pepper",
                         "pound", "pounds",
                         ]

nltk.download('wordnet')
nltk.download("stopwords")

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in
                    lst_stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    text = " ".join(lst_text)

    ## Remove digits
    text = ''.join([i for i in text if not i.isdigit()])

    ## remove mutliple space
    text = re.sub(' +', ' ', text)

    return text

def process_data():
    dataset = import_data()

    def processing(row):
        ls = row['ingredients']
        return ' '.join(ls)

    dataset['ingredients'] = dataset.apply(lambda x: processing(x), axis=1)
    dataset.dropna(inplace=True)
    dataset = dataset.drop(columns=['id']).reset_index(drop=True)

    stop_word_list = nltk.corpus.stopwords.words("english")

    # Extend list of stop words
    stop_word_list.extend(additional_stop_words)

    dataset["ingredients_query"] = dataset["ingredients"].apply(lambda x:
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True,
          lst_stopwords=stop_word_list))
    return dataset

def create_embeddings(dataset):
    ## Tf-Idf (advanced variant of BoW)
    vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

    corpus = dataset["ingredients_query"]
    vectorizer.fit(corpus)
    embedded_ingredients = vectorizer.transform(corpus)
    dic_vocabulary = vectorizer.vocabulary_

    ## Chi squarred correlation embeddings reduction
    labels = dataset["cuisine"]
    names = vectorizer.get_feature_names()
    p_value_limit = 0.95
    dtf_features = pd.DataFrame()

    for cat in np.unique(labels):
        chi2, p = feature_selection.chi2(embedded_ingredients, labels==cat)
        dtf_features = dtf_features.append(pd.DataFrame(
                       {"feature":names, "score":1-p, "labels":cat}))
        dtf_features = dtf_features.sort_values(["labels","score"],
                        ascending=[True,False])
        dtf_features = dtf_features[dtf_features["score"]>p_value_limit]
    names = dtf_features["feature"].unique().tolist()

    ## Check the main ingredients
    for cat in np.unique(labels):
        print("# {}:".format(cat))
        print("  . selected features:",len(dtf_features[dtf_features["labels"]==cat]))
        print("  . top features:", ",".join(dtf_features[dtf_features["labels"]==cat]["feature"].values[:10]))
        print(" ")

    ## New embeddings
    vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=names)
    vectorizer.fit(corpus)
    embedded_ingredients = vectorizer.transform(corpus)
    dic_vocabulary = vectorizer.vocabulary_

    return vectorizer

def process_recipes(data):
    # list of stopwords
    stop_word_list = nltk.corpus.stopwords.words("english")

    # Extend list of stop words
    stop_word_list.extend(additional_stop_words)

    data["ingredients_query"] = data["ingredients"].apply(lambda x:
            utils_preprocess_text(x, flg_stemm=False, flg_lemm=True,
            lst_stopwords=stop_word_list))
    return data

def get_tokenize_text(input_text):
    # list of stopwords
    stop_word_list = nltk.corpus.stopwords.words("english")

    # Extend list of stop words
    stop_word_list.extend(additional_stop_words)

    return utils_preprocess_text(input_text, flg_stemm=False, flg_lemm=True, lst_stopwords=stop_word_list)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## create_model.py

In [7]:
import numpy as np

In [18]:
## for processing
import re
import nltk
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
## model & processing libraries
from sklearn import feature_selection
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn import utils
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle
## DB accesses
import sqlite3 as sq


MODEL_PATH = "/content/drive/MyDrive/"
MODEL_EMBEDDINGS_PATH = ('/content/drive/MyDrive/Colab Notebooks/similarity_embeddings/')
CUISINE_CLASSES = ['brazilian','british','cajun_creole','chinese','filipino','french','greek','indian','irish','italian','jamaican','japanese','korean','mexican','moroccan','russian','southern_us','spanish','thai','vietnamese']
os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(MODEL_EMBEDDINGS_PATH, exist_ok=True)

## Save to file in the current working directory
def save_pkl(file, pkl_filename):
    with open(pkl_filename, 'wb') as pkl_file:
        pickle.dump(file, pkl_file)

def compute_performances(predicted, predicted_prob, y_test):

    classes = np.unique(y_test)
    y_test_array = pd.get_dummies(y_test, drop_first=False).values

    ## Accuracy, Precision, Recall
    accuracy = metrics.accuracy_score(y_test, predicted)
    balance_accuracy = metrics.balanced_accuracy_score(y_test, predicted)
    auc = metrics.roc_auc_score(y_test, predicted_prob,
                                multi_class="ovr")
    print("Balanced Accuracy:",  round(balance_accuracy,2))
    print("Accuracy:",  round(accuracy,2))
    print("Auc:", round(auc,2))
    print("Detail:")
    print(metrics.classification_report(y_test, predicted))

    '''## Plot confusion matrix
    cm = metrics.confusion_matrix(y_test, predicted)
    fig, ax = plt.subplots(figsize=(10,10), dpi=100)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap=plt.cm.Blues,
                cbar=False)
    ax.set(xlabel="Pred", ylabel="True", xticklabels=classes,
           yticklabels=classes, title="Confusion matrix")
    plt.yticks(rotation=0)
    fig, ax = plt.subplots(figsize=(10,10), dpi=100, nrows=1, ncols=2)
    ## Plot roc
    for i in range(len(classes)):
        fpr, tpr, thresholds = metrics.roc_curve(y_test_array[:,i],
                               predicted_prob[:,i])
        ax[0].plot(fpr, tpr, lw=3,
                  label='{0} (area={1:0.2f})'.format(classes[i],
                                  metrics.auc(fpr, tpr))
                   )
    ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
    ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
              xlabel='False Positive Rate',
              ylabel="True Positive Rate (Recall)",
              title="Receiver operating characteristic")
    ax[0].legend(loc="lower right")
    ax[0].grid(True)
    ## Plot precision-recall curve
    for i in range(len(classes)):
        precision, recall, thresholds = metrics.precision_recall_curve(
                     y_test_array[:,i], predicted_prob[:,i])
        ax[1].plot(recall, precision, lw=3,
                   label='{0} (area={1:0.2f})'.format(classes[i],
                                      metrics.auc(recall, precision))
                  )
    ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
              ylabel="Precision", title="Precision-Recall curve")
    ax[1].legend(loc="best")
    ax[1].grid(True)
    plt.show()'''


def d2v_embeddings(data):
    data = data['ingredients_query'].tolist()
    tagged_data = [TaggedDocument(words=row.split(), tags=[str(index)]) for index, row in enumerate(data)]

    max_epochs = 20
    vec_size = 100
    alpha = 0.025

    model_embedding = Doc2Vec(vector_size=vec_size,
                        alpha=alpha,
                        min_alpha=0.00025,
                        min_count=1,
                        dm =1)

    model_embedding.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model_embedding.train(tagged_data,
                    total_examples=model_embedding.corpus_count,
                    epochs=10)


    return model_embedding

def train_model_embeddings():
    db = sq.connect('/content/drive/MyDrive/recipes.db')
    cursor = db.cursor()

    for cuisine in CUISINE_CLASSES:
        sql_query = "SELECT title, instructions, ingredients, ingredients_query FROM main_recipes WHERE cuisine = ?"
        data = pd.read_sql(sql_query, db, params=(cuisine,))

        model_embedding = d2v_embeddings(data)
        save_pkl(model_embedding, os.path.join(MODEL_EMBEDDINGS_PATH, f'd2v_{cuisine}.pkl'))

In [9]:
db = sq.connect('/content/drive/MyDrive/recipes.db')
cursor = db.cursor()

for cuisine in CUISINE_CLASSES:
    sql_query = "SELECT title, instructions, ingredients, ingredients_query FROM main_recipes WHERE cuisine = ?"
    data = pd.read_sql(sql_query, db, params=(cuisine,))
data.head()

Unnamed: 0,title,instructions,ingredients,ingredients_query
0,Honey-Garlic Slow Cooker Chicken Thighs,Lay chicken thighs into the bottom of a 4-quar...,"['4 skinless, boneless chicken thighs ADVERTIS...",skinless boneless chicken thigh soy sauce ket...
1,Souvlaki,"In a large glass bowl, mix together lemon juic...","['1 lemon, juiced ADVERTISEMENT', '1/4 cup oli...",lemon juiced olive oil soy sauce dried oregan...
2,Heather's Grilled Salmon,"Whisk together the brown sugar, olive oil, soy...","['1/4 cup brown sugar ADVERTISEMENT', '1/4 cup...",brown sugar olive oil soy sauce lemon dried t...
3,Homemade Refrigerator Pickles,"In a medium saucepan over medium heat, bring v...",['1 cup distilled white vinegar ADVERTISEMENT'...,distilled white vinegar white sugar sliced cu...
4,Pork and Shrimp Pancit,Soak the rice noodles in warm water for 20 min...,['1 (6.75 ounce) package rice noodles ADVERTIS...,package rice noodle vegetable oil divided sma...


## Functions to populate the Recipe Dataset with cuisine label by using the BERT multiclass Classifier.
### Once the Recipe Dataset is appended with the cuisine column, we will create a database called 'recipe.db' using sqlite.


In [10]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [13]:
import sqlite3 as sq
import pandas as pd
import os
from transformers import TFBertModel
import tensorflow as tf
from transformers import BertTokenizer
from tqdm.auto import tqdm
cuisine_model = tf.keras.models.load_model('/content/drive/MyDrive/cuisine_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

MODEL_PATH = 'models/nlp'

def generate_input_data(df, tokenizer):
    ids = np.zeros((len(df), 256))
    masks = np.zeros((len(df), 256))
    for i, text in tqdm(enumerate(df['ingredients_query']), total=len(df)):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids[0]
        masks[i, :] = tokenized_text.attention_mask[0]
    return ids, masks


def make_prediction(model, processed_data, classes=['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino', 'french', 'greek', 'indian','irish', 'italian', 'jamaican', 'japanese', 'korean', 'mexican', 'moroccan','russian', 'southern_us','spanish', 'thai', 'vietnamese'], top_k=1):
    predictions = []
    #Convert processed_data to input format expected by the model
    for item in processed_data:
        input_data = {
            "input_ids": np.array([item["input_ids"]]),
            "attention_mask": np.array([item["attention_mask"]]),
        }
        probs = model.predict(input_data)[0]
        top_indices = np.argmax(probs)
        top_predictions = classes[top_indices]
        predictions.append(top_predictions)
    return predictions

def create_and_populate_db():
    data = import_recipes_main()
    #Since the Recipe dataset is huge, we split it into a batch of 5
    # 0:24929
    # 24929:49858
    # 49858:74787
    # 74787:99716
    # 99716:124647
    data = data.iloc[99716:124647]
    # Process the data
    data = process_recipes(data)
    data["cuisine"] = ""

    X_input_ids, X_attn_masks = generate_input_data(data, tokenizer)

    processed_data = []
    for i in range(len(data)):
        processed_data.append({
            'input_ids': X_input_ids[i],
            'attention_mask': X_attn_masks[i]
        })
    predictions = make_prediction(cuisine_model, processed_data)

    data["cuisine"] = predictions

    db = sq.connect('/content/drive/MyDrive/recipes.db')
    #Verify dtypes
    for col in data.columns:
        data[col] = data[col].astype('str')

    print(' ------------------ Check data before populating the db ------------------')
    print(data.columns)
    print(data.head())
    print(data.shape)
    data.to_sql('main_recipes', db, if_exists='append')

In [None]:
create_and_populate_db()

  0%|          | 0/24931 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 ------------------ Check data before populating the db ------------------
Index(['title', 'ingredients', 'instructions', 'ingredients_query', 'cuisine'], dtype='object')
                                               title  \
99716               Dutch Oven Confederate Pot Roast   
99717  White Pizza with Chicken, Pesto and Pine Nuts   
99718                                      Shortcake   
99719                               Cranberry Relish   
99720                             Baked Potato Skins   

                                             ingredients  \
99716  ['10 pounds pot roast', '3 slices bacon', 'Oli...   
99717  ['1 store bought pizza crust', '1 cup prepared...   
99718  ['2 cups flour', '4 teaspoons baking powder', ...   
99719  ['1 can drained, crushed pineapple', '2 ounces...   
99720  ['2 large Idaho potatoes, baked and cooled', '...   

                                            instructions  \
99716 

## Function to get data based on type cuisine from the 'recipe.db' to train d2v_embbedings for each cuisine type.

In [30]:
import sqlite3 as sq
import pandas as pd

def get_df_from_db(cuisine):
    db = sq.connect('/content/drive/MyDrive/recipes.db')
    sql_query = "SELECT title, instructions, ingredients, ingredients_query FROM main_recipes WHERE cuisine = ?"
    return pd.read_sql(sql_query, db, params=(cuisine,))

### Proceed to train D2V Embbedings for each cuisine type.

In [20]:
train_model_embeddings()

iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19
iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19




iteration 0
iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6




iteration 7




iteration 8




iteration 9




iteration 10




iteration 11




iteration 12




iteration 13




iteration 14




iteration 15




iteration 16




iteration 17




iteration 18




iteration 19


In [28]:
import os
import pickle
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks'
MODEL_EMBEDDINGS_PATH = os.path.join(MODEL_PATH, 'similarity_embeddings')
# CUISINE_CLASSES = ['greek','southern_us','filipino','indian','jamaican','spanish','italian','mexican','chinese','british','thai','vietnamese','cajun_creole','brazilian','french','japanese','irish','korean','moroccan','russian']

## Load from file
def load_pkl(pkl_filename):
    with open(pkl_filename, 'rb') as pkl_file:
        return pickle.load(pkl_file)


def get_similar_recipes(input_text, cuisine, top_k=3):
    # Tokenize text
    tokenize_text = get_tokenize_text(input_text).split()

    # Load model from the selected cuisine
    d2v = load_pkl(os.path.join(MODEL_EMBEDDINGS_PATH, f'd2v_{cuisine}.pkl'))

    # Get embeddings
    embeddings = d2v.infer_vector(tokenize_text)
    best_recipes = d2v.dv.most_similar([embeddings]) #gives you top 10 document tags and their cosine similarity

    # Get recipes
    best_recipes_index = [int(output[0]) for output in best_recipes]

    # Get dDtaFrame
    df = get_df_from_db(cuisine)

    return df[df.index.isin(best_recipes_index)].head(top_k)

## Cuisine Multiclass Classifier Using BERT

#### The BERT model is trained in a seperate file, hence we load the downloaded model here for prediction of cuisines

In [22]:
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFBertModel

In [23]:
cuisine_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/cuisine_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }


def make_prediction(model, processed_data, classes=['Brazilian', 'British', 'Cajun Creole', 'Chinese', 'Filipino', 'French', 'Greek', 'Indian','Irish', 'Italian', 'Jamaican', 'Japanese', 'Korean', 'Mexican', 'Moroccan','Russian', 'Southern US','Spanish', 'Thai', 'Vietnamese'], top_k=5):
    probs = model.predict(processed_data)[0]
    top_indices = np.argsort(probs)[-top_k:][::-1]
    top_predictions = [(classes[i], probs[i]) for i in top_indices]
    return top_predictions

def predict_cuisine():
    input_text = input('Enter ingredients here: ')
    processed_data = prepare_data(input_text, tokenizer)
    results = make_prediction(cuisine_model, processed_data=processed_data)

    # Extract cuisine names from results
    cuisines = [cuisine for cuisine, _ in results]

    # Print the cuisine names
    # for cuisine in cuisines:
    print(type(cuisines))
    return cuisines

## Predict Using BERT Classifier

In [32]:
predict_cuisine()

Enter ingredients here: pasta, garlic, parmesan, egg
<class 'list'>


['Italian', 'Southern US', 'Mexican', 'Filipino', 'Cajun Creole']

In [33]:
get_similar_recipes('pasta, garlic, parmesan, egg', 'italian')

Unnamed: 0,title,instructions,ingredients,ingredients_query
566,Three Cheese Baked Pasta,Bring a large pot of lightly salted water to a...,"['1 pound uncooked pasta ADVERTISEMENT', '1 po...",uncooked pasta ground beef onion chopped toma...
1650,Pasta and Garlic,In a large pot of salted water boil pasta unti...,"['1 1/2 pounds pasta ADVERTISEMENT', '1/4 cup ...",pasta olive oil clove crushed garlic taste gr...
2482,College Student's Gourmet Pasta,Cook pasta in boiling salted water until tende...,"['16 ounces pasta ADVERTISEMENT', '2 tablespoo...",pasta butter garlic milk parmesan cheese
