In [1]:
# Install dependencies
!pip install faiss-cpu transformers sentence-transformers datasets

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [2]:
# Import libraries
import os
import json
import pandas as pd
import faiss
import ast
import pickle
import numpy as np
from datasets import Dataset
from google.colab import files
from sentence_transformers import SentenceTransformer
from transformers import pipeline

In [3]:
# Load dataset from Google Drive
!gdown --id 1DKLke-54G6-s2_1ZKcO1jxOiEH7FpXqq

Downloading...
From (original): https://drive.google.com/uc?id=1DKLke-54G6-s2_1ZKcO1jxOiEH7FpXqq
From (redirected): https://drive.google.com/uc?id=1DKLke-54G6-s2_1ZKcO1jxOiEH7FpXqq&confirm=t&uuid=2f9f7ee3-34ec-4acf-b8e7-ecd8b09f9de2
To: /content/dataset.csv
100% 2.29G/2.29G [00:34<00:00, 66.7MB/s]


In [4]:
# Read the downloaded CSV into a DataFrame and preview the first rows.
# low_memory=False asks pandas to parse the file in one pass instead of in
# internal chunks.
df = pd.read_csv("dataset.csv", sep=",", low_memory=False)
df.head()

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER
0,0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [5]:
# Cleaning data
# Remove the leftover index column and the `source` column.
df = df.drop(["Unnamed: 0", "source"], axis=1)

def clean_list_column(col):
    """Flatten a column of stringified Python lists into plain text.

    Each cell is handled as follows:

    * a string that ``ast.literal_eval`` parses into a list or tuple whose
      items are all strings is replaced by those items joined with single
      spaces;
    * any other string (not a valid literal, a literal that is not a
      list/tuple, or a list containing non-string items) is returned
      unchanged;
    * a non-string cell (for example NaN or None) becomes ``""``.

    Parameters
    ----------
    col : pandas.Series
        Column whose cells are expected to hold list literals as text.

    Returns
    -------
    pandas.Series
        Result of applying the rules above to every cell via ``col.apply``.
    """
    def safe_parse(x):
        if not isinstance(x, str):
            return ""
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: keep the raw text. Only the errors that
            # literal_eval raises on bad input are caught, so that
            # KeyboardInterrupt / SystemExit still propagate.
            return x
        # Joining a bare string literal would space out its characters, and
        # joining a dict/set would use its keys/elements, so only real
        # sequences are flattened.
        if not isinstance(parsed, (list, tuple)):
            return x
        try:
            return " ".join(parsed)
        except TypeError:
            # The sequence holds non-string items; leave the cell untouched.
            return x
    return col.apply(safe_parse)

# Flatten the stringified list columns into plain space-separated text.
for list_col in ("ingredients", "directions", "NER"):
    df[list_col] = clean_list_column(df[list_col])

# One text field per recipe; this is what gets embedded later.
df["text"] = df["title"] + ". Ingredients: " + df["ingredients"] + ". Directions: " + df["directions"]

# Discard rows whose combined text is missing (e.g. a null title makes the
# concatenation null) or blank.
df = df[df["text"].notnull() & (df["text"].str.strip() != "")]

SAMPLE_SIZE = 20000  # number of recipes kept for embedding
SAMPLE_SEED = 42     # fixed seed so the same subset is drawn on every run

# Cap the sample at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED).reset_index(drop=True)
df.to_pickle("cleaned_recipes.pkl")
df.head()

Unnamed: 0,title,ingredients,directions,link,NER,text
0,Ice Cream Krispies,12 cup butter 1 cup brown sugar 6 cups crisp r...,Prepare sauce: Cut strawberries in half. In a ...,www.food.com/recipe/ice-cream-krispies-223031,butter brown sugar crisp rice coconut nuts van...,Ice Cream Krispies. Ingredients: 12 cup butter...
1,Croatian Bean Stew With Smoked Pork Ribs,1 2/3 cups pinto beans 1 bay leaf 1 9/16 pound...,Cook beans together with a bay leaf in unsalte...,www.yummly.com/recipe/Croatian-Bean-Stew-With-...,pinto beans bay leaf pork celery root parsley ...,Croatian Bean Stew With Smoked Pork Ribs. Ingr...
2,Glazed Carrots,3 to 4 carrots 1 1/2 Tbsp. butter 1/3 c. brown...,Cook 3 to 4 carrots; cut crosswise in 1-inch p...,www.cookbooks.com/Recipe-Details.aspx?id=1011892,carrots butter brown sugar lemon rind,Glazed Carrots. Ingredients: 3 to 4 carrots 1 ...
3,Moms Pie Dough,4.5 Cups Flour 1.5 Tsp Salt Pinch Baking Powde...,Mix all dry ingredients in a bowl. Add crisco...,www.epicurious.com/recipes/member/views/moms-p...,Flour Salt Baking Powder Sugar Crisco egg vine...,Moms Pie Dough . Ingredients: 4.5 Cups Flour 1...
4,Pretzel Salad Or Dessert,2 c. crushed small thin pretzels (sticks) 3/4 ...,"Mix and press in baking pan, approximately 13 ...",www.cookbooks.com/Recipe-Details.aspx?id=106723,thin pretzels margarine,Pretzel Salad Or Dessert. Ingredients: 2 c. cr...


In [6]:
# Embedding
# Collect the recipe texts first, then load the sentence encoder, keep a
# local copy of it under ./model, and encode the whole corpus.
corpus = df["text"].tolist()

model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)
model.save("model")

embeddings = model.encode(
    corpus,
    convert_to_numpy=True,
    show_progress_bar=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [7]:
# FAISS Index
# Build a flat L2 index whose dimensionality matches the embedding width,
# then add every embedding vector to it.
n_vectors, dimension = embeddings.shape
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [8]:
# Persist the FAISS index to disk.
faiss.write_index(index, "recipe_faiss.index")

# Persist the corpus texts in a separate pickle file.
with open("recipe_texts.pkl", "wb") as texts_file:
    pickle.dump(corpus, texts_file)