In [None]:
# Load libraries
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma
import pandas as pd

In [None]:
# Read the Indonesian recipes table from CSV and display it
indonesian_recipes = pd.read_csv(
    filepath_or_buffer="../csv/final/indonesian_recipes.csv",
)
indonesian_recipes

In [None]:
# Remove the "FlavorProfile" column, then display the frame.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
indonesian_recipes = indonesian_recipes.drop(columns="FlavorProfile", errors="ignore")
indonesian_recipes

In [None]:
# Create a new DataFrame 'recipes' with one text entry per recipe:
#   "<Title> x$x <Ingredients> x$x <DominantFlavor>"
# `separator` is the marker used to split the title back out of an entry.
separator = " x$x "

# Missing ingredients become an empty string so that the concatenation below
# does not turn the whole entry into NaN; "--" delimiters are replaced by spaces.
ingredients_text = (
    indonesian_recipes["Ingredients"]
    .fillna("")
    .astype(str)
    .str.replace("--", " ", regex=False)
)

recipes = pd.DataFrame({
    "Database_Entry": (
        indonesian_recipes["Title"].astype(str) + separator +
        ingredients_text + separator +
        indonesian_recipes["DominantFlavor"].astype(str)
    )
})

# Preview the first entry by position (works regardless of the index labels)
recipes["Database_Entry"].iloc[0]

In [None]:
# Smoke-test the sentence-embedding model on two sample sentences
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = model.encode(sentences)
print(embeddings)

In [None]:
# Create an embedding document for recipes: one recipe entry per line.
# The lines are written directly instead of through DataFrame.to_csv, because
# CSV quoting wraps any entry containing a double quote in extra quotes (and
# doubles the inner ones), so the stored text would no longer match the
# DataFrame values.
db_path = "../vector-db"
recipe_lines = (
    recipes["Database_Entry"]
    .dropna()                                   # skip entries that could not be built
    .astype(str)
    .str.replace(r"[\r\n]+", " ", regex=True)   # keep exactly one line per recipe
)
with open(f"{db_path}/recipe_document.txt", "w", encoding="utf-8") as recipe_file:
    recipe_file.writelines(f"{line}\n" for line in recipe_lines)
print("Successfully created recipe_document.txt")

In [None]:
# Split documents: load recipe_document.txt and split it on newlines so that
# each line of the file becomes its own document.
raw = TextLoader(f"{db_path}/recipe_document.txt", encoding="utf-8").load()
# chunk_size=1 (the smallest positive size) forces a split at every separator;
# chunk_size=0 is rejected with ValueError by recent langchain-text-splitters
# releases.
text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator="\n")
documents = text_splitter.split_documents(raw)

In [None]:
# Display the first element of `documents` as a quick sanity check
documents[0]

In [None]:
# Build the Chroma vector store from the split documents, embedding them with
# a HuggingFace sentence-transformers model running on CPU.
from langchain_huggingface import HuggingFaceEmbeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False}
)
# from_documents is called on the class itself; instantiating Chroma() first
# only created an unused store object.
# NOTE(review): re-running this cell may add the same documents again to the
# default collection — confirm, and restart the kernel before rebuilding if so.
vector_db = Chroma.from_documents(
    documents,
    embedding=embedding_model,
)

In [None]:
# Sanity-check retrieval: fetch the 5 closest documents for a sample query
test_query = "A simple seafood dish with slightly savoury flavor"
results = vector_db.similarity_search(query=test_query, k=5)
results

In [None]:
# Create function to retrieve relevant dataframe values per query
def find_recipe(query, top_k=10, fetch_k=50) -> pd.DataFrame:
    """Return the recipes most similar to ``query``, best match first.

    Args:
        query: Free-text description of the desired recipe.
        top_k: Maximum number of rows to return.
        fetch_k: Number of candidate documents requested from the vector
            store before mapping them back to rows (never less than
            ``top_k``).

    Returns:
        Rows of ``indonesian_recipes`` whose "Title" matches a retrieved
        document, ordered by retrieval rank and limited to ``top_k`` rows.
        An empty frame (same columns) when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return indonesian_recipes.iloc[0:0]

    candidates = vector_db.similarity_search(query, k=max(fetch_k, top_k))

    # The title is everything before the first separator of an entry; keep
    # the best (lowest) rank when the same title is retrieved more than once.
    title_rank = {}
    for rank, candidate in enumerate(candidates):
        title = candidate.page_content.split(separator, 1)[0]
        title_rank.setdefault(title, rank)

    # Titles were stringified when the entries were built, so compare as str.
    titles = indonesian_recipes["Title"].astype(str)
    matches = indonesian_recipes[titles.isin(list(title_rank))]

    # Reorder positionally (safe with duplicate index labels) by retrieval
    # rank; the stable sort keeps the original order among rows sharing a title.
    ranks = matches["Title"].astype(str).map(title_rank).to_numpy()
    return matches.iloc[ranks.argsort(kind="stable")].head(top_k)

In [None]:
# Try the recipe lookup with a sample natural-language request
find_recipe(query="A simple chicken recipe that I can make in 5 minutes")