In [None]:
import pandas as pd
import numpy as  np
import re

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean pooling: average the token embeddings, weighted by the attention mask so masked-out positions do not contribute
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, weighted by the attention mask.

    Args:
        model_output: indexable model output whose first element holds the
            per-token embeddings.
        attention_mask: mask tensor; it is given a trailing axis and expanded
            to the size of the token embeddings before being applied.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total, where
        the divisor is clamped to at least 1e-9 so an all-zero mask does not
        divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / mask_total


# Smoke test for the bi-encoder: embed two example sentences end to end.
# The tokenizer and model loaded here (bi_enc_tokenizer / bi_enc_model) are reused
# later by get_bi_enc_embedding.
sentences = ['This is an example sentence', 'Each sentence is converted']

# Load the tokenizer and encoder weights from the HuggingFace Hub.
bi_enc_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
bi_enc_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenize both sentences as one padded, truncated batch of PyTorch tensors.
encoded_input = bi_enc_tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Forward pass without gradient tracking (inference only).
with torch.no_grad():
    model_output = bi_enc_model(**encoded_input)

# Collapse the per-token embeddings into one vector per sentence.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# L2-normalise each row. Note that the name sentence_embeddings is reassigned
# in a later cell to the stacked recipe embeddings.
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# print("Sentence embeddings:")
# print(sentence_embeddings)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the cross-encoder and its tokenizer; both are reused later by get_cr_enc_score.
cr_enc_model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-12-v2')
cr_enc_tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-12-v2')

# Smoke test: the same question is paired with two candidate passages. The two lists
# are passed positionally, so element i of the first list is encoded together with
# element i of the second.
features = cr_enc_tokenizer(['How many people live in Berlin?', 'How many people live in Berlin?'], ['Berlin has a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.', 'New York City is famous for the Metropolitan Museum of Art.'],  padding=True, truncation=True, return_tensors="pt")

# Switch to evaluation mode and score without gradient tracking.
# NOTE(review): presumably one logit per (question, passage) pair, higher meaning
# more relevant — confirm against the model card.
cr_enc_model.eval()
with torch.no_grad():
    scores = cr_enc_model(**features).logits
    print(scores)



In [None]:
# Load the raw recipes table from the Kaggle input dataset. The path is absolute
# and only resolves inside a Kaggle kernel with this dataset attached.
raw_recipe_df = pd.read_csv("/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv")

In [None]:

from urllib.parse import quote_plus

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi


from kaggle_secrets import UserSecretsClient

# Credentials are read from Kaggle's secret store so they never appear in the notebook.
user_secrets = UserSecretsClient()
mongo_un = user_secrets.get_secret("mongo_un")
mongo_pw = user_secrets.get_secret("mongo_pw")


# A MongoDB URI requires the username and password to be percent-escaped; without
# quote_plus, a credential containing reserved characters such as '@', ':', '/' or
# '%' produces an invalid URI.
uri = (
    f"mongodb+srv://{quote_plus(mongo_un)}:{quote_plus(mongo_pw)}"
    "@cluster0.vcbeq3r.mongodb.net/?retryWrites=true&w=majority&appName=cluster0"
)
mongo_client = MongoClient(uri, server_api=ServerApi('1'))


# MongoDB connection setup
# client = MongoClient("mongodb://localhost:27017/")  # MongoDB URI
db = mongo_client["NDR"]  # database
collection = db["test1"]  # collection


def get_processed_recipe(db_name, collection_name):
    """Fetch every document of a MongoDB collection as a pandas DataFrame.

    Args:
        db_name: name of the database on the shared ``mongo_client``.
        collection_name: name of the collection inside that database.

    Returns:
        A DataFrame built from all documents in the collection, or ``None``
        when the collection holds no documents.
    """
    # Materialise the cursor straight away so the documents can be counted.
    documents = list(mongo_client[db_name][collection_name].find())

    if not documents:
        return None
    return pd.DataFrame(documents)

def get_bi_enc_embedding(sentence):
    """Embed text with the bi-encoder and return the mean-pooled vector.

    Args:
        sentence: text handed directly to ``bi_enc_tokenizer``.

    Returns:
        The mean-pooled embedding tensor with all size-1 dimensions squeezed
        out. The result is not L2-normalised here.
    """
    tokens = bi_enc_tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

    # Inference only, so the forward pass runs without gradient tracking.
    with torch.no_grad():
        encoder_output = bi_enc_model(**tokens)

    pooled = mean_pooling(encoder_output, tokens['attention_mask'])
    return pooled.squeeze()


def get_cr_enc_score(s1, s2):
    """Score one text pair with the cross-encoder.

    Args:
        s1: first text of the pair.
        s2: second text of the pair.

    Returns:
        The ``logits`` tensor the cross-encoder produces for the single
        ``[s1, s2]`` pair.
    """
    # Make sure the model is in evaluation mode before scoring.
    cr_enc_model.eval()
    pair_inputs = cr_enc_tokenizer([[s1, s2]], padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        return cr_enc_model(**pair_inputs).logits


In [None]:
# Regexes for the query formats found in the LLM output, tried in order; the first
# pattern that yields any match supplies the whole query list for that record.
# The unquoted catch-all is deliberately last: it also matches text such as
# `Query 1: ** "..."`, so trying it earlier would pre-empt the more specific quoted
# patterns and return captures that still carry the markdown asterisks and quotes.
QUERY_PATTERNS = [
    r'(?<=\d\.\s)"([^"]+)"',                   # 1. "query"
    r'\*\*Query\s\d+:\*\*\s*"([^"]+)"',        # **Query 1:** "query"
    r'Query\s\d+:\s*"([^"]+)"',                # Query 1: "query"
    r'\*\*Query:\*\*\s*"([^"]+)"',             # **Query:** "query"
    r'\*\*Query\s\d+\*\*\s*"([^"]+)"',         # **Query 1** "query"
    r'\*\*Query\s\d+:\s*\*\*\s*"([^"]+)"',     # **Query 1: ** "query"
    r'Query\s\d+:\s*(.*?)(?=\n\n|Query\s|$)',  # Query 1: unquoted text
]


def _extract_queries(llm_output):
    """Return the queries parsed from one LLM response.

    Args:
        llm_output: raw response text. A non-string value (for example a NaN
            produced by pandas for a missing field) yields an empty list
            instead of a TypeError from ``re.findall``.

    Returns:
        The matches of the first pattern in ``QUERY_PATTERNS`` that matches at
        least once, or an empty list when none of them match.
    """
    if not isinstance(llm_output, str):
        return []
    for pattern in QUERY_PATTERNS:
        matches = re.findall(pattern, llm_output)
        if matches:
            return matches
    return []


query_df = get_processed_recipe('NDR', 'test3')

# get_processed_recipe returns None for an empty collection; fail with a clear
# message rather than an AttributeError on None further down.
if query_df is None:
    raise ValueError("No documents found in NDR.test3; there are no LLM outputs to parse.")

# One record per source document: its recipe id plus the queries parsed from it.
output_list = [
    {'recipe_id': recipe_id, 'query_list': _extract_queries(llm_output)}
    for recipe_id, llm_output in zip(query_df['recipe_id'], query_df['llm_output'])
]

# The cells below read expanded_df and raw_recipe_df_temp, so they are built here
# to keep the notebook runnable from a fresh kernel.
# One row per (recipe_id, query); records with no parsed query explode to NaN and
# are dropped.
expanded_df = pd.DataFrame(output_list).explode("query_list").reset_index(drop=True)
expanded_df = expanded_df[~expanded_df['query_list'].isna()]

# Keep only the raw recipes that have at least one query; ids are compared as strings.
raw_recipe_df_temp = raw_recipe_df[raw_recipe_df.id.astype(str).isin(list(expanded_df['recipe_id']))]

# Positional indices used later to address rows and columns of the similarity matrix.
expanded_df = expanded_df.reset_index(drop=True).reset_index().rename(columns={'index': 'query_index'})
raw_recipe_df_temp = raw_recipe_df_temp.reset_index(drop=True).reset_index().rename(columns={'index': 'recipe_index'})

In [None]:
# Spot check: show the parsed record at position 13 of output_list (arbitrary index).
output_list[13]

In [None]:
# Spot check: print the raw LLM output stored in the row at position 10 of query_df.
print(query_df.iloc[10]['llm_output'])

In [None]:
# expanded_df

In [None]:
# bi_enc_model

# bi_enc_tokenizer

# Serialise each recipe row into a single string for the bi-encoder. The columns this
# cell itself creates are excluded first, so re-running the cell does not fold the
# previous sentence and embedding back into the text. On a first run nothing is
# dropped and the text is unchanged.
recipe_fields = raw_recipe_df_temp.drop(columns=['sentence', 'sentence_embedding'], errors='ignore')
raw_recipe_df_temp['sentence'] = recipe_fields.apply(lambda x: str(dict(x)), axis=1)

# Embed every recipe sentence and every query, one text at a time.
raw_recipe_df_temp['sentence_embedding'] = raw_recipe_df_temp['sentence'].apply(get_bi_enc_embedding)
expanded_df['query_embedding'] = expanded_df['query_list'].apply(get_bi_enc_embedding)

# Stack the per-row vectors into one tensor each: row i of query_embeddings belongs to
# row i of expanded_df, and likewise for sentence_embeddings and raw_recipe_df_temp.
query_embeddings = torch.stack(list(expanded_df['query_embedding'].values))
sentence_embeddings = torch.stack(list(raw_recipe_df_temp['sentence_embedding'].values))

In [None]:
# Report the shapes of the stacked query and recipe embeddings as a sanity check.
print(query_embeddings.shape)
print(sentence_embeddings.shape)

# L2-normalise each row so that a plain matrix product of the two results gives
# cosine similarities.
# NOTE(review): the shapes in the trailing comments are from an earlier run and depend
# on the data loaded — confirm against the printed shapes above.
query_norm = torch.nn.functional.normalize(query_embeddings, p=2, dim=1)  # [2210, 384]
sentence_norm = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)  # [498, 384]

# The pairwise cosine-similarity matrix itself is computed in the next cell via
# matrix multiplication.

# print(cosine_similarities.shape)  # Output: torch.Size([2210, 498])

In [None]:

# Both inputs are row-normalised, so entry [q, r] of the product is the cosine
# similarity between query q and recipe r.
cosine_similarities = torch.mm(query_norm, sentence_norm.T)

# Every (recipe, query) combination as one row.
cross_query_sentence = pd.merge(raw_recipe_df_temp, expanded_df, how='cross')

# Look up all pair similarities in one vectorised gather. This stores plain floats in
# the column instead of 0-d tensors in an object column, and avoids one Python call
# per (recipe, query) pair.
pair_query_idx = torch.tensor(cross_query_sentence['query_index'].to_numpy(), dtype=torch.long)
pair_recipe_idx = torch.tensor(cross_query_sentence['recipe_index'].to_numpy(), dtype=torch.long)
cross_query_sentence['similarity'] = (
    cosine_similarities[pair_query_idx, pair_recipe_idx].detach().cpu().numpy()
)


In [None]:
# 1 where the pair's recipe_id equals the recipe's id (compared as strings), else 0.
cross_query_sentence['recipe_match'] = (
    cross_query_sentence['id'].astype(str) == cross_query_sentence['recipe_id']
).astype(int)

In [None]:
# Independent copy restricted to the pairs whose query_index is below 100.
low_query_index = cross_query_sentence['query_index'] < 100
cross_query_sentence_sub = cross_query_sentence[low_query_index].copy(deep=True)

In [None]:
# Order the pairs by query, best similarity first, then keep the leading rows of
# each query group: ten for top_10 and one for top_1.
ranked_pairs = cross_query_sentence_sub.sort_values(
    by=["query_index", "similarity"],
    ascending=[True, False],
)
ranked_by_query = ranked_pairs.groupby('query_index')
top_10 = ranked_by_query.head(10)
top_1 = ranked_by_query.head(1)

In [None]:
# Per query_index, total the recipe_match flags of the rows in `out`. The aggregated
# result is unnamed, so reset_index() labels the totals column 0.
# NOTE(review): `out` is not assigned in any cell visible here, so this raises
# NameError on a fresh kernel — presumably it should be `top_10` (or `top_1`); confirm.
out_summary = out.groupby('query_index').apply(lambda x: x['recipe_match'].sum()).reset_index()

In [None]:
# cross_query_sentence[(cross_query_sentence['query_index'].isin(list(out_summary[out_summary[0] == 0]['query_index']))) & (cross_query_sentence['recipe_match'] == 1)]

In [None]:
# Show the top-1 rows of the queries whose recipe_match total in out_summary is zero.
zero_match_queries = out_summary.loc[out_summary[0] == 0, 'query_index'].tolist()
top_1[top_1['query_index'].isin(zero_match_queries)]

In [None]:
# Display the highest-similarity recipe row for each query in the subset.
top_1

In [None]:
# cross_query_sentence.to_csv("cross_query_sentence.csv", index = False)

In [None]:
# cross_query_sentence['recipe_match'] = (cross_query_sentence['id'].astype(str) == cross_query_sentence['recipe_id']).astype(int)

In [None]:
# cross_query_sentence_sub.shape

In [None]:
# from tqdm import tqdm

# tqdm.pandas()

# cross_query_sentence_sub['cross_similarity'] = cross_query_sentence_sub.apply(lambda x: get_cr_enc_score(x['sentence'],x['query_list']) , axis = 1)

In [None]:

# cross_query_sentence_sub['cross_similarity_v1'] =  cross_query_sentence_sub['cross_similarity'].apply(lambda x: x[0][0])