In [None]:
import pandas as pd
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import re
import numpy as np


from urllib.parse import quote_plus

from kaggle_secrets import UserSecretsClient

# Credentials are read from Kaggle's secret store so they never appear in the notebook.
user_secrets = UserSecretsClient()
mongo_un = user_secrets.get_secret("mongo_un")
mongo_pw = user_secrets.get_secret("mongo_pw")


# PyMongo requires the username and password embedded in a connection string to
# be percent-escaped; without this, characters such as '@', ':', '/' or '%' in a
# secret make the URI unparsable.
# NOTE(review): assumes the stored secrets are the raw (not already escaped) values.
uri = (
    f"mongodb+srv://{quote_plus(mongo_un)}:{quote_plus(mongo_pw)}"
    "@cluster0.vcbeq3r.mongodb.net/?retryWrites=true&w=majority&appName=cluster0"
)
# Create the client, pinned to version 1 of the server API.
mongo_client = MongoClient(uri, server_api=ServerApi('1'))


# Module-level handles to the "NDR" database and its "test1" collection.
db = mongo_client["NDR"]
collection = db["test1"]


def get_processed_recipe(db_name, collection_name):
    """Load every document of a MongoDB collection into a DataFrame.

    Args:
        db_name: Name of the database to read from on `mongo_client`.
        collection_name: Name of the collection inside that database.

    Returns:
        A `pandas.DataFrame` with one row per document, or `None` when the
        collection contains no documents.
    """
    # Drain the cursor up front so emptiness can be checked before building a frame.
    records = list(mongo_client[db_name][collection_name].find())

    if not records:
        return None
    return pd.DataFrame(records)

def get_bi_enc_embedding(sentence):
    """Embed `sentence` with the bi-encoder and return the pooled result.

    The input is tokenized with `bi_enc_tokenizer` (padding and truncation on,
    PyTorch tensors requested), passed through `bi_enc_model` with gradient
    tracking disabled, and reduced by `mean_pooling` using the attention mask.
    Size-1 dimensions are squeezed out of the pooled tensor before returning.

    NOTE(review): `bi_enc_tokenizer`, `bi_enc_model`, `mean_pooling` and `torch`
    must already exist as notebook globals; none is defined in this cell.
    """
    tokens = bi_enc_tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')

    # Inference only: no autograd bookkeeping for the forward pass.
    with torch.no_grad():
        hidden = bi_enc_model(**tokens)

    pooled = mean_pooling(hidden, tokens['attention_mask'])
    return pooled.squeeze()


def get_cr_enc_score(s1, s2):
    """Score the text pair (`s1`, `s2`) with the cross-encoder.

    The pair is tokenized as a single-element batch by `cr_enc_tokenizer`, the
    model is switched to eval mode, and the forward pass runs without gradient
    tracking.

    Returns:
        The `logits` attribute of the model output for the one-pair batch.

    NOTE(review): `cr_enc_tokenizer`, `cr_enc_model` and `torch` must already
    exist as notebook globals; none is defined in this cell.
    """
    pair_batch = cr_enc_tokenizer([[s1, s2]],  padding=True, truncation=True, return_tensors="pt")

    # Side effect: the shared model is left in eval mode after this call.
    cr_enc_model.eval()
    with torch.no_grad():
        return cr_enc_model(**pair_batch).logits


In [None]:
raw_recipe_df = pd.read_csv("/kaggle/input/food-com-recipes-and-user-interactions/RAW_recipes.csv")

# Layouts the LLM uses when listing its generated queries, tried in order until
# one produces a non-blank match. The quoted, more specific forms come first;
# the permissive unquoted fallback is deliberately LAST, because it also matches
# the bold/quoted layouts and would otherwise capture their '**' and quote
# characters as part of the query text.
_QUERY_PATTERNS = (
    r'(?<=\d\.\s)"([^"]+)"',                    # 1. "query"
    r'\*\*Query\s\d+:\*\*\s*"([^"]+)"',         # **Query 1:** "query"
    r'Query\s\d+:\s*"([^"]+)"',                 # Query 1: "query"
    r'\*\*Query:\*\*\s*"([^"]+)"',              # **Query:** "query"
    r'\*\*Query\s\d+\*\*\s*"([^"]+)"',          # **Query 1** "query"
    r'\*\*Query\s\d+:\s*\*\*\s*"([^"]+)"',      # **Query 1: ** "query"
    r'Query\s\d+:\s*(.*?)(?=\n\n|Query\s|$)',   # Query 1: query   (unquoted fallback)
)


def _extract_queries(llm_output):
    """Return the generated queries found in one raw LLM response.

    Args:
        llm_output: The response text. Anything that is not a string (for
            example NaN from a missing field) yields an empty list.

    Returns:
        The captures of the first pattern in `_QUERY_PATTERNS` that produces at
        least one non-blank capture, or `[]` when no pattern matches.
    """
    if not isinstance(llm_output, str):
        return []
    for pattern in _QUERY_PATTERNS:
        # The fallback pattern can capture empty strings; those are not usable queries.
        queries = [q for q in re.findall(pattern, llm_output) if q.strip()]
        if queries:
            return queries
    return []


query_df = get_processed_recipe('NDR','test3')
if query_df is None:
    # get_processed_recipe returns None for an empty collection; fail with a clear message.
    raise ValueError("Collection 'NDR.test3' returned no documents; there are no LLM outputs to parse.")

# One record per source document: its recipe id and the list of extracted queries.
output_list = [
    {'recipe_id': recipe_id, 'query_list': _extract_queries(llm_output)}
    for recipe_id, llm_output in zip(query_df['recipe_id'], query_df['llm_output'])
]

# One row per (recipe_id, query); documents with no extracted query explode to NaN and are dropped.
expanded_df = pd.DataFrame(output_list).explode("query_list").reset_index(drop=True)
expanded_df = expanded_df[~expanded_df['query_list'].isna()]

# Keep only recipes that have at least one query.
# NOTE(review): recipe_id is compared against the CSV id cast to str, so this
# assumes recipe_id is stored as a string in MongoDB -- confirm.
raw_recipe_df_temp = raw_recipe_df[raw_recipe_df.id.astype(str).isin(expanded_df['recipe_id'].unique())]

# Give both frames a dense positional key.
expanded_df = expanded_df.reset_index(drop = True).reset_index().rename(columns = {'index':'query_index'})
raw_recipe_df_temp = raw_recipe_df_temp.reset_index(drop = True).reset_index().rename(columns = {'index':'recipe_index'})

# Serialise each recipe row (all columns, including recipe_index) as the text of its dict.
raw_recipe_df_temp['sentence'] = raw_recipe_df_temp.apply(lambda x: str(dict(x)),axis =1)


In [None]:
# How many non-matching recipes to keep per query, and the seed that makes the
# selection repeatable across Restart & Run All.
NEGATIVES_PER_QUERY = 2
NEGATIVE_SAMPLING_SEED = 42

# Pair every recipe with every query (this materialises recipes x queries rows).
cross_query_sentence = pd.merge(raw_recipe_df_temp, expanded_df, how='cross')
# Label 1 when the query was generated from this recipe, else 0.
cross_query_sentence['recipe_match'] = (cross_query_sentence['recipe_id'] == cross_query_sentence['id'].astype(str)).astype(int)
cross_query_sentence_macthed = cross_query_sentence[cross_query_sentence['recipe_match'] == 1]
cross_query_sentence_unmacthed = cross_query_sentence[cross_query_sentence['recipe_match'] == 0]

# Random negatives: shuffle once with a fixed seed, then keep the first
# NEGATIVES_PER_QUERY rows of each query. Unlike a per-group `sample(n=2)`, this
# is reproducible and does not raise when a query has fewer candidates than
# requested -- it simply keeps what is available.
cross_query_sentence_unmacthed_random = (
    cross_query_sentence_unmacthed
    .sample(frac=1, random_state=NEGATIVE_SAMPLING_SEED)
    .groupby('query_index', sort=False)
    .head(NEGATIVES_PER_QUERY)
    .sort_values('query_index', kind='stable')
    .reset_index(drop=True)
)



In [None]:
# Training frame: every matched (positive) pair stacked on top of the sampled unmatched ones.
whole_data = pd.concat(
    objs=[cross_query_sentence_macthed, cross_query_sentence_unmacthed_random],
)


In [None]:
# Sanity check: display the first row of the combined training frame.
whole_data.head(1)

In [None]:
# Reduce the frame to the three fields used for training, under the names the
# dataset class reads, and convert it to a list with one dict per row.
data = (
    whole_data[['sentence', 'query_list', 'recipe_match']]
    .rename(columns={
        'sentence': 'document',
        'query_list': 'query',
        'recipe_match': 'label',
    })
    .to_dict(orient='records')
)

In [None]:
# Inspect the first training record.
data[0]

In [None]:
import torch
# AdamW is taken from PyTorch: the `transformers.AdamW` implementation is
# deprecated in favour of `torch.optim.AdamW` and is not importable from newer
# transformers releases. Note that the PyTorch defaults differ slightly
# (eps=1e-8, weight_decay=0.01).
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Define the model and tokenizer
model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=1: the classification head emits a single logit per (query, document) pair.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Define a custom dataset
class CustomDataset(Dataset):
    """Tokenized (query, document, label) pairs for cross-encoder fine-tuning.

    Args:
        data: Sequence of dicts with the keys 'query', 'document' and 'label'.
        tokenizer: Callable that encodes a text pair and returns a mapping of
            tensors with a leading batch dimension of 1.
        max_length: Length every encoded pair is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode example `idx` and return its tensors.

        Returns:
            A dict with 'input_ids', 'attention_mask' and a float 'label', plus
            'token_type_ids' whenever the tokenizer produces them.
        """
        example = self.data[idx]

        encoded = self.tokenizer(
            example['query'],
            example['document'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {
            # squeeze(0) removes the batch dimension added by return_tensors='pt';
            # the DataLoader re-batches the items itself.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(example['label'], dtype=torch.float)  # Labels should be float for BCEWithLogitsLoss
        }
        # Keep the segment ids that tell the model which tokens belong to the
        # query and which to the document, so training sees the same inputs the
        # tokenizer provides at inference time.
        if 'token_type_ids' in encoded:
            item['token_type_ids'] = encoded['token_type_ids'].squeeze(0)
        return item

# Load and preprocess your dataset
# data = [
#     {"query": "What is AI?", "document": "AI stands for artificial intelligence.", "label": 1},
#     {"query": "What is AI?", "document": "AI is unrelated to natural intelligence.", "label": 0}
# ]

# Wrap the records in the dataset and serve them in shuffled mini-batches of 8.
train_dataset = CustomDataset(data, tokenizer)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)

# Binary relevance objective on the raw logit, optimised with AdamW.
optimizer = AdamW(
    params=model.parameters(),
    lr=5e-5,
)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Training loop
def train(model, dataloader, optimizer, loss_fn, device, num_epochs=3):
    """Fine-tune `model` on the batches produced by `dataloader`.

    Args:
        model: Module whose forward pass returns an object with a `logits`
            attribute.
        dataloader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label', and optionally
            'token_type_ids'.
        optimizer: Optimizer already bound to the model's parameters.
        loss_fn: Callable taking (logits, labels) and returning a scalar loss.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over `dataloader` (default 3).

    Side effects:
        Puts the model in train mode, moves it to `device`, updates its
        parameters, and prints the mean loss after every epoch.
    """
    model.train()
    model.to(device)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            model_inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            # Forward the segment ids when the dataset supplies them, so the
            # model can tell query tokens from document tokens.
            if 'token_type_ids' in batch:
                model_inputs['token_type_ids'] = batch['token_type_ids'].to(device)
            labels = batch['label'].to(device).unsqueeze(1)  # Match logits shape

            optimizer.zero_grad()

            logits = model(**model_inputs).logits
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        # Counting batches (rather than len(dataloader)) avoids a division by
        # zero on an empty loader and works for loaders without __len__.
        mean_loss = epoch_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch + 1}: Loss = {mean_loss}")

# Run training
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)  # Ensure model is moved to the correct device
train(
    model=model,
    dataloader=train_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
)

# Persist the fine-tuned weights together with the tokenizer files in one directory.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
