In [None]:
!pip install faiss-cpu

import faiss




In [None]:
!pip uninstall transformers

Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.11/dist-packages/transformers-4.51.3.dist-info/*
    /usr/local/lib/python3.11/dist-packages/transformers/*
Proceed (Y/n)? n


In [None]:
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer


In [None]:
# Pick the GPU when one is visible to torch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Load the sentence-embedding model. "all-mpnet-base-v2" is the active choice;
# the commented-out lines are alternative checkpoints kept for reference.
# NOTE(review): `model` is rebound later in the notebook by the LoRA cells
# (`model = get_peft_model(...)`), so this SentenceTransformer instance is
# not the object used for training or retrieval further down.
# model = SentenceTransformer('intfloat/e5-large')
# model = SentenceTransformer("moka-ai/m3e-base")
model = SentenceTransformer("all-mpnet-base-v2")

# model = SentenceTransformer("BAAI/bge-base-en-v1.5")
model = model.to(device)

# -----------------------------
# LOAD PRODUCT CATALOG
# -----------------------------
# Keep one row per product_variant_id.
catalog_df = pd.read_csv("/content/drive/MyDrive/Zepto IDC Query Classification/Zepto Data Challenge_ Intent prediction - catalog.csv")
catalog_df = catalog_df.drop_duplicates(subset=["product_variant_id"])

# -----------------------------
# LOAD LABELED SEARCH DATASET
# -----------------------------
# Columns used below: search_term, l1_preds, l2_preds, l3_preds
# (the *_preds columns hold list literals stored as strings).
labeled_df = pd.read_csv("/content/drive/MyDrive/Zepto IDC Query Classification/Zepto Data Challenge_ Intent prediction - labelled_data.csv")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Preview the labelled search terms together with their L1/L2/L3 label lists.
labeled_df

Unnamed: 0,search_term,l1_preds,l2_preds,l3_preds
0,atta maggi,"[""Packaged Food""]","[""Noodles & Vermicelli""]","[""Instant Noodles"",""Bundle""]"
1,bombay shaving company,"[""Electronics & Appliances"",""Feminine Hygiene""...","[""Grooming"",""Hair Removal For Women"",""Shaving ...","[""Trimmer"",""Razor"",""Shaving Cream"",""Face Wash""..."
2,choco fills,"[""Biscuits"",""Sweet Cravings"",""Breakfast & Sauc...","[""Chocolates"",""Breakfast Cereals"",""Creamfills""...","[""Wafer Chocolates"",""Kids' Cereal"",""Centre Fil..."
3,chocos,"[""Biscuits"",""Sweet Cravings"",""Breakfast & Sauc...","[""Cookies"",""Pastries & Cakes"",""Breakfast Cerea...","[""Choco Pie"",""Kids' Cereal""]"
4,epigamia yogurt,"[""Cold Drinks & Juices"",""Dairy, Bread & Eggs""]","[""Milk Drinks"",""Yogurts & Dahi""]","[""Smoothie"",""Dahi"",""Yogurts""]"
...,...,...,...,...
10830,colgate small toothpaste,"[""Bath & Body""]","[""Toothpaste & Mouthwash""]","[""Toothpaste""]"
10831,colgate whitening toothpaste,"[""Bath & Body""]","[""Toothpaste & Mouthwash""]","[""Toothpaste""]"
10832,cologne mask,"[""Fragrances & Grooming"",""Skincare"",""Apparel &...","[""Perfumes"",""Face Care"",""Travel Accessories"",""...","[""Body Perfume"",""Neck Pillow"",""Sheet Mask"",""Un..."
10833,color fx,"[""Makeup & Beauty"",""Fragrances & Grooming""]","[""Face Makeup"",""Hygiene Essentials"",""Nails"",""G...","[""Nail Polish Remover"",""Makeup Remover"",""Nail ..."


In [None]:
import pandas as pd
import ast

In [None]:
def flatten_for_finetuning(df):
    """Explode the labelled frame into one row per (query, level, label).

    Args:
        df: DataFrame with a ``search_term`` column and ``l1_preds`` /
            ``l2_preds`` / ``l3_preds`` columns, each holding a list literal
            stored as a string (parsed with ``ast.literal_eval``).

    Returns:
        DataFrame with columns ``input_text`` ("<level>: <query>") and
        ``target_text`` (a single label). For every source row the L1 labels
        come first, then L2, then L3.
    """
    level_columns = (("L1", "l1_preds"), ("L2", "l2_preds"), ("L3", "l3_preds"))

    records = []
    for _, entry in df.iterrows():
        term = entry['search_term']
        for prefix, column in level_columns:
            records.extend(
                {'input_text': f"{prefix}: {term}", 'target_text': label}
                for label in ast.literal_eval(entry[column])
            )

    return pd.DataFrame(records)

# Flatten the labelled data into (input_text, target_text) rows and persist it.
finetune_df = flatten_for_finetuning(labeled_df)
finetune_df.to_csv("finetune_input.csv", index=False)  # relative path: lands in the current working directory
print(finetune_df.head())


                   input_text               target_text
0              L1: atta maggi             Packaged Food
1              L2: atta maggi      Noodles & Vermicelli
2              L3: atta maggi           Instant Noodles
3              L3: atta maggi                    Bundle
4  L1: bombay shaving company  Electronics & Appliances


In [None]:
from random import sample

def make_pair_samples(df, level, num_negatives=2, seed=None):
    """Build binary (query, category) training pairs for one taxonomy level.

    Args:
        df: DataFrame with ``input_text`` ("<level>: <query>") and
            ``target_text`` (category label) columns.
        level: Level tag to select, e.g. ``"L1"``; only rows whose
            ``input_text`` starts with ``"<level>:"`` are used.
        num_negatives: Number of negative categories to sample per positive
            category of a query (capped by the number of available negatives).
        seed: Optional seed. When given, negative sampling is reproducible;
            when ``None`` the global ``random`` state is used.

    Returns:
        List of dicts with keys ``query``, ``category`` and ``label``
        (1.0 for a labelled category, 0.0 for a sampled negative). For each
        query the positives come first, followed by its negatives.
    """
    import random  # function-scope import: the notebook only imports `sample` from this module

    prefix = level + ":"
    df = df[df['input_text'].str.startswith(prefix)].copy()
    # Strip only the leading "<level>:" tag. Splitting on every ":" would
    # truncate queries that themselves contain a colon.
    df['query'] = df['input_text'].str[len(prefix):].str.strip()

    # Group all valid categories per query
    group = df.groupby('query')['target_text'].apply(list).to_dict()

    all_categories = set(df['target_text'].unique())
    rng = random.Random(seed) if seed is not None else random
    pair_data = []

    for query, pos_categories in group.items():
        # Positive samples
        pair_data.extend(
            {'query': query, 'category': pos_cat, 'label': 1.0}
            for pos_cat in pos_categories
        )

        # Negative samples. Sorting gives the sampler a stable population
        # order; raw set iteration order can change between interpreter runs.
        negatives = sorted(all_categories - set(pos_categories), key=str)
        n_neg = min(num_negatives * len(pos_categories), len(negatives))
        pair_data.extend(
            {'query': query, 'category': neg_cat, 'label': 0.0}
            for neg_cat in rng.sample(negatives, n_neg)
        )

    return pair_data


In [None]:
# Build binary (query, category) pairs for the L1 level, sampling up to
# 2 negative categories per positive category of each query.
l1_samples = make_pair_samples(finetune_df, level="L1", num_negatives=2)
l1_samples[:6]  # show a few sample pairs

[{'query': '0 calorie drink',
  'category': 'Cold Drinks & Juices',
  'label': 1.0},
 {'query': '0 calorie drink',
  'category': 'Fruits  & Vegetables',
  'label': 0.0},
 {'query': '0 calorie drink', 'category': 'Baby Care', 'label': 0.0},
 {'query': '0aneer', 'category': 'Dairy, Bread & Eggs', 'label': 1.0},
 {'query': '0aneer', 'category': 'Pharma & Wellness', 'label': 0.0},
 {'query': '0aneer', 'category': 'Sexual Wellness', 'label': 0.0}]

In [None]:
# Show a slice only: the full list holds tens of thousands of pairs and
# floods the notebook output when displayed whole.
l1_samples[:20]

[{'query': '0 calorie drink',
  'category': 'Cold Drinks & Juices',
  'label': 1.0},
 {'query': '0 calorie drink',
  'category': 'Fruits  & Vegetables',
  'label': 0.0},
 {'query': '0 calorie drink', 'category': 'Baby Care', 'label': 0.0},
 {'query': '0aneer', 'category': 'Dairy, Bread & Eggs', 'label': 1.0},
 {'query': '0aneer', 'category': 'Pharma & Wellness', 'label': 0.0},
 {'query': '0aneer', 'category': 'Sexual Wellness', 'label': 0.0},
 {'query': '0ni', 'category': 'Zepto Cafe', 'label': 1.0},
 {'query': '0ni', 'category': 'Fruits & Vegetables', 'label': 1.0},
 {'query': '0ni', 'category': 'Ice Creams & More', 'label': 0.0},
 {'query': '0ni', 'category': 'Hair Care', 'label': 0.0},
 {'query': '0ni', 'category': 'Electronics & Appliances', 'label': 0.0},
 {'query': '0ni', 'category': 'Stationery & Crafts', 'label': 0.0},
 {'query': '1 inch tape', 'category': 'Stationery & Books', 'label': 1.0},
 {'query': '1 inch tape', 'category': 'Pharma & Wellness', 'label': 1.0},
 {'query': '

In [None]:
!pip install datasets



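Superseded attempts: the cells in this section are earlier drafts of the fine-tuning code that end in errors; they are kept only for reference.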


In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Load the tokenizer for the checkpoint that is fine-tuned below.
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Your list of L1 pairs (output from make_pair_samples)
# Example:
# l1_pairs = [{'query': 'atta maggi', 'category': 'Packaged Food', 'label': 1.0}, ...]

def tokenize_example(example):
    """Tokenize one (query, category) pair and attach its integer class label.

    Args:
        example: Mapping with ``query`` and ``category`` strings and a
            ``label`` that is 0.0 or 1.0.

    Returns:
        The tokenizer output for the text pair (padded/truncated to 128
        tokens) with an added ``labels`` entry holding the label as ``int``.
    """
    tokenized = tokenizer(
        example['query'],
        example['category'],
        truncation=True,
        padding='max_length',
        max_length=128
    )

    # Without an integer `labels` entry the only target available is the float
    # `label` column, and the training run recorded in this notebook fails with
    # "Target size ... must be the same as input size".
    tokenized['labels'] = int(example['label'])
    return tokenized

# Wrap the pair dicts in a Hugging Face Dataset and tokenize them one example at a time.
hf_dataset = Dataset.from_list(l1_samples)
hf_dataset = hf_dataset.map(tokenize_example, batched=False)


Map:   0%|          | 0/66825 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

In [None]:


# Load base model with a fresh 2-way classification head
# (the recorded output confirms the classifier weights are newly initialized).
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Apply LoRA: rank-8 adapters on the modules named "q" and "v".
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],  # adjust depending on the architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# NOTE: this rebinds the global `model` that earlier held the SentenceTransformer.
model = get_peft_model(base_model, lora_config)


Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: 3 epochs, batch size 32 per device, a checkpoint
# saved every epoch, no evaluation and no external experiment reporting.
training_args = TrainingArguments(
    output_dir="./l1-lora-checkpoints",
    per_device_train_batch_size=32,
    learning_rate=2e-4,
    num_train_epochs=3,
    eval_strategy="no",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    tokenizer=tokenizer
)

# NOTE(review): the recorded run of this cell fails with
# "ValueError: Target size (torch.Size([32])) must be the same as input size
# (torch.Size([32, 2]))"; later cells add integer `labels` and a custom loss.
trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: Target size (torch.Size([32])) must be the same as input size (torch.Size([32, 2]))

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Load the tokenizer for the checkpoint that is fine-tuned below.
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assuming l1_samples is already defined
# Example: l1_samples = [{'query': '0 calorie drink', 'category': 'Cold Drinks & Juices', 'label': 1.0}, ...]

def tokenize_example(example):
    """Tokenize one (query, category) pair and attach its integer class label.

    Args:
        example: Mapping with ``query`` and ``category`` strings and a
            ``label`` that is 0.0 or 1.0.

    Returns:
        The tokenizer output for the text pair (padded/truncated to 128
        tokens) with an added ``labels`` entry holding the label as ``int``.
    """
    features = tokenizer(
        example['query'],
        example['category'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    # The label arrives as a float (0.0 / 1.0); store it as an int class index.
    features['labels'] = int(example['label'])
    return features

# Create dataset: wrap the pair dicts and tokenize them one example at a time.
hf_dataset = Dataset.from_list(l1_samples)
hf_dataset = hf_dataset.map(tokenize_example, batched=False)

Map:   0%|          | 0/66825 [00:00<?, ? examples/s]

In [None]:
# Load base model with a fresh 2-way classification head
# (the recorded output confirms the classifier weights are newly initialized).
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Apply LoRA: rank-8 adapters on the modules named "q" and "v".
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],  # adjust depending on the architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# NOTE: this rebinds the global `model`.
model = get_peft_model(base_model, lora_config)

# Training configuration: 3 epochs, batch size 32 per device, a checkpoint
# saved every epoch, no external experiment reporting.
training_args = TrainingArguments(
    output_dir="./l1-lora-checkpoints",
    per_device_train_batch_size=32,
    learning_rate=2e-4,
    num_train_epochs=3,
    eval_strategy="no",  # no evaluation pass; only a training dataset is supplied
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none"
)

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

class CustomTrainer(Trainer):
    """Trainer that computes a 2-class cross-entropy loss on the model logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the loss for one batch.

        Args:
            model: Model to run; its output must expose ``.logits``.
            inputs: Batch dict; the ``labels`` entry is popped before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra keyword arguments passed by the training loop.
                The run recorded in this notebook passes
                ``num_items_in_batch``; without this catch-all the call fails
                with a ``TypeError``. The extras are not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # For binary classification with num_labels=2
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [None]:
# Train with the custom loss; only a training dataset is supplied.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset
)

# Start training
# NOTE(review): the recorded run of this cell fails with "TypeError:
# CustomTrainer.compute_loss() got an unexpected keyword argument
# 'num_items_in_batch'", so the save below was not reached in that run.
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./l1-lora-fine-tuned")

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TypeError: CustomTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'

Working version: the cells below are the fine-tuning pipeline that trains successfully.

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Load the tokenizer for the checkpoint that is fine-tuned below.
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assuming l1_samples is already defined
# Example: l1_samples = [{'query': '0 calorie drink', 'category': 'Cold Drinks & Juices', 'label': 1.0}, ...]

def tokenize_example(example):
    """Tokenize one (query, category) pair and attach its integer class label.

    Args:
        example: Mapping with ``query`` and ``category`` strings and a
            ``label`` that is 0.0 or 1.0.

    Returns:
        The tokenizer output for the text pair (padded/truncated to 128
        tokens) with an added ``labels`` entry holding the label as ``int``.
    """
    features = tokenizer(
        example['query'],
        example['category'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    # The label arrives as a float (0.0 / 1.0); store it as an int class index.
    features['labels'] = int(example['label'])
    return features

# Create dataset: wrap the pair dicts and tokenize them one example at a time.
hf_dataset = Dataset.from_list(l1_samples)
hf_dataset = hf_dataset.map(tokenize_example, batched=False)

In [None]:


# Load base model with a fresh 2-way classification head
# (the recorded output confirms the classifier weights are newly initialized).
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Apply LoRA: rank-8 adapters on the modules named "q" and "v".
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],  # adjust depending on the architecture
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# NOTE: this rebinds the global `model`.
model = get_peft_model(base_model, lora_config)

# Training configuration: 3 epochs, batch size 32 per device, a checkpoint
# saved every epoch, no external experiment reporting.
training_args = TrainingArguments(
    output_dir="./l1-lora-checkpoints",
    per_device_train_batch_size=32,
    learning_rate=2e-4,
    num_train_epochs=3,
    eval_strategy="no",  # no evaluation pass; only a training dataset is supplied
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none"
)

# Trainer subclass with an explicit loss; **kwargs absorbs extra keyword
# arguments (such as num_items_in_batch) passed to compute_loss.
class CustomTrainer(Trainer):
    """Trainer that computes a 2-class cross-entropy loss on the model logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the batch loss, or ``(loss, outputs)`` when ``return_outputs`` is true.

        The ``labels`` entry is popped from ``inputs`` before the forward pass,
        and the loss is computed here from ``outputs.logits``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        # For binary classification with num_labels=2
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(outputs.logits.view(-1, 2), targets.view(-1))

        if return_outputs:
            return loss, outputs
        return loss

# Use the custom trainer; only a training dataset is supplied.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset
)

# Start training (the recorded run logs the training loss every 500 steps,
# falling from 0.5187 to 0.3818 by step 5000).
trainer.train()

# Save the fine-tuned model to Google Drive; a later cell reloads it from
# this same path with PeftModel.from_pretrained.
model.save_pretrained("/content/drive/MyDrive/Zepto IDC Query Classification/l1-lora-fine-tuned")

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
500,0.5187
1000,0.4505
1500,0.4284
2000,0.4191
2500,0.4161
3000,0.4041
3500,0.3962
4000,0.3857
4500,0.3821
5000,0.3818


In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

# Step 1: Load base model and tokenizer
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Step 2: Load LoRA adapter into the base model
lora_model_path = "/content/drive/MyDrive/Zepto IDC Query Classification/l1-lora-fine-tuned"
model = PeftModel.from_pretrained(base_model, lora_model_path)

# Step 3: Move the model to the GPU when one is available.
# generate_embeddings() sends its inputs to `model.device`, so a model left
# on the CPU makes the whole embedding pass run on the CPU.
# NOTE(review): the recorded embedding run (3.91 s per batch of 32) looks
# like CPU execution — confirm after this change.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): MPNetForSequenceClassification(
      (mpnet): MPNetModel(
        (embeddings): MPNetEmbeddings(
          (word_embeddings): Embedding(30527, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): MPNetEncoder(
          (layer): ModuleList(
            (0-11): 12 x MPNetLayer(
              (attention): MPNetAttention(
                (attn): MPNetSelfAttention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_feature

In [None]:
import pandas as pd

In [None]:
# Reload the product catalog and keep one row per product_variant_id.
catalog_df = pd.read_csv("/content/drive/MyDrive/Zepto IDC Query Classification/Zepto Data Challenge_ Intent prediction - catalog.csv")
catalog_df = catalog_df.drop_duplicates(subset=["product_variant_id"])


In [None]:
# One row per distinct (product_name, l1_name) pair with both values present.
l1_df = catalog_df[['product_name', 'l1_name']].dropna().drop_duplicates()

# Combine product and category context: each text is
# "L1: <product_name> | <l1_name>", so the category name is part of the
# string that gets embedded and indexed.
l1_inputs = [f"L1: {product} | {cat}" for product, cat in zip(l1_df['product_name'], l1_df['l1_name'])]


In [None]:
# Unique L1 category names, in order of first appearance in the catalog.
# NOTE(review): `l1_categories` was not assigned anywhere in the notebook, so
# this cell raised NameError on a fresh kernel. This definition is
# reconstructed from `l1_df` — confirm it matches the original intent.
l1_categories = l1_df['l1_name'].unique().tolist()
l1_categories

['Baby Care',
 'Kitchen & Dining',
 'Frozen Food',
 'Stationery & Books',
 'Fruits & Vegetables',
 'Cleaning Essentials',
 'Dairy, Bread & Eggs',
 'Packaged Food',
 'Munchies',
 'Meats, Fish & Eggs',
 'Zepto Cafe',
 'Biscuits',
 'Bath & Body',
 'Tea, Coffee & More',
 'Fragrances & Grooming',
 'Cold Drinks & Juices',
 'Paan Corner']

In [None]:
# Show a slice only: displaying every catalog text floods the notebook output.
l1_inputs[:20]

['L1: himalaya baby lotion | Baby Care',
 "L1: johnson's baby lotion | Baby Care",
 'L1: cetaphil baby daily lotion | Baby Care',
 'L1: sebamed baby body lotion, for all skin types | Baby Care',
 'L1: himalaya baby lotion with pure cow ghee & ph 5.5 | Baby Care',
 'L1: parachute advansed baby lotion for new born babies virgin coconut oil & coconut milk | Baby Care',
 "L1: johnson's baby lotion bmr ecom | Baby Care",
 'L1: tedibar family atogla lotion for baby | Baby Care',
 'L1: avn baby daily moisture lotion | Baby Care',
 'L1: vesta homes round wooden 360 degree rotating cake stand, dessert platter|dia-10.5 inch, h- 3 inch | Kitchen & Dining',
 'L1: meatigo frozen chicken boneless breast | Frozen Food',
 'L1: meatzza fresh boneless chicken breast | Frozen Food',
 'L1: sneha supercool fz breast boneless | Frozen Food',
 'L1: meatigo everyday herb & lime chicken breast (frozen) | Frozen Food',
 'L1: measuring spoon | Kitchen & Dining',
 'L1: anjali measuring spoon | Kitchen & Dining',


In [None]:
import torch
from tqdm import tqdm

def generate_embeddings(text_list, tokenizer, model, batch_size=32, max_length=128):
    """Embed texts with the first-token hidden state of the model's encoder.

    Args:
        text_list: Sequence of strings to embed.
        tokenizer: Callable that accepts a list of strings plus
            ``return_tensors``/``padding``/``truncation``/``max_length`` and
            returns a mapping with ``input_ids`` and ``attention_mask`` tensors.
        model: Model exposing ``eval()``, ``device`` and
            ``base_model.base_model``; the latter is called as the encoder.
            (In this notebook it is the PEFT-wrapped sequence classifier;
            presumably the attribute chain resolves to the bare transformer
            encoder — confirm if the wrapper changes.)
        batch_size: Number of texts per forward pass.
        max_length: Truncation length in tokens.

    Returns:
        CPU tensor with one row per input text, taken from position 0 of
        ``last_hidden_state``. An empty ``text_list`` yields an empty tensor
        of shape ``(0, 0)`` instead of the ``torch.cat`` error on an empty list.
    """
    if len(text_list) == 0:
        return torch.empty((0, 0))

    model.eval()
    # Resolve the encoder and device once; they do not change between batches.
    encoder = model.base_model.base_model
    device = model.device
    embeddings = []

    with torch.no_grad():
        for start in tqdm(range(0, len(text_list), batch_size)):
            encoded = tokenizer(
                text_list[start:start + batch_size],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length
            )

            # Only last_hidden_state is read, so the per-layer hidden states
            # are not requested.
            outputs = encoder(
                input_ids=encoded["input_ids"].to(device),
                attention_mask=encoded["attention_mask"].to(device),
                return_dict=True
            )

            # Use the first ([CLS]-position) token as the text embedding.
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())

    return torch.cat(embeddings, dim=0)


In [None]:
# Embed every catalog text with the fine-tuned model.
# The recorded run processed 242 batches in about 15 minutes 45 seconds.
l1_embeddings = generate_embeddings(
    text_list=l1_inputs,
    tokenizer=tokenizer,
    model=model,
    batch_size=32
)


100%|██████████| 242/242 [15:45<00:00,  3.91s/it]


In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import faiss
import numpy as np

# Normalize for cosine similarity: rows are scaled to unit L2 norm so that the
# inner product computed by the index equals cosine similarity.
l1_embeddings_np = l1_embeddings.numpy()
# Alternative: load previously saved embeddings instead of recomputing them.
# l1_embeddings_np= np.load("/content/drive/MyDrive/Zepto IDC Query Classification/l1_from_fine_tuned.npy")
l1_embeddings_np = l1_embeddings_np / np.linalg.norm(l1_embeddings_np, axis=1, keepdims=True)

# Build FAISS index: exact (flat) inner-product search over all catalog texts.
# Row i of the index corresponds to l1_inputs[i].
index = faiss.IndexFlatIP(l1_embeddings_np.shape[1])
index.add(l1_embeddings_np)


In [None]:
# Sanity check: (number of indexed texts, embedding dimension).
# Replaces a stray `l1_` stub that raised NameError when the notebook was run top to bottom.
l1_embeddings_np.shape

In [None]:
def retrieve_top_k(query, k=10):
    """Return the catalog texts most similar to ``query``.

    Relies on the notebook globals ``tokenizer``, ``model``, ``index`` and
    ``l1_inputs`` (row ``i`` of ``index`` corresponds to ``l1_inputs[i]``).

    Args:
        query: Raw search string; it is prefixed with ``"L1: "`` before embedding.
        k: Maximum number of neighbours to return.

    Returns:
        List of ``(catalog_text, score)`` tuples in the order returned by the
        index. Fewer than ``k`` items are returned when the index holds fewer
        entries; an empty list is returned when ``k <= 0``.
    """
    if k <= 0:
        return []

    formatted = f"L1: {query}"
    query_emb = generate_embeddings([formatted], tokenizer, model)
    query_emb = query_emb / query_emb.norm(dim=1, keepdim=True)  # Normalize

    D, I = index.search(query_emb.numpy(), k)  # FAISS expects np

    # FAISS fills missing neighbours with id -1; indexing l1_inputs[-1] would
    # silently return the last catalog entry, so those slots are skipped.
    return [
        (l1_inputs[i], float(score))
        for score, i in zip(D[0], I[0])
        if i >= 0
    ]


In [None]:
import pandas as pd

# Persist the catalog texts, in index order, as a single-column CSV on
# Google Drive. No column name is given, so pandas assigns the default.
df = pd.DataFrame(l1_inputs)
df.to_csv("/content/drive/MyDrive/Zepto IDC Query Classification/l1_inputs.csv", index=False)


In [None]:
# Smoke-test retrieval with a single query and print "[score] catalog text".
# NOTE(review): in the recorded run all ten hits for this ghee query are
# "Munchies" products, which suggests the embeddings do not yet separate
# categories well — worth checking before relying on the index.
query = "sri sri ghee"
top_results = retrieve_top_k(query, k=10)

print(f"\nTop results for: '{query}'")
for score_pair in top_results:
    # score_pair is (catalog_text, similarity_score)
    print(f"[{score_pair[1]:.4f}] {score_pair[0]}")


100%|██████████| 1/1 [00:00<00:00,  6.19it/s]


Top results for: 'sri sri ghee'
[0.7246] L1: namaskaram bhavnagri gathiya | Munchies
[0.7197] L1: veerabhadra green batana | Munchies
[0.7195] L1: namaskaram surti gathiya | Munchies
[0.7132] L1: namaskaram roasted yellow chana | Munchies
[0.7125] L1: veerabhadra namkeen | Munchies
[0.7099] L1: veerabhadra chakodi | Munchies
[0.7097] L1: too yumm! aloo bhujia | no palm oil | 35% less saturated fat | Munchies
[0.7072] L1: gopal namkeen dal moth | Munchies
[0.7071] L1: namaskaram masala chana dal | Munchies
[0.7058] L1: gwalia bhatha kani | Munchies



