In [3]:
!pip install transformers



In [4]:
!pip install peft



In [5]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

In [6]:

# -----------------------------
# LOAD & PREPARE DATA
# -----------------------------
# Catalog: read the CSV and keep one row per product variant id.
catalog_df = pd.read_csv(
    "/teamspace/studios/this_studio/Zepto Data Challenge_ Intent prediction - catalog.csv"
).drop_duplicates(subset=["product_variant_id"])

# Labeled search terms.
labeled_df = pd.read_csv(
    "/teamspace/studios/this_studio/Zepto Data Challenge_ Intent prediction - labelled_data.csv"
)

# Format L1 inputs: "L1: <product_name> | <l1_name>" for the catalog,
# "L1: <search_term>" for the labeled queries.
catalog_df["l1_input"] = catalog_df.apply(
    lambda record: f"L1: {record['product_name']} | {record['l1_name']}",
    axis=1,
)
labeled_df["l1_query"] = labeled_df["search_term"].map(lambda term: f"L1: {term}")

# -----------------------------
# LOAD MODEL & TOKENIZER
# -----------------------------
model_path = "/teamspace/studios/this_studio/l123-lora-fine-tuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Base checkpoint with a 2-label classification head; the PEFT adapter stored
# at `model_path` is then loaded on top of it.
base_model = AutoModelForSequenceClassification.from_pretrained(
    "sentence-transformers/all-mpnet-base-v2",
    num_labels=2,
)
model = PeftModel.from_pretrained(base_model, model_path)

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
model = model.eval()

# -----------------------------
# EMBEDDING FUNCTION
# -----------------------------
def generate_embeddings(text_list, tokenizer, model, batch_size=32, max_length=128):
    """Encode texts in batches and return their first-token (CLS) embeddings.

    Args:
        text_list: Sequence of strings to embed.
        tokenizer: Tokenizer called on each batch with padding and truncation.
        model: Wrapped model; the encoder is reached via
            ``model.base_model.base_model``.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        A CPU ``torch.Tensor`` with one row per input text, holding
        ``last_hidden_state[:, 0, :]`` of every batch concatenated along dim 0.
        An empty ``text_list`` yields an empty ``(0, 0)`` tensor instead of
        failing inside ``torch.cat``.
    """
    # Use the device the model's weights already live on, so the function does
    # not depend on a notebook-level `device` global being defined and in sync.
    model_device = next(model.parameters()).device

    # NOTE(review): presumably this resolves to the bare transformer encoder
    # underneath the PEFT wrapper and the classification head — confirm.
    encoder = model.base_model.base_model

    batch_embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(text_list), batch_size)):
            batch = list(text_list[start:start + batch_size])
            encoded = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )

            # Only `last_hidden_state` is read below, so the per-layer hidden
            # states are not requested (saves memory on every batch).
            outputs = encoder(
                input_ids=encoded["input_ids"].to(model_device),
                attention_mask=encoded["attention_mask"].to(model_device),
                return_dict=True,
            )

            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            # Move each batch to the CPU right away so results do not pile up
            # on the accelerator.
            batch_embeddings.append(cls_embeddings.cpu())

    if not batch_embeddings:
        # torch.cat raises on an empty list; return an empty tensor instead.
        return torch.empty((0, 0))
    return torch.cat(batch_embeddings, dim=0)

# -----------------------------
# GENERATE EMBEDDINGS
# -----------------------------
# Plain Python lists of the formatted L1 strings: labeled queries and catalog
# entries. No embedding is computed in this cell.
query_l1_inputs = labeled_df["l1_query"].to_list()
catalog_l1_inputs = catalog_df["l1_input"].to_list()

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
import ast
import re


def parse_list_string(text):
    """Turn a stringified list (e.g. ``'["a", "b"]'``) into a Python list.

    Args:
        text: Cell value to parse. May be a string, a missing value
            (``None``/``NaN``), or an already-parsed list/tuple/set.

    Returns:
        list: The parsed items. Missing or empty input gives ``[]``. Text that
        is not a valid Python literal falls back to collecting every
        double-quoted substring (``[]`` when there is none). A literal that is
        not a list/tuple/set is wrapped in a one-element list, so callers can
        always iterate over the result item by item.
    """
    # Already parsed: makes the cell that applies this function re-runnable
    # (pd.isna on a list returns an array, whose truth value is ambiguous).
    if isinstance(text, (list, tuple, set)):
        return list(text)

    if pd.isna(text) or text == '':
        return []

    try:
        # Handle cases where the list is represented as a string
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Malformed literal: extract the strings between double quotes.
        # `re` is imported above; the original relied on it without importing it.
        return re.findall(r'"([^"]*)"', text)

    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    # A bare literal such as '"Baby Care"' would otherwise be iterated
    # character by character downstream.
    return [parsed]

In [25]:
# Convert the stringified prediction lists of every level into Python lists.
for pred_column in ('l1_preds', 'l2_preds', 'l3_preds'):
    labeled_df[pred_column] = labeled_df[pred_column].apply(parse_list_string)

In [26]:
# -----------------------------
# PROCESS L1 CATEGORIES
# -----------------------------
print("Processing L1 categories...")

# Catalog side: one row per catalog product with its L1 category.
catalog_l1_df = (
    catalog_df[['product_variant_id', 'product_name', 'l1_name']]
    .rename(columns={'l1_name': 'category'})
    .assign(source='catalog', level='L1')
)

# Labeled side: one row per (search term, predicted L1 category) pair.
labeled_l1_rows = [
    {
        'product_name': term,
        'category': l1,
        'source': 'labeled',
        'level': 'L1'
    }
    for term, l1_list in zip(labeled_df['search_term'], labeled_df['l1_preds'])
    for l1 in l1_list
]
labeled_l1_df = pd.DataFrame(labeled_l1_rows)

# Stack catalog rows on top of labeled rows, restricted to the shared columns.
l1_columns = ['product_name', 'category', 'source', 'level']
all_l1_df = pd.concat([catalog_l1_df[l1_columns], labeled_l1_df[l1_columns]])

# Text template used for the L1 embeddings
def format_l1_text(row):
    """Render a row as "passage: L1 <product_name> | L1 <category>".

    `row` is anything indexable by the keys 'product_name' and 'category'.
    """
    name = row['product_name']
    category = row['category']
    return f"passage: L1 {name} | L1 {category}"

# Format every combined L1 row, then materialise the result as a Python list.
l1_text_series = all_l1_df.apply(format_l1_text, axis=1)
all_l1_texts = l1_text_series.tolist()



Processing L1 categories...


In [27]:
# Display the combined L1 frame (catalog rows followed by labeled rows).
all_l1_df

Unnamed: 0,product_name,category,source,level
0,himalaya baby lotion,Baby Care,catalog,L1
1,himalaya baby lotion,Baby Care,catalog,L1
2,johnson's baby lotion,Baby Care,catalog,L1
3,cetaphil baby daily lotion,Baby Care,catalog,L1
4,"sebamed baby body lotion, for all skin types",Baby Care,catalog,L1
...,...,...,...,...
22310,cologne mask,Skincare,labeled,L1
22311,cologne mask,Apparel & Lifestyle,labeled,L1
22312,color fx,Makeup & Beauty,labeled,L1
22313,color fx,Fragrances & Grooming,labeled,L1


In [28]:
# Write the combined L1 frame to AllL1.csv in the working directory.
# NOTE(review): all_l1_df is built with pd.concat without ignore_index, so the
# index labels presumably repeat across the catalog and labeled parts — align
# with embedding rows by position rather than by index label; confirm.
all_l1_df.to_csv("AllL1.csv")

In [29]:
# -----------------------------
# PROCESS L2 CATEGORIES
# -----------------------------
print("Processing L2 categories...")

# Catalog side: product rows tagged with their L2 category.
catalog_l2_df = catalog_df.loc[:, ['product_variant_id', 'product_name', 'l2_name']].copy()
catalog_l2_df = catalog_l2_df.rename(columns={'l2_name': 'category'})
catalog_l2_df['source'] = 'catalog'
catalog_l2_df['level'] = 'L2'

# Labeled side: expand each search term into one row per predicted L2 category.
labeled_l2_rows = []
for search_term, l2_list in zip(labeled_df['search_term'], labeled_df['l2_preds']):
    labeled_l2_rows.extend(
        {
            'product_name': search_term,
            'category': l2,
            'source': 'labeled',
            'level': 'L2'
        }
        for l2 in l2_list
    )

labeled_l2_df = pd.DataFrame(labeled_l2_rows)

# Combine catalog and labeled L2 data on the columns both sides share.
l2_columns = ['product_name', 'category', 'source', 'level']
all_l2_df = pd.concat([catalog_l2_df[l2_columns], labeled_l2_df[l2_columns]])

# Text template used for the L2 embeddings
def format_l2_text(row):
    """Render a row as "passage: L2 <product_name> | L2 <category>".

    `row` is anything indexable by the keys 'product_name' and 'category'.
    """
    name = row['product_name']
    category = row['category']
    return f"passage: L2 {name} | L2 {category}"

# Format every combined L2 row, then materialise the result as a Python list.
l2_text_series = all_l2_df.apply(format_l2_text, axis=1)
all_l2_texts = l2_text_series.tolist()

Processing L2 categories...


In [30]:
# Display the combined L2 frame (catalog rows followed by labeled rows).
all_l2_df

Unnamed: 0,product_name,category,source,level
0,himalaya baby lotion,Baby Skin & Hair Care,catalog,L2
1,himalaya baby lotion,Baby Skin & Hair Care,catalog,L2
2,johnson's baby lotion,Baby Skin & Hair Care,catalog,L2
3,cetaphil baby daily lotion,Baby Skin & Hair Care,catalog,L2
4,"sebamed baby body lotion, for all skin types",Baby Skin & Hair Care,catalog,L2
...,...,...,...,...
25221,color fx,Nails,labeled,L2
25222,color fx,Gifting,labeled,L2
25223,color fx,Eye Makeup,labeled,L2
25224,colored a4 sheets,Crafts & Hobby,labeled,L2


In [37]:
# Write the combined L2 frame to AllL2.csv in the working directory.
# NOTE(review): all_l2_df is built with pd.concat without ignore_index, so the
# index labels presumably repeat across the catalog and labeled parts — align
# with embedding rows by position rather than by index label; confirm.
all_l2_df.to_csv("AllL2.csv")

In [31]:
# -----------------------------
# PROCESS L3 CATEGORIES
# -----------------------------
print("Processing L3 categories...")

# Catalog side: one row per catalog product with its L3 category.
catalog_l3_df = (
    catalog_df[['product_variant_id', 'product_name', 'l3_name']]
    .rename(columns={'l3_name': 'category'})
    .assign(source='catalog', level='L3')
)

# Labeled side: one row per (search term, predicted L3 category) pair.
labeled_l3_rows = [
    {
        'product_name': term,
        'category': l3,
        'source': 'labeled',
        'level': 'L3'
    }
    for term, l3_list in zip(labeled_df['search_term'], labeled_df['l3_preds'])
    for l3 in l3_list
]
labeled_l3_df = pd.DataFrame(labeled_l3_rows)

# Stack catalog rows on top of labeled rows, restricted to the shared columns.
l3_columns = ['product_name', 'category', 'source', 'level']
all_l3_df = pd.concat([catalog_l3_df[l3_columns], labeled_l3_df[l3_columns]])

# Text template used for the L3 embeddings
def format_l3_text(row):
    """Render a row as "passage: L3 <product_name> | L3 <category>".

    `row` is anything indexable by the keys 'product_name' and 'category'.

    The level tag is followed by a space, matching the L1/L2 templates
    ("passage: L1 <name> | L1 <category>"). The previous template glued the
    tag to the text ("L3himalaya ...").
    NOTE(review): if the adapter was trained on the unspaced form, revert this
    change — confirm against the training script.
    """
    return f"passage: L3 {row['product_name']} | L3 {row['category']}"

# Format every combined L3 row, then materialise the result as a Python list.
l3_text_series = all_l3_df.apply(format_l3_text, axis=1)
all_l3_texts = l3_text_series.tolist()


Processing L3 categories...


In [32]:
# Display the combined L3 frame (catalog rows followed by labeled rows).
all_l3_df

Unnamed: 0,product_name,category,source,level
0,himalaya baby lotion,Baby Lotion,catalog,L3
1,himalaya baby lotion,Baby Lotion,catalog,L3
2,johnson's baby lotion,Baby Lotion,catalog,L3
3,cetaphil baby daily lotion,Baby Lotion,catalog,L3
4,"sebamed baby body lotion, for all skin types",Baby Lotion,catalog,L3
...,...,...,...,...
32936,color fx,Cuticle Scrub,labeled,L3
32937,color fx,Eye Liner,labeled,L3
32938,color fx,Anti Nail Biting Polish,labeled,L3
32939,colored a4 sheets,Craft paper,labeled,L3


In [36]:
# Write the combined L3 frame to AllL3.csv in the working directory.
# NOTE(review): all_l3_df is built with pd.concat without ignore_index, so the
# index labels presumably repeat across the catalog and labeled parts — align
# with embedding rows by position rather than by index label; confirm.
all_l3_df.to_csv("AllL3.csv")

In [33]:

# -----------------------------
# LOAD & PREPARE DATA
# -----------------------------
# NOTE(review): this cell repeats the load/prepare/model code of the first
# setup cell (In [6]). Running it rebinds `catalog_df` and `labeled_df` to
# freshly read frames, so the list parsing applied to
# labeled_df['l1_preds'/'l2_preds'/'l3_preds'] in In [25] is not present on the
# new `labeled_df`; re-run that cell before rebuilding the L1/L2/L3 frames.
catalog_df = pd.read_csv("/teamspace/studios/this_studio/Zepto Data Challenge_ Intent prediction - catalog.csv")
# Keep one row per product variant id.
catalog_df = catalog_df.drop_duplicates(subset=["product_variant_id"])

labeled_df = pd.read_csv("/teamspace/studios/this_studio/Zepto Data Challenge_ Intent prediction - labelled_data.csv")

# Format L1 inputs
catalog_df["l1_input"] = catalog_df.apply(lambda row: f"L1: {row['product_name']} | {row['l1_name']}", axis=1)
labeled_df["l1_query"] = labeled_df["search_term"].apply(lambda x: f"L1: {x}")

# -----------------------------
# LOAD MODEL & TOKENIZER
# -----------------------------
# The tokenizer and the PEFT adapter are both read from `model_path`; the base
# weights come from the "sentence-transformers/all-mpnet-base-v2" checkpoint.
model_path = "/teamspace/studios/this_studio/l123-lora-fine-tuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)

base_model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-mpnet-base-v2", num_labels=2)
model = PeftModel.from_pretrained(base_model, model_path)
# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device and switch it to evaluation mode.
model = model.to(device).eval()


Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
def generate_embeddings(text_list, tokenizer, model, batch_size=32, max_length=128):
    """Encode texts in batches and return their first-token (CLS) embeddings.

    This redefines the function of the same name from the first setup cell
    (In [6]); whichever definition ran last is the one in effect.

    Args:
        text_list: Sequence of strings to embed.
        tokenizer: Tokenizer called on each batch with padding and truncation.
        model: Wrapped model; the encoder is reached via
            ``model.base_model.base_model``.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        A CPU ``torch.Tensor`` with one row per input text, holding
        ``last_hidden_state[:, 0, :]`` of every batch concatenated along dim 0.
        An empty ``text_list`` yields an empty ``(0, 0)`` tensor instead of
        failing inside ``torch.cat``.
    """
    # Use the device the model's weights already live on, so the function does
    # not depend on a notebook-level `device` global being defined and in sync.
    model_device = next(model.parameters()).device

    # NOTE(review): presumably this resolves to the bare transformer encoder
    # underneath the PEFT wrapper and the classification head — confirm.
    encoder = model.base_model.base_model

    batch_embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(text_list), batch_size)):
            batch = list(text_list[start:start + batch_size])
            encoded = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )

            # Only `last_hidden_state` is read below, so the per-layer hidden
            # states are not requested (saves memory on every batch).
            outputs = encoder(
                input_ids=encoded["input_ids"].to(model_device),
                attention_mask=encoded["attention_mask"].to(model_device),
                return_dict=True,
            )

            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            # Move each batch to the CPU right away so results do not pile up
            # on the accelerator.
            batch_embeddings.append(cls_embeddings.cpu())

    if not batch_embeddings:
        # torch.cat raises on an empty list; return an empty tensor instead.
        return torch.empty((0, 0))
    return torch.cat(batch_embeddings, dim=0)


In [35]:
# Assumes `format_l1_text`, `format_l2_text`, `format_l3_text`, the `all_l*_df`
# frames, `generate_embeddings`, `tokenizer` and `model` were defined by
# earlier cells.
# Rebuild the text lists so this cell does not rely on stale `all_l*_texts`.
all_l1_texts = all_l1_df.apply(format_l1_text, axis=1).tolist()
all_l2_texts = all_l2_df.apply(format_l2_text, axis=1).tolist()
all_l3_texts = all_l3_df.apply(format_l3_text, axis=1).tolist()

# Each level is embedded, converted to NumPy and written to disk before the
# next level starts. The passes are slow, so interrupting a later one no
# longer discards the levels that already finished.

# --- L1 ---
l1_embeddings = generate_embeddings(all_l1_texts, tokenizer, model).cpu().numpy()
assert len(l1_embeddings) == len(all_l1_texts), "L1: one embedding row per text expected"
l1_df = pd.DataFrame(l1_embeddings)
l1_df.to_csv("l1_embeddings.csv", index=False)

# --- L2 ---
l2_embeddings = generate_embeddings(all_l2_texts, tokenizer, model).cpu().numpy()
assert len(l2_embeddings) == len(all_l2_texts), "L2: one embedding row per text expected"
l2_df = pd.DataFrame(l2_embeddings)
l2_df.to_csv("l2_embeddings.csv", index=False)

# --- L3 ---
l3_embeddings = generate_embeddings(all_l3_texts, tokenizer, model).cpu().numpy()
assert len(l3_embeddings) == len(all_l3_texts), "L3: one embedding row per text expected"
l3_df = pd.DataFrame(l3_embeddings)
l3_df.to_csv("l3_embeddings.csv", index=False)

# The CSVs written above can be loaded later for further analysis.


  1%|          | 10/976 [00:11<18:17,  1.14s/it]


KeyboardInterrupt: 

In [27]:
import numpy as np

# Write each level's embedding matrix to its own .npy file.
for npy_path, embedding_matrix in (
    ("all_l1_embeddings.npy", l1_embeddings),
    ("all_l2_embeddings.npy", l2_embeddings),
    ("all_l3_embeddings.npy", l3_embeddings),
):
    np.save(npy_path, embedding_matrix)

# Report how many rows were saved per level.
l1_count = len(l1_embeddings)
l2_count = len(l2_embeddings)
l3_count = len(l3_embeddings)
print(f"Saved embeddings: L1={l1_count}, L2={l2_count}, L3={l3_count}")
print("Complete!")

Saved embeddings: L1=31207, L2=34118, L3=41833
Complete!
