In [3]:
# ONE-TIME: install dependencies (uncomment and run if needed)
# !pip install -q sentence-transformers transformers accelerate sklearn torch torchvision tqdm joblib

# -------------------------
# Full script: Qwen-0.6B embeddings + multimodal MLP training
# -------------------------
import os
from pathlib import Path
import time
import random
import math

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

from sentence_transformers import SentenceTransformer

# -------------------------
# 0. Paths & config (EDIT as needed)
# -------------------------
CSV_PATH = "/content/drive/MyDrive/AMAZON25/final_updated_csv_with_value_unit_150.csv"
IMG_EMB_PATH = "/content/drive/MyDrive/AMAZON25/train_embeddings_marqoB.npz"
#OUT_DIR = "/content/drive/MyDrive/AMAZON25/multimodal_model_qwen06"
#OUT_DIR.mkdir(parents=True, exist_ok=True)

# explicit drive copy folder for embeddings (guaranteed save location)
#DRIVE_EMB_DIR = "/content/drive/MyDrive/AMAZON25/embeddings_qwen06"
#DRIVE_EMB_DIR.mkdir(parents=True, exist_ok=True)

# Choose model id (Qwen 0.6B)
# QWEN_SMALL_ID = "Qwen/Qwen3-Embedding-B"   # change if you have a different small Qwen id
# FALLBACK_MODEL = "sentence-transformers/all-mpnet-base-v2"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)
print("Torch version:", torch.__version__)

# Embedding / model hyperparams
ENCODE_BATCH = 64      # change down if OOM
PROJ_DIM = 256
CAT_EMB_DIM = 32
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

Device: cuda
Torch version: 2.8.0+cu126


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# -------------------------
# 1. Load CSV & clean
# -------------------------
df = pd.read_csv(CSV_PATH)
print("CSV columns:", df.columns.tolist())

# required
if "item_name" not in df.columns or "price" not in df.columns:
    raise ValueError("CSV must contain 'item_name' and 'price' columns")

df = df.dropna(subset=["item_name", "price"]).reset_index(drop=True)
df["item_name"] = df["item_name"].astype(str).str.strip()
df["bullet_points"] = df.get("bullet_points", "").astype(str)
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df = df.dropna(subset=["price"]).reset_index(drop=True)
df["log_price"] = np.log1p(df["price"].clip(lower=0.0))

if "sample_id" not in df.columns:
    df["sample_id"] = df.index.astype(str)
df["sample_id"] = df["sample_id"].astype(str)

if "value" not in df.columns:
    df["value"] = 0.0
if "unit" not in df.columns:
    df["unit"] = "UNKNOWN"

# -------------------------
# 2. Load & inspect image embeddings (.npz)
# -------------------------
npz = np.load(IMG_EMB_PATH, allow_pickle=True)
print("NPZ keys:", list(npz.keys()))

# find main 2D embeddings array
img_emb_arr = None
candidate_key = None
for k in npz.files:
    a = npz[k]
    if isinstance(a, np.ndarray) and a.ndim == 2:
        img_emb_arr = a
        candidate_key = k
        break
if img_emb_arr is None:
    raise ValueError("Could not find a 2D array of embeddings inside the .npz file. Inspect the keys.")

print(f"Using '{candidate_key}' as image embeddings. shape={img_emb_arr.shape}")
IMAGE_DIM = img_emb_arr.shape[1]

# detect sample ids array inside npz (optional)
img_sample_ids = None
for k in npz.files:
    a = npz[k]
    if isinstance(a, np.ndarray) and a.ndim == 1 and a.shape[0] == img_emb_arr.shape[0]:
        if a.dtype.type is np.str_ or np.issubdtype(a.dtype, np.integer):
            img_sample_ids = a.astype(str).tolist()
            print("Found sample ids key in npz:", k)
            break

if img_sample_ids is None:
    if img_emb_arr.shape[0] == len(df):
        print("No sample_ids in npz but row count equals CSV length — assuming same order.")
        img_sample_ids = df["sample_id"].astype(str).tolist()
    else:
        print("WARNING: image embeddings length != CSV rows. You must ensure mapping correctness.")
        img_sample_ids = [str(i) for i in range(img_emb_arr.shape[0])]

sampleid_to_img_idx = {sid: i for i, sid in enumerate(img_sample_ids)}


CSV columns: ['sample_id', 'item_name', 'bullet_points', 'value_unit', 'value', 'unit', 'image_link', 'price']
NPZ keys: ['embeddings', 'sample_ids']
Using 'embeddings' as image embeddings. shape=(74328, 768)
Found sample ids key in npz: sample_ids


In [4]:
# Run in Colab (install once)
# !pip install -q transformers accelerate datasets peft sentence-transformers safetensors

import os
import time
import math
import random
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model, PeftModel

# ----------------- CONFIG -----------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

MODEL_ID = "Qwen/Qwen3-Embedding-4B"   # embedding model
CSV_PATH = "/content/drive/MyDrive/AMAZON25/final_updated_csv_with_value_unit_150.csv"  # your train CSV
OUT_DIR = Path("/content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned")
OUT_DIR.mkdir(parents=True, exist_ok=True)

BATCH_SIZE = 64        # adjust to your GPU mem
MAX_LEN = 48
EPOCHS = 3
LR_ENCODER = 2e-5
LR_HEAD = 1e-3
WEIGHT_DECAY = 1e-6
WARMUP_STEPS = 200

# LoRA config
LORA_R = 8
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
# target modules: many Qwen-like models use 'q_proj','k_proj','v_proj','o_proj' or 'dense'
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "dense", "qkv_proj", "query_key_value"]

# ----------------- Load data -----------------
df = pd.read_csv(CSV_PATH)
df = df.dropna(subset=["item_name", "price"]).reset_index(drop=True)
df["item_name"] = df["item_name"].astype(str).str.strip()
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df = df.dropna(subset=["price"]).reset_index(drop=True)
df["log_price"] = np.log1p(df["price"].clip(lower=0.0))

# train/val split
from sklearn.model_selection import train_test_split
price_q = pd.qcut(df["log_price"], q=10, duplicates="drop")
train_df, val_df = train_test_split(df, test_size=0.1, random_state=SEED, stratify=price_q)
print("sizes:", len(train_df), len(val_df))

# use item_name only for fine-tuning
train_texts = train_df["item_name"].astype(str).tolist()
train_labels = train_df["log_price"].astype(np.float32).values
val_texts = val_df["item_name"].astype(str).tolist()
val_labels = val_df["log_price"].astype(np.float32).values

# ----------------- Tokenizer & Base Model (encoder) -----------------
print("Loading tokenizer & base encoder:", MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
encoder = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True).to(DEVICE)

# IMPORTANT: disable cache (prevents some generation-related calls)
if hasattr(encoder.config, "use_cache"):
    encoder.config.use_cache = False

hidden_size = encoder.config.hidden_size
print("Hidden size:", hidden_size)

# ----------------- Wrap encoder with LoRA (PEFT) -----------------
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="FEATURE_EXTRACTION",  # embedding usage
)
encoder = get_peft_model(encoder, lora_config)
encoder.print_trainable_parameters()

# ----------------- Regression head wrapper -----------------
class RegModel(nn.Module):
    def __init__(self, encoder, hidden_size):
        super().__init__()
        self.encoder = encoder
        self.head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size // 2, 1)
        )
    def forward(self, input_ids, attention_mask):
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        cls = out.last_hidden_state[:, 0, :]        # CLS pooling
        cls = nn.functional.normalize(cls, p=2, dim=1)
        pred = self.head(cls).squeeze(1)
        return pred, cls

model = RegModel(encoder, hidden_size).to(DEVICE)

# Only LoRA adapters + head should be trainable. Confirm:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable} / {total}")

# ----------------- Dataset & DataLoader -----------------
class NamePriceDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=MAX_LEN):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        tok = self.tokenizer(text, padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")
        item = {k: v.squeeze(0) for k, v in tok.items()}
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

train_ds = NamePriceDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_ds = NamePriceDataset(val_texts, val_labels, tokenizer, MAX_LEN)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ----------------- Optimizer & scheduler -----------------
# separate groups: encoder params (LoRA) with small lr, head with larger lr
encoder_params = [p for n,p in model.named_parameters() if not n.startswith("head.")]
head_params = [p for n,p in model.named_parameters() if n.startswith("head.")]
optimizer = AdamW([
    {"params": encoder_params, "lr": LR_ENCODER, "weight_decay": WEIGHT_DECAY},
    {"params": head_params, "lr": LR_HEAD, "weight_decay": WEIGHT_DECAY}
])
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)
loss_fn = nn.MSELoss()

# ----------------- Training loop -----------------
# ----------------- Checkpointing setup (put before training loop) -----------------
ckpt_dir = OUT_DIR / "checkpoints"
ckpt_dir.mkdir(parents=True, exist_ok=True)
ckpt_latest = ckpt_dir / "checkpoint_latest.pt"

# Helper save function (atomic)
def save_checkpoint(path, epoch, model, optimizer, scheduler, best_smape):
    tmp = str(path) + ".tmp"
    ckpt = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),   # includes LoRA adapter params + head
        "optimizer_state_dict": optimizer.state_dict(),
        "scheduler_state_dict": scheduler.state_dict() if scheduler is not None else None,
        "best_smape": best_smape,
    }
    torch.save(ckpt, tmp)
    os.replace(tmp, str(path))

# Resume if checkpoint exists
start_epoch = 0
if ckpt_latest.exists():
    print("Found checkpoint, loading:", ckpt_latest)
    ckpt = torch.load(str(ckpt_latest), map_location=DEVICE)
    try:
        model.load_state_dict(ckpt["model_state_dict"], strict=False)
    except Exception as e:
        print("Warning: model.load_state_dict strict=False failed -> trying strict=True then fallback", e)
        model.load_state_dict(ckpt["model_state_dict"], strict=True)
    optimizer.load_state_dict(ckpt["optimizer_state_dict"])
    if ckpt.get("scheduler_state_dict") is not None and scheduler is not None:
        try:
            scheduler.load_state_dict(ckpt["scheduler_state_dict"])
        except Exception as e:
            print("Warning: failed to load scheduler state:", e)
    start_epoch = ckpt.get("epoch", 0) + 1
    best_smape = ckpt.get("best_smape", 1e9)
    print(f"Resuming from epoch {start_epoch} (best_smape={best_smape:.4f})")
else:
    best_smape = 1e9
    start_epoch = 0
    print("No checkpoint found. Starting fresh training.")

# ----------------- Training loop with checkpointing -----------------
for epoch in range(start_epoch, EPOCHS):
    model.train()
    tot_loss = 0.0
    loop = tqdm(train_loader, desc=f"Train {epoch+1}/{EPOCHS}")
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        preds, _ = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(preds, labels)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        tot_loss += loss.item() * input_ids.size(0)
        loop.set_postfix(loss=loss.item())

    avg_train_loss = tot_loss / len(train_ds)

    # ---------- validation ----------
    model.eval()
    all_preds, all_trues = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["label"].to(DEVICE)
            preds, _ = model(input_ids=input_ids, attention_mask=attention_mask)
            all_preds.append(preds.cpu().numpy())
            all_trues.append(labels.cpu().numpy())

    preds_arr = np.concatenate(all_preds)
    trues_arr = np.concatenate(all_trues)
    preds_price = np.expm1(preds_arr)
    trues_price = np.expm1(trues_arr)
    smape = np.mean(2.0 * np.abs(preds_price - trues_price) / (np.abs(preds_price) + np.abs(trues_price) + 1e-8)) * 100.0

    print(f"Epoch {epoch+1} train_loss={avg_train_loss:.6f} val_smape={smape:.4f}")

    # ---------- Save checkpoint after every epoch ----------
    epoch_ckpt = ckpt_dir / f"checkpoint_epoch_{epoch+1}.pt"
    try:
        save_checkpoint(epoch_ckpt, epoch, model, optimizer, scheduler, best_smape)
        save_checkpoint(ckpt_latest, epoch, model, optimizer, scheduler, best_smape)
        print(f"Saved epoch checkpoint: {epoch_ckpt} and latest -> {ckpt_latest}")
    except Exception as e:
        print("Failed to save checkpoint:", e)

    # ---------- Save best model (PEFT adapters + head + tokenizer) ----------
    if smape < best_smape:
        best_smape = smape
        # Save PEFT adapter (encoder with adapters)
        peft_out = OUT_DIR / "peft_adapter"
        peft_out.mkdir(parents=True, exist_ok=True)
        # model.encoder is PeftModel/PEFT-wrapped - save adapters via save_pretrained
        try:
            model.encoder.save_pretrained(peft_out)
        except Exception as e:
            print("Warning saving peft adapters via model.encoder.save_pretrained failed:", e)
            # fallback: try saving the whole model state dict to disk (smaller than full base)
            torch.save(model.state_dict(), OUT_DIR / "model_state_dict_best.pth")
        # Save head params
        torch.save(model.head.state_dict(), OUT_DIR / "head.pth")
        # Save tokenizer
        tokenizer.save_pretrained(OUT_DIR / "tokenizer")
        print(f"✅ Saved best adapters + head to {OUT_DIR} (SMAPE={best_smape:.4f})")

print("Training finished. Best SMAPE:", best_smape)



sizes: 66888 7433
Loading tokenizer & base encoder: Qwen/Qwen3-Embedding-4B


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [5]:
# # -------------------------
# # 3+4. Load fine-tuned Qwen (PEFT) or fallback; encode & save name/bullet embeddings
# # # -------------------------
# # from transformers import AutoModel, AutoTokenizer
# # from peft import PeftModel
# # from sentence_transformers import SentenceTransformer
# # import torch
# # import numpy as np
# # import time
# # from pathlib import Path
# # from tqdm.auto import tqdm

# # --------- configurable vars (tune if needed) ----------
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# ENCODE_BATCH = globals().get("ENCODE_BATCH", 1024)   # used for encoding batches of chunks
# SAVE_EVERY = 2000   # save partial results to drive every N items
# NAME_DEFAULT_MAX = 128
# BULLET_DEFAULT_MAX = 256
# CHUNK_SIZE_WORDS = 120   # when text too long, split by ~120 words per chunk
# QWEN_SMALL_ID = "Qwen/Qwen3-Embedding-4B"

# # Paths (adjust if you saved elsewhere)
# PEFT_DIR = Path("/content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/peft_adapter")
# TOKENIZER_DIR = Path("/content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/tokenizer")
# BASE = QWEN_SMALL_ID

# OUT_NAME_EMB = Path("/content/drive/MyDrive/AMAZON25/name_embeddings_qwen4.npy")
# OUT_BULLET_EMB = Path("/content/drive/MyDrive/AMAZON25/bullet_embeddings_qwen4.npy")
# OUT_NPZ = Path("/content/drive/MyDrive/AMAZON25/train_text_embs_qwen4.npz")

# print("Attempting to load finetuned encoder from:", PEFT_DIR)
# USE_PEFT = False
# encoder = None
# tokenizer = None
# qwen_model = None

# # Try load adapters + tokenizer first
# try:
#     if PEFT_DIR.exists() and TOKENIZER_DIR.exists():
#         print("Loading tokenizer from:", TOKENIZER_DIR)
#         tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_DIR), trust_remote_code=True)
#         base = AutoModel.from_pretrained(BASE, trust_remote_code=True)
#         if hasattr(base.config, "use_cache"):
#             base.config.use_cache = False
#         encoder = PeftModel.from_pretrained(base, str(PEFT_DIR)).to(DEVICE)
#         encoder.eval()
#         USE_PEFT = True
#         print("Loaded PEFT adapter + base encoder.")
#     else:
#         print("PEFT adapter or tokenizer not found; will fall back to SentenceTransformer base.")
#         USE_PEFT = False
# except Exception as e:
#     print("Failed to load PEFT adapter/tokenizer:", e)
#     USE_PEFT = False

# if not USE_PEFT:
#     # fallback
#     print("Loading SentenceTransformer fallback:", QWEN_SMALL_ID)
#     qwen_model = SentenceTransformer(QWEN_SMALL_ID, device=DEVICE)
#     print("Fallback loaded, dim:", qwen_model.get_sentence_embedding_dimension())

# # ---- helpers ----
# def l2_normalize(a, eps=1e-12):
#     norms = np.linalg.norm(a, axis=1, keepdims=True)
#     norms[norms < eps] = 1.0
#     return a / norms

# def chunk_text_by_words(text, chunk_size=CHUNK_SIZE_WORDS):
#     words = text.split()
#     if len(words) <= chunk_size:
#         return [text]
#     chunks = []
#     for i in range(0, len(words), chunk_size):
#         chunk = " ".join(words[i:i+chunk_size])
#         chunks.append(chunk)
#     return chunks

# def embed_texts_peft_with_chunking(texts, tokenizer, model, batch_size=ENCODE_BATCH, max_length=128):
#     """Embed list of texts. If a text is longer than tokenizer.model_max_length, chunk and average."""
#     model.eval()
#     embs = []
#     model_max_len = getattr(tokenizer, "model_max_length", max_length)
#     max_len = min(max_length, model_max_len)
#     for text in tqdm(texts, desc="PEFT encode (chunking)", leave=False):
#         # if short -> encode directly
#         toks = tokenizer(text, truncation=False, padding=False, return_tensors=None)
#         approx_len = len(toks["input_ids"]) if "input_ids" in toks else len(text.split())
#         if approx_len <= max_len:
#             # direct encode
#             inputs = tokenizer([text], padding=True, truncation=True, max_length=max_len, return_tensors="pt")
#             inputs = {k: v.to(DEVICE) for k,v in inputs.items()}
#             with torch.no_grad():
#                 out = model(**inputs)
#                 cls = out.last_hidden_state[:,0,:]
#                 cls = torch.nn.functional.normalize(cls, p=2, dim=1)
#                 embs.append(cls.cpu().numpy()[0])
#         else:
#             # chunk by words and average chunk embeddings
#             chunks = chunk_text_by_words(text, chunk_size=CHUNK_SIZE_WORDS)
#             chunk_embs = []
#             # encode chunks in small batches
#             for i in range(0, len(chunks), batch_size):
#                 batch_chunks = chunks[i:i+batch_size]
#                 inputs = tokenizer(batch_chunks, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
#                 inputs = {k: v.to(DEVICE) for k,v in inputs.items()}
#                 with torch.no_grad():
#                     out = model(**inputs)
#                     c = out.last_hidden_state[:,0,:]   # (B, hidden)
#                     c = torch.nn.functional.normalize(c, p=2, dim=1)
#                     chunk_embs.append(c.cpu().numpy())
#             chunk_embs = np.vstack(chunk_embs)  # (num_chunks, dim)
#             avg = np.mean(chunk_embs, axis=0, keepdims=True)  # (1, dim)
#             # l2-normalize averaged vector (important)
#             avg = avg / (np.linalg.norm(avg, axis=1, keepdims=True) + 1e-12)
#             embs.append(avg[0])
#     return np.vstack(embs)

# def embed_texts_st_with_chunking(model_st, texts, batch_size=ENCODE_BATCH, max_length=128):
#     """Same behavior for SentenceTransformer: chunk long texts and average."""
#     model_st.eval()
#     embs = []
#     # sentence-transformers will handle truncation internally; but for long texts we chunk similarly
#     for text in tqdm(texts, desc="ST encode (chunking)", leave=False):
#         # use simple approx: if text has <= max_length words -> encode directly
#         words = text.split()
#         if len(words) <= CHUNK_SIZE_WORDS:
#             emb = model_st.encode([text], convert_to_numpy=True, show_progress_bar=False)[0]
#             embs.append(emb)
#         else:
#             chunks = chunk_text_by_words(text, chunk_size=CHUNK_SIZE_WORDS)
#             chunk_embs = []
#             for i in range(0, len(chunks), batch_size):
#                 batch_chunks = chunks[i:i+batch_size]
#                 ce = model_st.encode(batch_chunks, convert_to_numpy=True, show_progress_bar=False)
#                 chunk_embs.append(ce)
#             chunk_embs = np.vstack(chunk_embs)
#             avg = np.mean(chunk_embs, axis=0, keepdims=True)
#             avg = avg / (np.linalg.norm(avg, axis=1, keepdims=True) + 1e-12)
#             embs.append(avg[0])
#     return np.vstack(embs)

# # wrapper that matches old API
# def encode_in_batches_dummy(texts, batch_size=ENCODE_BATCH, max_length=128):
#     if USE_PEFT:
#         return embed_texts_peft_with_chunking(texts, tokenizer, encoder, batch_size=batch_size, max_length=max_length)
#     else:
#         return embed_texts_st_with_chunking(qwen_model, texts, batch_size=batch_size, max_length=max_length)

# # ---- Set dims ----
# if USE_PEFT:
#     NAME_DIM = encoder.config.hidden_size
# else:
#     NAME_DIM = qwen_model.get_sentence_embedding_dimension()
# BULLET_DIM = NAME_DIM
# print("NAME_DIM:", NAME_DIM, "BULLET_DIM:", BULLET_DIM)

# # ---- Load existing embeddings if present, otherwise compute & save incrementally ----
# if OUT_NAME_EMB.exists() and OUT_BULLET_EMB.exists():
#     print("Loading existing saved text embeddings from Drive.")
#     name_embs = np.load(OUT_NAME_EMB)
#     bullet_embs = np.load(OUT_BULLET_EMB)
#     print("Loaded shapes:", name_embs.shape, bullet_embs.shape)
# else:
#     names = df["item_name"].astype(str).tolist()
#     bullets = df["bullet_points"].astype(str).tolist()

#     # encode names (short texts)
#     print("Encoding item_name embeddings (with chunking fallback)...")
#     name_embs = encode_in_batches_dummy(names, batch_size=ENCODE_BATCH, max_length=min(NAME_DEFAULT_MAX, getattr(tokenizer, "model_max_length", NAME_DEFAULT_MAX)))
#     np.save(OUT_NAME_EMB, name_embs)
#     print("Saved item_name embeddings ->", OUT_NAME_EMB, "shape:", name_embs.shape)

#     # encode bullets (may be long)
#     print("Encoding bullet_points embeddings (may take longer due to chunking)...")
#     bullet_embs = encode_in_batches_dummy(bullets, batch_size=ENCODE_BATCH, max_length=min(BULLET_DEFAULT_MAX, getattr(tokenizer, "model_max_length", BULLET_DEFAULT_MAX)))
#     np.save(OUT_BULLET_EMB, bullet_embs)
#     print("Saved bullet embeddings ->", OUT_BULLET_EMB, "shape:", bullet_embs.shape)

#     # compressed container
#     np.savez_compressed(OUT_NPZ,
#                         sample_ids=np.array(df["sample_id"].astype(str)),
#                         name_embs=name_embs,
#                         bullet_embs=bullet_embs)
#     print("Saved compressed NPZ ->", OUT_NPZ)

# # final safe normalization
# name_embs = l2_normalize(np.array(name_embs, dtype=np.float32))
# bullet_embs = l2_normalize(np.array(bullet_embs, dtype=np.float32))
# print("Final shapes:", name_embs.shape, bullet_embs.shape)


Attempting to load finetuned encoder from: /content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/peft_adapter
Loading tokenizer from: /content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/tokenizer


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded PEFT adapter + base encoder.
NAME_DIM: 2560 BULLET_DIM: 2560
Encoding item_name embeddings (with chunking fallback)...


PEFT encode (chunking):   0%|          | 0/74321 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Cell: generate and save embeddings using finetuned PEFT adapter ONLY
# Run in Colab / notebook (make sure drive is mounted)

from pathlib import Path
import time
import numpy as np
from tqdm.auto import tqdm
import torch

# ---------- Config (edit only if needed) ----------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)
ENCODE_BATCH = 1024        # user requested batch
max_len_name = 128         # token length for item_name
max_len_bullet = 512       # token length for bullet_points

# Paths to finetuned artifacts (must exist)
PEFT_DIR = Path("/content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/peft_adapter")
TOKENIZER_DIR = Path("/content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/tokenizer")
BASE = "Qwen/Qwen3-Embedding-4B"

# Output files (user requested)
OUT_NAME_EMB = Path("/content/drive/MyDrive/AMAZON25/name_embeddings_qwen4.npy")
OUT_BULLET_EMB = Path("/content/drive/MyDrive/AMAZON25/bullet_embeddings_qwen4.npy")
OUT_NPZ = Path("/content/drive/MyDrive/AMAZON25/train_text_embs_qwen4.npz")

# Dataframe `df` must be present in the notebook environment (sample_id, item_name, bullet_points).
# If not present, set CSV_PATH and uncomment the load line below.
# CSV_PATH = "/content/drive/MyDrive/AMAZON25/final_updated_csv_with_value_unit_150.csv"
# import pandas as pd
# df = pd.read_csv(CSV_PATH)

# Safety checks
if not PEFT_DIR.exists():
    raise FileNotFoundError(f"PEFT adapter folder not found at {PEFT_DIR}. Aborting. (You asked to use finetuned model only.)")
if not TOKENIZER_DIR.exists():
    raise FileNotFoundError(f"Tokenizer folder not found at {TOKENIZER_DIR}. Aborting.")

if "df" not in globals():
    raise RuntimeError("DataFrame 'df' not found in environment. Load your test/train CSV into variable `df` before running this cell.")

# ---------- Load tokenizer + base + PEFT adapter ----------
from transformers import AutoTokenizer, AutoModel
from peft import PeftModel

print("Loading tokenizer from:", TOKENIZER_DIR)
tokenizer = AutoTokenizer.from_pretrained(str(TOKENIZER_DIR), trust_remote_code=True)

print("Loading base model (this may take a moment)...")
base = AutoModel.from_pretrained(BASE, trust_remote_code=True)
if hasattr(base.config, "use_cache"):
    base.config.use_cache = False

print("Wrapping base with PEFT adapter from:", PEFT_DIR)
encoder = PeftModel.from_pretrained(base, str(PEFT_DIR)).to(DEVICE)
encoder.eval()
print("PEFT encoder loaded. hidden_size =", encoder.config.hidden_size)

# ---------- Embedding helper (PEFT only, no fallback) ----------
import torch.nn.functional as F

def embed_texts_peft(texts, tokenizer, model, batch_size=ENCODE_BATCH, max_length=128):
    model.eval()
    all_embs = []
    n = len(texts)
    it = range(0, n, batch_size)
    pbar = tqdm(it, desc=f"Embedding ({n} texts)", unit="batch")
    for i in pbar:
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        with torch.no_grad():
            out = model(**inputs)
            cls = out.last_hidden_state[:, 0, :]           # (B, hidden)
            cls = F.normalize(cls, p=2, dim=1)             # L2 normalize
            all_embs.append(cls.cpu().numpy())
        pbar.set_postfix({'processed': min(i+batch_size, n)})
    if len(all_embs) == 0:
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)
    return np.vstack(all_embs)

# ---------- Generate or load embeddings ----------
if OUT_NAME_EMB.exists() and OUT_BULLET_EMB.exists():
    print("Found existing embeddings. Loading from disk:")
    name_embs = np.load(OUT_NAME_EMB)
    bullet_embs = np.load(OUT_BULLET_EMB)
    print("Loaded shapes:", name_embs.shape, bullet_embs.shape)
else:
    names = df["item_name"].astype(str).tolist()
    bullets = df["bullet_points"].astype(str).tolist()

    t0 = time.time()
    print("Encoding item_name (max_length =", max_len_name, ") ...")
    name_embs = embed_texts_peft(names, tokenizer, encoder, batch_size=ENCODE_BATCH, max_length=max_len_name)
    print("Encoding bullet_points (max_length =", max_len_bullet, ") ...")
    bullet_embs = embed_texts_peft(bullets, tokenizer, encoder, batch_size=ENCODE_BATCH, max_length=max_len_bullet)

    # Defensive final normalization (should already be normalized)
    def l2_normalize(a, eps=1e-12):
        norms = np.linalg.norm(a, axis=1, keepdims=True)
        norms[norms < eps] = 1.0
        return a / norms

    name_embs = l2_normalize(name_embs.astype(np.float32))
    bullet_embs = l2_normalize(bullet_embs.astype(np.float32))

    # Save to Drive
    np.save(OUT_NAME_EMB, name_embs)
    np.save(OUT_BULLET_EMB, bullet_embs)
    np.savez_compressed(OUT_NPZ,
                       sample_ids=np.array(df["sample_id"].astype(str)),
                       name_embs=name_embs,
                       bullet_embs=bullet_embs)
    print(f"Saved embeddings to:\n - {OUT_NAME_EMB}\n - {OUT_BULLET_EMB}\n - {OUT_NPZ}")
    print("Time elapsed: %.1f s" % (time.time() - t0))

print("Final shapes: name_embs:", name_embs.shape, " bullet_embs:", bullet_embs.shape)
# ready — downstream cells can use name_embs, bullet_embs, and the saved files on Drive


Device: cuda
Loading tokenizer from: /content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/tokenizer
Loading base model (this may take a moment)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Wrapping base with PEFT adapter from: /content/drive/MyDrive/AMAZON25/qwen4b_lora_finetuned/peft_adapter
PEFT encoder loaded. hidden_size = 2560
Encoding item_name (max_length = 128 ) ...


Embedding (74321 texts):   0%|          | 0/73 [00:00<?, ?batch/s]

In [None]:
print("hi")

hi


In [None]:
#load the saved embeddings :
OUT_NAME_EMB = Path("/content/drive/MyDrive/AMAZON25/name_embeddings_qwen06.npy")
OUT_BULLET_EMB = Path("/content/drive/MyDrive/AMAZON25/bullet_embeddings_qwen06.npy")

name_embs = np.load(OUT_NAME_EMB)
bullet_embs = np.load(OUT_BULLET_EMB)

print("Embeddings loaded successfully:")
print("name_embs:", name_embs.shape)
print("bullet_embs:", bullet_embs.shape)


Embeddings loaded successfully:
name_embs: (74321, 1024)
bullet_embs: (74321, 1024)


In [None]:
# -------------------------
# 5. Align image embeddings to dataframe order
# -------------------------
def get_img_emb_for_sample(sid):
    idx = sampleid_to_img_idx.get(str(sid), None)
    if idx is None:
        return np.zeros(IMAGE_DIM, dtype=np.float32)
    return img_emb_arr[idx].astype(np.float32)

image_embs_ordered = np.vstack([get_img_emb_for_sample(sid) for sid in df["sample_id"].astype(str).tolist()])
print("Ordered image embeddings shape:", image_embs_ordered.shape)

Ordered image embeddings shape: (74321, 768)


step # 6 Prepare numeric & categorical inputs

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

# Assuming df is already loaded and cleaned
df["value_num"] = pd.to_numeric(df.get("value", 0.0), errors="coerce").fillna(0.0)

# Encode unit column
units = df["unit"].fillna("UNKNOWN").astype(str).tolist()
le_unit = LabelEncoder()
unit_ids = le_unit.fit_transform(units)
num_unit_classes = len(le_unit.classes_)
print("num_unit_classes:", num_unit_classes)

# ✅ Save the label encoder directly to Drive (not to a folder)
joblib.dump(le_unit, "/content/drive/MyDrive/AMAZON25/unit_labelenc.joblib")

# Prepare log price target
y_log = df["log_price"].values.astype(np.float32)


num_unit_classes: 13


In [None]:
# 7. Train / val split (stratified)
# -------------------------
price_q = pd.qcut(df["log_price"], q=10, duplicates="drop")
train_idx, val_idx = train_test_split(np.arange(len(df)), test_size=0.10, random_state=SEED, stratify=price_q)
print("train / val sizes:", train_idx.shape[0], val_idx.shape[0])

# Build arrays
X_name = name_embs.astype(np.float32)
X_bullet = bullet_embs.astype(np.float32)
X_img = image_embs_ordered.astype(np.float32)
X_value = df["value_num"].astype(np.float32).values.reshape(-1, 1)
X_unit = unit_ids.astype(np.int64)

train / val sizes: 66888 7433


In [None]:
# -------------------------
# 8. Dataset & DataLoader
# -------------------------
class MultiModalDataset(Dataset):
    def __init__(self, idxs):
        self.idxs = idxs
    def __len__(self):
        return len(self.idxs)
    def __getitem__(self, i):
        idx = self.idxs[i]
        return {
            "name_emb": X_name[idx],
            "bullet_emb": X_bullet[idx],
            "img_emb": X_img[idx],
            "value": X_value[idx],
            "unit": X_unit[idx],
            "label": y_log[idx],
            "sample_id": df.iloc[idx]["sample_id"]
        }

BATCH_SIZE = 64
train_loader = DataLoader(MultiModalDataset(train_idx), batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(MultiModalDataset(val_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [None]:

NAME_DIM = name_embs.shape[1]
BULLET_DIM = bullet_embs.shape[1]
IMAGE_DIM = image_embs_ordered.shape[1]



In [None]:
# -------------------------
# 9. Model definition (learnable unit embedding included)
# -------------------------
class MultiModalRegressor(nn.Module):
    def __init__(self, name_dim, bullet_dim, img_dim, num_unit_classes, proj_dim=256, cat_emb_dim=32, dropout=0.3):
        super().__init__()
        # projections
        self.proj_name = nn.Sequential(nn.Linear(name_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        self.proj_bullet = nn.Sequential(nn.Linear(bullet_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        self.proj_img = nn.Sequential(nn.Linear(img_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        # learnable unit embedding
        self.unit_emb = nn.Embedding(num_unit_classes, cat_emb_dim)
        nn.init.xavier_uniform_(self.unit_emb.weight)
        # numeric value projector
        self.value_proj = nn.Sequential(nn.Linear(1, 32), nn.ReLU())
        # head
        total_dim = proj_dim * 3 + cat_emb_dim + 32
        self.head = nn.Sequential(
            nn.Linear(total_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(dropout * 0.8),
            nn.Linear(128, 1)
        )
    def forward(self, name_emb, bullet_emb, img_emb, value, unit_ids):
        n = self.proj_name(name_emb)
        b = self.proj_bullet(bullet_emb)
        i = self.proj_img(img_emb)
        u = self.unit_emb(unit_ids)
        v = self.value_proj(value)
        x = torch.cat([n, b, i, u, v], dim=1)
        out = self.head(x)
        return out.squeeze(1)

model = MultiModalRegressor(NAME_DIM, BULLET_DIM, IMAGE_DIM, num_unit_classes, proj_dim=PROJ_DIM, cat_emb_dim=CAT_EMB_DIM, dropout=0.25).to(DEVICE)
print("Trainable params:", sum(p.numel() for p in model.parameters() if p.requires_grad))

Trainable params: 1215969


In [None]:
# -------------------------
# 10. Training loop (MSE on log target) + SMAPE eval on price scale
# -------------------------
def compute_smape(y_true_price, y_pred_price):
    return np.mean(2.0 * np.abs(y_pred_price - y_true_price) / (np.abs(y_pred_price) + np.abs(y_true_price) + 1e-8)) * 100.0

optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2)
loss_fn = nn.MSELoss()

best_smape = 1e9
patience = 4
no_imp = 0
EPOCHS = 30

from tqdm.auto import tqdm

# loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for batch in loop:
        optimizer.zero_grad()
        name = torch.tensor(batch["name_emb"], dtype=torch.float32).to(DEVICE)
        bullet = torch.tensor(batch["bullet_emb"], dtype=torch.float32).to(DEVICE)
        img = torch.tensor(batch["img_emb"], dtype=torch.float32).to(DEVICE)
        value = torch.tensor(batch["value"], dtype=torch.float32).to(DEVICE)
        unit = torch.tensor(batch["unit"], dtype=torch.long).to(DEVICE)
        label = torch.tensor(batch["label"], dtype=torch.float32).to(DEVICE)
        preds_log = model(name, bullet, img, value, unit)
        loss = loss_fn(preds_log, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * name.size(0)
        loop.set_postfix(loss=loss.item())
    avg_train_loss = total_loss / len(train_idx)

    # validation
    model.eval()
    preds_log = []
    trues_log = []
    with torch.no_grad():
        for batch in val_loader:
            name = torch.tensor(batch["name_emb"], dtype=torch.float32).to(DEVICE)
            bullet = torch.tensor(batch["bullet_emb"], dtype=torch.float32).to(DEVICE)
            img = torch.tensor(batch["img_emb"], dtype=torch.float32).to(DEVICE)
            value = torch.tensor(batch["value"], dtype=torch.float32).to(DEVICE)
            unit = torch.tensor(batch["unit"], dtype=torch.long).to(DEVICE)
            label = torch.tensor(batch["label"], dtype=torch.float32).to(DEVICE)
            pred_log = model(name, bullet, img, value, unit)
            preds_log.append(pred_log.cpu().numpy())
            trues_log.append(label.cpu().numpy())
    preds_log = np.concatenate(preds_log)
    trues_log = np.concatenate(trues_log)

    preds_price = np.expm1(preds_log)
    trues_price = np.expm1(trues_log)
    smape = compute_smape(trues_price, preds_price)

    print(f"Epoch {epoch+1}/{EPOCHS} train_loss={avg_train_loss:.6f} val_smape={smape:.4f}")
    scheduler.step(smape)

    if smape < best_smape:
        best_smape = smape
        no_imp = 0
        torch.save(model.state_dict(), "/content/drive/MyDrive/AMAZON25/best_multimodal_mlp_qwen06.pth")
        print("Saved best model. SMAPE:", best_smape)
    else:
        no_imp += 1
        if no_imp >= patience:
            print("Early stopping at epoch", epoch+1)
            break

# # Save final artifacts
# joblib.dump(le_unit, OUT_DIR / "unit_labelenc.joblib")
# np.save(OUT_DIR / "name_embs_qwen06.npy", name_embs)
# np.save(OUT_DIR / "bullet_embs_qwen06.npy", bullet_embs)
# np.save(OUT_DIR / "image_embs_order.npy", image_embs_ordered)
# # Also keep copies in DRIVE_EMB_DIR for safety
# np.save(DRIVE_EMB_DIR / "name_embs_qwen06.npy", name_embs)
# np.save(DRIVE_EMB_DIR / "bullet_embs_qwen06.npy", bullet_embs)
# np.save(DRIVE_EMB_DIR / "image_embs_order.npy", image_embs_ordered)

print("Finished training. Best SMAPE:", best_smape)
# print("Artifacts saved to:", OUT_DIR)

Epoch 1/30:   0%|          | 0/1046 [00:00<?, ?it/s]

  name = torch.tensor(batch["name_emb"], dtype=torch.float32).to(DEVICE)
  bullet = torch.tensor(batch["bullet_emb"], dtype=torch.float32).to(DEVICE)
  img = torch.tensor(batch["img_emb"], dtype=torch.float32).to(DEVICE)
  value = torch.tensor(batch["value"], dtype=torch.float32).to(DEVICE)
  unit = torch.tensor(batch["unit"], dtype=torch.long).to(DEVICE)
  label = torch.tensor(batch["label"], dtype=torch.float32).to(DEVICE)
  name = torch.tensor(batch["name_emb"], dtype=torch.float32).to(DEVICE)
  bullet = torch.tensor(batch["bullet_emb"], dtype=torch.float32).to(DEVICE)
  img = torch.tensor(batch["img_emb"], dtype=torch.float32).to(DEVICE)
  value = torch.tensor(batch["value"], dtype=torch.float32).to(DEVICE)
  unit = torch.tensor(batch["unit"], dtype=torch.long).to(DEVICE)
  label = torch.tensor(batch["label"], dtype=torch.float32).to(DEVICE)


Epoch 1/30 train_loss=256.105949 val_smape=58.5427
Saved best model. SMAPE: 58.542675


Epoch 2/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 2/30 train_loss=14.361169 val_smape=59.7889


Epoch 3/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 3/30 train_loss=1.103273 val_smape=56.0555
Saved best model. SMAPE: 56.055504


Epoch 4/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 4/30 train_loss=0.744790 val_smape=56.9601


Epoch 5/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 5/30 train_loss=0.804649 val_smape=54.2487
Saved best model. SMAPE: 54.248714


Epoch 6/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 6/30 train_loss=0.595963 val_smape=55.6095


Epoch 7/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 7/30 train_loss=0.503842 val_smape=56.5472


Epoch 8/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 8/30 train_loss=0.461520 val_smape=55.0271


Epoch 9/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f819a1da5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    Exception ignored in: if w.is_alive():
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f819a1da5c0>  
 Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
       self._shutdown_workers() 
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^^    ^if w.is_alive():^
^ ^ ^ ^ ^ ^ ^ 
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ ^ ^ ^ ^ ^ ^ ^ ^ 
    File "/usr

Epoch 9/30 train_loss=0.419105 val_smape=51.7414
Saved best model. SMAPE: 51.74142


Epoch 10/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 10/30 train_loss=0.366872 val_smape=51.0061
Saved best model. SMAPE: 51.006104


Epoch 11/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 11/30 train_loss=0.331774 val_smape=52.7652


Epoch 12/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 12/30 train_loss=0.306294 val_smape=51.1979


Epoch 13/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 13/30 train_loss=0.291963 val_smape=51.2058


Epoch 14/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 14/30 train_loss=0.236894 val_smape=50.3134
Saved best model. SMAPE: 50.313366


Epoch 15/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 15/30 train_loss=0.215709 val_smape=50.5878


Epoch 16/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 16/30 train_loss=0.198915 val_smape=50.4852


Epoch 17/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 17/30 train_loss=0.220894 val_smape=51.0492


Epoch 18/30:   0%|          | 0/1046 [00:00<?, ?it/s]

Epoch 18/30 train_loss=0.169601 val_smape=50.4359
Early stopping at epoch 18
Finished training. Best SMAPE: 50.313366


In [None]:
pip install open-clip-torch

Collecting open-clip-torch
  Downloading open_clip_torch-3.2.0-py3-none-any.whl.metadata (32 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading open_clip_torch-3.2.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open-clip-torch
Successfully installed ftfy-6.3.1 open-clip-torch-3.2.0


In [None]:
# ------------------ Setup ------------------

import os
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from PIL import Image
import requests
import joblib
from torch.utils.data import DataLoader, Dataset

from sentence_transformers import SentenceTransformer
import open_clip

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# ------------------ Paths ------------------
TEST_CSV = "/content/drive/MyDrive/AMAZON25/test_cleaned1.csv"
MLP_PATH = "/content/drive/MyDrive/AMAZON25/best_multimodal_mlp_qwen06_49.pth"
UNIT_ENCODER_PATH = "/content/drive/MyDrive/AMAZON25/unit_labelenc.joblib"
IMAGE_DIR = "/content/drive/MyDrive/AMAZON25/test_images"
os.makedirs(IMAGE_DIR, exist_ok=True)

# ------------------ Load Test CSV ------------------
test_df = pd.read_csv(TEST_CSV)
test_df["item_name"] = test_df["item_name"].fillna("").astype(str)
test_df["bullet_points"] = test_df.get("bullet_points", "").fillna("").astype(str)
test_df["value"] = pd.to_numeric(test_df.get("value", 0.0), errors="coerce").fillna(0.0)


# Load your trained LabelEncoder
le_unit = joblib.load(UNIT_ENCODER_PATH)

# Fill missing and convert to str
test_df["unit"] = test_df["unit"].fillna("unknown").astype(str)

# Replace any unseen labels with "unknown"
test_df["unit"] = test_df["unit"].apply(lambda x: x if x in le_unit.classes_ else "unknown")

# Now transform safely
unit_ids = le_unit.transform(test_df["unit"].tolist())


# TEST_EMB_DIR = "/content/drive/MyDrive/AMAZON25"


# ------------------ Load Qwen Text Embedding Model ------------------
QWEN_SMALL_ID = "Qwen/Qwen3-Embedding-0.6B"
print("Loading Qwen small embedding model...")
qwen_model = SentenceTransformer(QWEN_SMALL_ID, device=DEVICE)
NAME_DIM = qwen_model.get_sentence_embedding_dimension()
BULLET_DIM = NAME_DIM
print("Text embedding dim:", NAME_DIM)

# ------------------ Helper function: L2-normalize ------------------
def l2_normalize(a, eps=1e-12):
    norms = np.linalg.norm(a, axis=1, keepdims=True)
    norms[norms < eps] = 1.0
    return a / norms

def encode_in_batches(model, texts, batch_size=128):
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches"):
        batch = texts[i:i+batch_size]
        emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
        all_embs.append(emb)
    return l2_normalize(np.vstack(all_embs))

# ------------------ Generate and Save item_name embeddings ------------------
# print("Generating item_name embeddings...")
# name_embs = encode_in_batches(qwen_model, test_df["item_name"].tolist(), batch_size=4096)
# name_emb_path = "/content/drive/MyDrive/AMAZON25/test_name_embs_qwen06.npy"
# np.save(name_emb_path, name_embs)
# print(f"Saved item_name embeddings to {name_emb_path} | shape={name_embs.shape}")

# # ------------------ Generate and Save bullet_points embeddings ------------------
# print("Generating bullet_points embeddings...")
# bullet_embs = encode_in_batches(qwen_model, test_df["bullet_points"].tolist(), batch_size=4096)
# bullet_emb_path = "/content/drive/MyDrive/AMAZON25/test_bullet_embs_qwen06.npy"
# np.save(bullet_emb_path, bullet_embs)
# print(f"Saved bullet_points embeddings to {bullet_emb_path} | shape={bullet_embs.shape}")

#loading the saved ones
# ------------------ Load pre-saved item_name embeddings ------------------
name_emb_path = "/content/drive/MyDrive/AMAZON25/test_name_embs_qwen06.npy"
print(f"Loading item_name embeddings from {name_emb_path}...")
name_embs = np.load(name_emb_path)
print(f"Loaded item_name embeddings | shape={name_embs.shape}")

# ------------------ Load pre-saved bullet_points embeddings ------------------
bullet_emb_path = "/content/drive/MyDrive/AMAZON25/test_bullet_embs_qwen06.npy"
print(f"Loading bullet_points embeddings from {bullet_emb_path}...")
bullet_embs = np.load(bullet_emb_path)
print(f"Loaded bullet_points embeddings | shape={bullet_embs.shape}")


# ------------------ Load Marqo/OpenCLIP Image Embedding Model ------------------
OPENCLIP_MODEL = "hf-hub:Marqo/marqo-ecommerce-embeddings-B"
clip_model, _, preprocess = open_clip.create_model_and_transforms(OPENCLIP_MODEL)
clip_model.eval().to(DEVICE)

# ------------------ Image Embedding Dataset ------------------
class ImageDataset(Dataset):
    def __init__(self, urls, sample_ids):
        self.urls = urls
        self.ids = sample_ids
    def __len__(self):
        return len(self.urls)
    def __getitem__(self, idx):
        url = self.urls[idx]
        sid = self.ids[idx]
        try:
            img = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        except:
            img = Image.new("RGB", (224,224), color=(0,0,0))
        return {"img": img, "sample_id": sid}

def collate_fn(batch):
    filtered = [b for b in batch if b.get('img') is not None]
    if len(filtered) == 0:
        return None
    imgs = [b['img'] for b in filtered]
    ids = [b['sample_id'] for b in filtered]
    return {"imgs": imgs, "ids": ids}

img_dataset = ImageDataset(test_df["image_link"].tolist(), test_df["sample_id"].tolist())
img_loader = DataLoader(img_dataset, batch_size=1024, shuffle=False, collate_fn=collate_fn)

def compute_image_embeddings(loader, save_path):
    all_embs = []
    all_ids = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Image embeddings"):
            if batch is None:
                continue
            imgs = batch['imgs']
            ids = batch['ids']
            inputs = torch.stack([preprocess(img) for img in imgs]).to(DEVICE)
            img_embs = clip_model.encode_image(inputs)
            img_embs = torch.nn.functional.normalize(img_embs, dim=-1)
            all_embs.append(img_embs.cpu().numpy())
            all_ids.extend(ids)
            # Optional: save intermediate embeddings per batch to avoid losing progress
            if len(all_embs) % 50 == 0:  # every 50 batches
                np.savez_compressed(save_path, embeddings=np.vstack(all_embs), sample_ids=np.array(all_ids))
    embeddings = np.vstack(all_embs)
    np.savez_compressed(save_path, embeddings=embeddings, sample_ids=np.array(all_ids))
    return embeddings, all_ids

# ------------------ Generate and Save image embeddings ------------------
img_emb_path = "/content/drive/MyDrive/AMAZON25/test_image_embs_marqo.npz"
print("Generating image embeddings...")
img_embs, img_ids = compute_image_embeddings(img_loader, img_emb_path)
IMAGE_DIM = img_embs.shape[1]
print(f"Saved image embeddings to {img_emb_path} | shape={img_embs.shape} | Image dim: {IMAGE_DIM}")

# ------------------ Load MLP Model ------------------
import torch.nn as nn

PROJ_DIM = 256
CAT_EMB_DIM = 32
num_unit_classes = len(le_unit.classes_)

class MultiModalRegressor(nn.Module):
    def __init__(self, name_dim, bullet_dim, img_dim, num_unit_classes, proj_dim=256, cat_emb_dim=32, dropout=0.3):
        super().__init__()
        self.proj_name = nn.Sequential(nn.Linear(name_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        self.proj_bullet = nn.Sequential(nn.Linear(bullet_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        self.proj_img = nn.Sequential(nn.Linear(img_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        self.unit_emb = nn.Embedding(num_unit_classes, cat_emb_dim)
        nn.init.xavier_uniform_(self.unit_emb.weight)
        self.value_proj = nn.Sequential(nn.Linear(1,32), nn.ReLU())
        total_dim = proj_dim*3 + cat_emb_dim + 32
        self.head = nn.Sequential(nn.Linear(total_dim,512), nn.ReLU(),
                                  nn.Dropout(dropout), nn.Linear(512,128), nn.ReLU(),
                                  nn.Dropout(dropout*0.8), nn.Linear(128,1))
    def forward(self, name_emb, bullet_emb, img_emb, value, unit_ids):
        n = self.proj_name(name_emb)
        b = self.proj_bullet(bullet_emb)
        i = self.proj_img(img_emb)
        u = self.unit_emb(unit_ids)
        v = self.value_proj(value)
        x = torch.cat([n,b,i,u,v], dim=1)
        out = self.head(x)
        return out.squeeze(1)

mlp_model = MultiModalRegressor(NAME_DIM, BULLET_DIM, IMAGE_DIM, num_unit_classes,
                                proj_dim=PROJ_DIM, cat_emb_dim=CAT_EMB_DIM, dropout=0.25).to(DEVICE)
mlp_model.load_state_dict(torch.load(MLP_PATH, map_location=DEVICE))
mlp_model.eval()

# ------------------ Prepare Tensors ------------------
value_tensor = torch.tensor(test_df["value"].values.reshape(-1,1), dtype=torch.float32).to(DEVICE)
unit_tensor = torch.tensor(unit_ids, dtype=torch.long).to(DEVICE)
name_tensor = torch.tensor(name_embs, dtype=torch.float32).to(DEVICE)
bullet_tensor = torch.tensor(bullet_embs, dtype=torch.float32).to(DEVICE)
img_tensor = torch.tensor(img_embs, dtype=torch.float32).to(DEVICE)

# ------------------ Batch Prediction ------------------
batch_size = 1024
preds = []

for i in tqdm(range(0, len(test_df), batch_size)):
    n = name_tensor[i:i+batch_size]
    b = bullet_tensor[i:i+batch_size]
    im = img_tensor[i:i+batch_size]
    v = value_tensor[i:i+batch_size]
    u = unit_tensor[i:i+batch_size]
    with torch.no_grad():
        batch_pred = mlp_model(n,b,im,v,u)
    preds.append(batch_pred.cpu().numpy())

preds = np.concatenate(preds)
prices = np.expm1(preds)  # log1p -> price

# ------------------ Save CSV ------------------
submission = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "price": prices
})
submission.to_csv("/content/drive/MyDrive/AMAZON25/test_submission51.csv", index=False)
print("✅ Submission CSV saved:", "/content/drive/MyDrive/datamain/test_submission1.csv")


Device: cuda
Loading Qwen small embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Text embedding dim: 1024
Loading item_name embeddings from /content/drive/MyDrive/AMAZON25/test_name_embs_qwen06.npy...
Loaded item_name embeddings | shape=(75000, 1024)
Loading bullet_points embeddings from /content/drive/MyDrive/AMAZON25/test_bullet_embs_qwen06.npy...
Loaded bullet_points embeddings | shape=(75000, 1024)


open_clip_config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

open_clip_model.safetensors:   0%|          | 0.00/813M [00:00<?, ?B/s]

Generating image embeddings...


Image embeddings:   0%|          | 0/74 [00:00<?, ?it/s]

Saved image embeddings to /content/drive/MyDrive/AMAZON25/test_image_embs_marqo.npz | shape=(75000, 768) | Image dim: 768


  0%|          | 0/74 [00:00<?, ?it/s]

✅ Submission CSV saved: /content/drive/MyDrive/datamain/test_submission1.csv


In [None]:
class MultiModalRegressor(nn.Module):
    def __init__(self, name_dim, bullet_dim, img_dim, num_unit_classes, proj_dim=256, cat_emb_dim=32, dropout=0.3):
        super().__init__()
        # projections
        self.proj_name = nn.Sequential(nn.Linear(name_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        self.proj_bullet = nn.Sequential(nn.Linear(bullet_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())
        self.proj_img = nn.Sequential(nn.Linear(img_dim, proj_dim), nn.BatchNorm1d(proj_dim), nn.ReLU())

        # learnable unit embedding
        self.unit_emb = nn.Embedding(num_unit_classes, cat_emb_dim)
        nn.init.xavier_uniform_(self.unit_emb.weight)

        # numeric value projector
        self.value_proj = nn.Sequential(nn.Linear(1, 32), nn.ReLU())

        # gating layer (to weight modalities dynamically)
        self.gate = nn.Linear(proj_dim * 3, proj_dim * 3)

        # residual MLP block
        total_dim = proj_dim * 3 + cat_emb_dim + 32
        self.fc1 = nn.Linear(total_dim, total_dim)
        self.res_fc = nn.Linear(total_dim, total_dim)

        # head
        self.head = nn.Sequential(
            nn.Linear(total_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(dropout * 0.8),
            nn.Linear(128, 1)
        )

    def forward(self, name_emb, bullet_emb, img_emb, value, unit_ids):
        n = self.proj_name(name_emb)
        b = self.proj_bullet(bullet_emb)
        i = self.proj_img(img_emb)
        u = self.unit_emb(unit_ids)
        v = self.value_proj(value)

        # Gating
        gates = torch.sigmoid(self.gate(torch.cat([n, b, i], dim=1)))
        n = n * gates[:, :n.size(1)]
        b = b * gates[:, n.size(1):n.size(1)+b.size(1)]
        i = i * gates[:, -i.size(1):]

        # Combine
        x = torch.cat([n, b, i, u, v], dim=1)

        # Residual block
        x = x + self.res_fc(F.relu(self.fc1(x)))

        # Prediction head
        out = self.head(x)
        return out.squeeze(1)

mlp_model = MultiModalRegressor(NAME_DIM, BULLET_DIM, IMAGE_DIM, num_unit_classes,
                                proj_dim=PROJ_DIM, cat_emb_dim=CAT_EMB_DIM, dropout=0.25).to(DEVICE)
mlp_model.load_state_dict(torch.load("/content/drive/MyDrive/AMAZON25/best_multimodal_mlp_qwen06_48.pth", map_location=DEVICE))
mlp_model.eval()

# ------------------ Prepare Tensors ------------------
value_tensor = torch.tensor(test_df["value"].values.reshape(-1,1), dtype=torch.float32).to(DEVICE)
unit_tensor = torch.tensor(unit_ids, dtype=torch.long).to(DEVICE)
name_tensor = torch.tensor(name_embs, dtype=torch.float32).to(DEVICE)
bullet_tensor = torch.tensor(bullet_embs, dtype=torch.float32).to(DEVICE)
img_tensor = torch.tensor(img_embs, dtype=torch.float32).to(DEVICE)

# ------------------ Batch Prediction ------------------
batch_size = 1024
preds = []

for i in tqdm(range(0, len(test_df), batch_size)):
    n = name_tensor[i:i+batch_size]
    b = bullet_tensor[i:i+batch_size]
    im = img_tensor[i:i+batch_size]
    v = value_tensor[i:i+batch_size]
    u = unit_tensor[i:i+batch_size]
    with torch.no_grad():
        batch_pred = mlp_model(n,b,im,v,u)
    preds.append(batch_pred.cpu().numpy())

preds = np.concatenate(preds)
prices = np.expm1(preds)  # log1p -> price

# ------------------ Save CSV ------------------
submission = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "price": prices
})
submission.to_csv("/content/drive/MyDrive/AMAZON25/test_submission51.csv", index=False)
print("✅ Submission CSV saved:", "/content/drive/MyDrive/datamain/test_submission1.csv")


  0%|          | 0/74 [00:00<?, ?it/s]

✅ Submission CSV saved: /content/drive/MyDrive/datamain/test_submission1.csv
