In [None]:
# ================================================================
# ONE-CELL: MiniLM (Text) + Swin-Tiny (Vision) → MLP
# Predict Product Price (train + eval + artifact save)
# Compact, Colab-ready, GPU-optimized
# ================================================================
!pip -q install torch torchvision torchaudio transformers pandas scikit-learn pillow tqdm --index-url https://download.pytorch.org/whl/cu121

import os, io, re, json, time, random, hashlib, urllib.request, warnings, pathlib
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd
from tqdm import tqdm
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ----------------- Repro & perf -----------------
# Seed every RNG the notebook touches (Python, NumPy, torch CPU) with one value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible; everything below keys off DEVICE.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

if DEVICE == "cuda":
    # Seed all CUDA devices, then enable the cuDNN autotuner and TF32 matmuls.
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True

print("✅ Device:", DEVICE)

# ----------------- Config -----------------
# Text backbone checkpoint (loaded by both the tokenizer and TextEncoder).
TEXT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Image side: square input resolution and width of the projected embedding.
IMG_SIZE = 224
IMG_PROJ = 384

# Width of the first hidden layer of the fusion MLP.
MLP_HID = 512

# Smaller batches on CPU, larger on GPU.
BATCH = 16 if DEVICE != "cuda" else 96

# Optimizer / schedule settings.
LR = 3e-4
WEIGHT_DECAY = 1e-4
EPOCHS = 6
# Number of consecutive non-improving epochs tolerated before stopping.
EARLY_STOP = 2

# ----------------- Load CSV -----------------
# Obtain train.csv: use the local file if present, otherwise ask Colab for an upload.
if not os.path.exists("train.csv"):
    from google.colab import files
    print("Please upload train.csv")
    uploaded = files.upload()
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    if "train.csv" not in uploaded:
        raise FileNotFoundError("train.csv was not uploaded")

df = pd.read_csv("train.csv")
# Normalise header names so the required-column check is case/space insensitive.
df.columns = [c.lower().strip() for c in df.columns]
need = {"sample_id","catalog_content","image_link","price"}
if not need.issubset(df.columns):
    raise ValueError(f"Missing columns: {need - set(df.columns)}")

# Fill missing text BEFORE casting to str: astype(str) would turn NaN into the
# literal string "nan", which fillna can no longer see.
df["catalog_content"] = (
    df["catalog_content"]
    .fillna("")
    .astype(str)
    .str.replace("\x00", " ", regex=False)
    .str.strip()
)
# Unparseable prices become NaN and are removed by the positivity filter below.
df["price"] = pd.to_numeric(df["price"], errors="coerce")
# Keep rows with a strictly positive price and a non-missing image link.
df = df[df["price"]>0].dropna(subset=["catalog_content","image_link","price"]).reset_index(drop=True)
print("Rows:", len(df))

# ----------------- Split + Clip -----------------
# Stratify the split on price quantile bins so both folds cover the full price range.
bins = pd.qcut(df["price"], q=40, duplicates="drop", labels=False)
df_tr, df_va = train_test_split(df, test_size=0.2, random_state=SEED, stratify=bins)
# Take explicit copies: the frames returned by the split are derived from `df`,
# and adding a column to them is a chained assignment (the warning for it is
# silenced by the global warnings filter).
df_tr = df_tr.copy()
df_va = df_va.copy()
# Clip bounds come from the training fold only; they are applied to predictions
# at evaluation time and stored in the saved config.
lo_clip, hi_clip = float(np.quantile(df_tr["price"], 0.001)), float(np.quantile(df_tr["price"], 0.999))
# Regression target is log(1 + price).
df_tr["log_price"] = np.log1p(df_tr["price"])
df_va["log_price"] = np.log1p(df_va["price"])
print(f"Train={len(df_tr)} | Valid={len(df_va)} | Clip=[{lo_clip:.2f},{hi_clip:.2f}]")

# ----------------- Tokenizer -----------------
# Every description is padded/truncated to this many tokens by PriceDataset.
MAX_LEN = 128
# Tokenizer matching the text backbone checkpoint.
tok = AutoTokenizer.from_pretrained(TEXT_MODEL)

# ----------------- Image caching -----------------
# On-disk image cache directory, created next to the notebook if missing.
CACHE = pathlib.Path("_img_cache")
CACHE.mkdir(exist_ok=True)


def url_to_path(u):
    """Return the cache file for URL `u`: `<CACHE>/<md5 hex digest of u>.jpg`."""
    digest = hashlib.md5(u.encode()).hexdigest()
    return CACHE / f"{digest}.jpg"

def fetch_image(u, timeout=8):
    """Return the image at URL `u` as an RGB PIL image, using the disk cache.

    Lookup order:
      1. A readable cached copy at `url_to_path(u)`.
      2. A fresh download (cached afterwards on a best-effort basis).
      3. A blank white IMG_SIZE x IMG_SIZE image when the download or decode fails.

    Args:
        u: Image URL.
        timeout: Timeout in seconds passed to `urllib.request.urlopen`.

    Returns:
        A PIL image in RGB mode; this function does not raise for
        download, decode, or cache-write failures.
    """
    p = url_to_path(u)
    if p.exists():
        try:
            return Image.open(p).convert("RGB")
        except Exception:
            # Unreadable cache entry: drop it and fall through to a re-download.
            p.unlink(missing_ok=True)

    try:
        req = urllib.request.Request(u, headers={"User-Agent":"Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            b = r.read()
        img = Image.open(io.BytesIO(b)).convert("RGB")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # propagate; any download or decode failure yields a white placeholder.
        return Image.fromarray(np.full((IMG_SIZE,IMG_SIZE,3),255,np.uint8),"RGB")

    # Write to a per-process temp file and rename it into place, so an
    # interrupted or concurrent write never leaves a half-written JPEG behind
    # (truncated files would otherwise load silently because
    # LOAD_TRUNCATED_IMAGES is enabled).
    tmp = p.with_name(f"{p.stem}.{os.getpid()}.tmp")
    try:
        img.save(tmp, format="JPEG", quality=85)
        os.replace(tmp, p)
    except OSError:
        # Caching is best-effort: a failed write must not discard a good download.
        tmp.unlink(missing_ok=True)
    return img

# Image preprocessing: fixed-size resize -> tensor -> per-channel normalisation.
img_tfms = transforms.Compose(
    [
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# ----------------- Dataset -----------------
class PriceDataset(Dataset):
    """Map-style dataset yielding (input_ids, attention_mask, image, log_price).

    Expects a DataFrame with `catalog_content`, `image_link` and `log_price`
    columns; the frame is re-indexed 0..n-1 so positional access is safe.
    """

    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        row = self.df.iloc[i]
        # Tokenise to a fixed MAX_LEN so samples stack into a batch directly.
        enc = tok(
            row["catalog_content"],
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Image is fetched (cache or network) here, inside the loader worker.
        pixels = img_tfms(fetch_image(row["image_link"]))
        target = torch.tensor(float(row["log_price"]), dtype=torch.float32)
        # The tokenizer returns a leading batch axis of size 1; drop it.
        input_ids = enc["input_ids"].squeeze(0)
        attention = enc["attention_mask"].squeeze(0)
        return input_ids, attention, pixels, target

# Loader settings shared by both folds; only the shuffle flag differs.
_dl_common = dict(batch_size=BATCH, num_workers=4, pin_memory=True)
dl_tr = DataLoader(PriceDataset(df_tr), shuffle=True, **_dl_common)
dl_va = DataLoader(PriceDataset(df_va), shuffle=False, **_dl_common)



✅ Device: cuda
Rows: 75000
Train=60000 | Valid=15000 | Clip=[0.63,329.99]


In [None]:
# ----------------- Model -----------------
class TextEncoder(nn.Module):
    """Transformer text encoder producing one mean-pooled vector per input.

    Attributes:
        tx: The pretrained `AutoModel` backbone.
        out_dim: Size of the pooled vector (the backbone's `hidden_size`).
    """

    def __init__(self, model_name=TEXT_MODEL):
        super().__init__()
        self.tx = AutoModel.from_pretrained(model_name)
        self.out_dim = self.tx.config.hidden_size

    def forward(self, ids, mask):
        """Encode a batch of token ids.

        Args:
            ids: Token id tensor passed to the backbone as `input_ids`.
            mask: Attention mask (1 = real token, 0 = padding) with the same
                leading two dimensions as `ids`.

        Returns:
            Tensor of shape (batch, out_dim): the average of the last hidden
            states over non-padding positions only.
        """
        out = self.tx(input_ids=ids,attention_mask=mask,output_hidden_states=False)
        hidden = out.last_hidden_state
        # Inputs are padded to a fixed length, so a plain mean over the sequence
        # axis would mix padding-position states into the embedding. Weight each
        # position by the attention mask instead.
        # NOTE: this changes the pooled features relative to an unmasked mean;
        # checkpoints trained with the old pooling should be retrained.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * weights).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / counts

class SwinTinyEncoder(nn.Module):
    """ImageNet-pretrained Swin-Tiny backbone followed by a small projection head.

    The backbone's classification layer is replaced by an identity, so it emits
    pooled features; `head` then maps them to `out_dim` (Linear -> ReLU -> Dropout).
    """

    def __init__(self, out_dim=IMG_PROJ):
        super().__init__()
        swin = models.swin_t(weights=models.Swin_T_Weights.IMAGENET1K_V1)
        # Remember the classifier's input width before removing the classifier.
        feat_dim = swin.head.in_features
        swin.head = nn.Identity()
        self.backbone = swin
        self.head = nn.Sequential(
            nn.Linear(feat_dim, out_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
        )
        self.out_dim = out_dim

    def forward(self, x):
        features = self.backbone(x)
        return self.head(features)

class PriceRegressor(nn.Module):
    """Late-fusion regressor: text and image embeddings are concatenated and
    passed through a three-layer MLP that outputs one scalar per sample."""

    def __init__(self):
        super().__init__()
        self.txt = TextEncoder()
        self.img = SwinTinyEncoder()
        # Width of the concatenated [text | image] vector.
        fused = self.txt.out_dim + self.img.out_dim
        self.mlp = nn.Sequential(
            nn.Linear(fused, MLP_HID),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(MLP_HID, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1),
        )

    def forward(self, ids, mask, img):
        text_vec = self.txt(ids, mask)
        image_vec = self.img(img)
        joint = torch.cat([text_vec, image_vec], 1)
        # Drop the trailing size-1 axis so the output is one value per sample.
        return self.mlp(joint).squeeze(1)

# Build the network on the target device.
model = PriceRegressor().to(DEVICE)
# One AdamW group over every parameter (both backbones and the MLP).
opt = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
# Loss scaling is only active when running on CUDA.
scaler = torch.cuda.amp.GradScaler(enabled=(DEVICE == "cuda"))
# Smooth L1 (Huber-style) loss on the log-price target.
loss_fn = nn.SmoothL1Loss(beta=1.0)

# ----------------- Metrics -----------------
def smape_np(y_true, y_pred, eps=1e-8):
    """Symmetric absolute percentage error, computed from log1p-space inputs.

    Both arguments are mapped back with `expm1` before scoring. The score is
    100 * mean(|pred - true| / (|true| + |pred| + eps)); note the denominator
    is the plain sum (no division by 2), so the result lies in [0, 100].
    """
    actual = np.expm1(y_true)
    forecast = np.expm1(y_pred)
    abs_err = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps
    return 100 * np.mean(abs_err / scale)

@torch.no_grad()
def evaluate():
    """Score the global `model` on the validation loader `dl_va`.

    Puts the model in eval mode (callers that continue training must switch
    back with `model.train()`).

    Returns:
        (smape, yt, yp): the `smape_np` score, the true prices, and the
        predicted prices clipped to [lo_clip, hi_clip]. Both arrays are in
        price space, not log space.
    """
    model.eval()
    preds, trues = [], []
    for ids, mask, img, y in dl_va:
        ids, mask, img = ids.to(DEVICE), mask.to(DEVICE), img.to(DEVICE)
        with torch.cuda.amp.autocast(enabled=(DEVICE=="cuda")):
            pred = model(ids, mask, img)
        # Under CUDA autocast the output can be float16. Promote to float32
        # before handing it to NumPy so expm1, clipping and the metrics are not
        # computed at half precision.
        preds.append(pred.float().cpu().numpy())
        trues.append(y.numpy())
    yp = np.concatenate(preds)
    yt = np.concatenate(trues)
    # Back to price space; predictions are limited to the training-fold range.
    yp = np.clip(np.expm1(yp), lo_clip, hi_clip)
    yt = np.expm1(yt)
    # smape_np expects log1p-space inputs, hence the re-transformation.
    return smape_np(np.log1p(yt), np.log1p(yp)), yt, yp

# ----------------- Train -----------------
# `best` tracks the lowest validation SMAPE so far; `pat` counts consecutive
# epochs that failed to beat it.
best, pat = 1e9, 0
for ep in range(1, EPOCHS + 1):
    model.train()
    total = 0
    n = 0
    t0 = time.time()
    for ids, mask, img, y in tqdm(dl_tr, desc=f"Epoch {ep:02d}"):
        ids = ids.to(DEVICE)
        mask = mask.to(DEVICE)
        img = img.to(DEVICE)
        y = y.to(DEVICE)

        opt.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=(DEVICE == "cuda")):
            pred = model(ids, mask, img)
            loss = loss_fn(pred, y)

        # Scaled backward pass, optimizer step, then scale-factor update.
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()

        # Accumulate a sample-weighted loss so the epoch average is exact even
        # when the last batch is smaller.
        total += loss.item() * len(y)
        n += len(y)

    sm, yt, yp = evaluate()
    print(f"Epoch {ep:02d} | TrainLoss {total/max(1,n):.4f} | Val SMAPE {sm:.2f} | {time.time()-t0:.1f}s")

    # An epoch counts as an improvement only if it beats the best by > 1e-3.
    if sm < best - 1e-3:
        best, pat = sm, 0
        os.makedirs("artifacts", exist_ok=True)
        torch.save(model.state_dict(), "artifacts/best_multimodal.pt")
        continue

    pat += 1
    if pat >= EARLY_STOP:
        print("Early stopping.")
        break

# ----------------- Save artifacts -----------------
# The training loop only creates `artifacts/` when an epoch improves; create it
# here as well so writing the config cannot fail on a missing directory.
os.makedirs("artifacts", exist_ok=True)
# Persist the settings needed to rebuild the model and post-process predictions.
with open("artifacts/config.json", "w", encoding="utf-8") as f:
    json.dump({
        "text_model":TEXT_MODEL,"img_proj":IMG_PROJ,"mlp_hid":MLP_HID,
        "max_len":MAX_LEN,"img_size":IMG_SIZE,
        "lo_clip":lo_clip,"hi_clip":hi_clip
    },f,indent=2)
print("\n✅ Artifacts saved: best_multimodal.pt + config.json")

# ----------------- Final Eval -----------------
# Restore the best checkpoint written during training and re-score it.
state = torch.load("artifacts/best_multimodal.pt", map_location=DEVICE)
model.load_state_dict(state)
model.eval()

sm, yt, yp = evaluate()
# MAE / RMSE are computed on the price-space arrays returned by evaluate().
mae = mean_absolute_error(yt, yp)
rmse = np.sqrt(mean_squared_error(yt, yp))
print(f"\n=== Validation Summary ===\nSMAPE={sm:.2f} | MAE={mae:.2f} | RMSE={rmse:.2f}")


Epoch 01: 100%|██████████| 625/625 [36:52<00:00,  3.54s/it]


Epoch 01 | TrainLoss 0.3406 | Val SMAPE 29.01 | 2766.3s


Epoch 02: 100%|██████████| 625/625 [09:35<00:00,  1.09it/s]


Epoch 02 | TrainLoss 0.2672 | Val SMAPE 29.05 | 719.8s


Epoch 03: 100%|██████████| 625/625 [09:26<00:00,  1.10it/s]


Epoch 03 | TrainLoss 0.2262 | Val SMAPE 26.57 | 709.0s


Epoch 04: 100%|██████████| 625/625 [09:30<00:00,  1.09it/s]


Epoch 04 | TrainLoss 0.2045 | Val SMAPE 27.18 | 711.5s


Epoch 05: 100%|██████████| 625/625 [09:28<00:00,  1.10it/s]


Epoch 05 | TrainLoss 0.1820 | Val SMAPE 26.06 | 712.3s


Epoch 06: 100%|██████████| 625/625 [09:16<00:00,  1.12it/s]


Epoch 06 | TrainLoss 0.1626 | Val SMAPE 26.31 | 693.2s

✅ Artifacts saved: best_multimodal.pt + config.json

=== Validation Summary ===
SMAPE=26.06 | MAE=11.55 | RMSE=23.68
