In [None]:
# ================================================================
# FEATURE-EXTRACTOR PIPELINE: (Frozen) Transformer Text + Vision → MLP
# - Fast & stable: extract embeddings, train small MLP on log(price)
# - Exports: emb_mlp.pt, embedder_config.json, sample_code.py
# - Expects train.csv in the working dir; sample_code.py reads dataset/test.csv; image_link is a URL
# ================================================================
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip -q install transformers pandas scikit-learn pillow tqdm

import os, io, re, json, time, random, hashlib, pathlib, urllib.request, warnings
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd
from tqdm import tqdm
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import AutoTokenizer, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# ----------------- Repro & perf -----------------
SEED = 42

# Seed every CPU-side RNG used below (Python, NumPy, torch) with the same value.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

_has_gpu = torch.cuda.is_available()
DEVICE = "cuda" if _has_gpu else "cpu"
if _has_gpu:
    # Seed all CUDA generators as well.
    torch.cuda.manual_seed_all(SEED)
    # Speed knobs: cuDNN autotuning and TF32 matmuls.
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True

print("✅ Device:", DEVICE)

# ----------------- Choose encoders -----------------
# Hugging Face id of the frozen text encoder (alt: "microsoft/deberta-v3-base").
TEXT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# Key selecting the frozen torchvision backbone (alt: "convnext_tiny").
VISION_BACKBONE = "swin_tiny"

# --- input sizes ---
MAX_LEN = 128     # token length every catalog_content is padded/truncated to
IMG_SIZE = 224    # images are resized to IMG_SIZE x IMG_SIZE
BATCH = 32 if DEVICE != "cuda" else 128   # embedding-extraction batch size

# --- embedding / regressor dimensions ---
EMB_TXT = None    # filled in from the text model's config once it is loaded
EMB_IMG = 384     # output width of the image projection head
MLP_HID = 512     # width of the first hidden layer of the regressor

# --- optimisation ---
LR = 3e-4
EPOCHS = 8
EARLY_STOP = 2    # epochs without improvement tolerated before stopping
WEIGHT_DECAY = 1e-4

# On-disk cache for downloaded images, created next to the notebook.
CACHE = pathlib.Path("_img_cache")
CACHE.mkdir(exist_ok=True)

# ----------------- Data -----------------
# train.csv must be in the working directory; on Colab an upload prompt is offered.
if not os.path.exists("train.csv"):
    try:
        from google.colab import files
        print("Please upload train.csv")
        uploaded = files.upload()
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if "train.csv" not in uploaded:
            raise KeyError("train.csv")
    except Exception as err:
        # Keep the original exception type/message, but chain the real cause.
        raise FileNotFoundError("Place train.csv in working dir.") from err

df = pd.read_csv("train.csv")
df.columns = [c.lower().strip() for c in df.columns]
need = {"sample_id","catalog_content","image_link","price"}
miss = need - set(df.columns)
if miss: raise ValueError(f"train.csv missing columns: {sorted(miss)}")

df["price"] = pd.to_numeric(df["price"], errors="coerce")
# Drop missing values BEFORE casting text to str: astype(str) turns NaN into the
# literal string "nan", after which dropna() can no longer see the missing rows.
df = df.dropna(subset=["catalog_content","image_link","price"])
df = df[df["price"] > 0].reset_index(drop=True)
# regex=False: "\x00" is a literal NUL character, not a pattern.
df["catalog_content"] = (
    df["catalog_content"].astype(str).str.replace("\x00", " ", regex=False).str.strip()
)

# Stratify the 80/20 split on price quantile bins so both splits cover the price range.
bins = pd.qcut(df["price"], q=40, duplicates="drop", labels=False)
df_tr, df_va = train_test_split(df, test_size=0.2, random_state=SEED, stratify=bins)
# Own the split frames so the column assignments below do not write into slices of df.
df_tr, df_va = df_tr.copy(), df_va.copy()

# Clip bounds for predicted prices: 0.1% / 99.9% quantiles of the training prices.
lo_clip, hi_clip = float(np.quantile(df_tr["price"],0.001)), float(np.quantile(df_tr["price"],0.999))
# Regression target is log(1 + price).
df_tr["log_price"] = np.log1p(df_tr["price"])
df_va["log_price"] = np.log1p(df_va["price"])
print(f"Train={len(df_tr)} | Valid={len(df_va)} | Clip=[{lo_clip:.2f},{hi_clip:.2f}]")

# ----------------- Text tokenizer/encoder (frozen) -----------------
tok = AutoTokenizer.from_pretrained(TEXT_MODEL)

txt_model = AutoModel.from_pretrained(TEXT_MODEL)
txt_model = txt_model.to(DEVICE)
txt_model.eval()
# The encoder is used purely as a feature extractor: no gradients for any parameter.
txt_model.requires_grad_(False)

# Text embedding width is read from the loaded model's config.
EMB_TXT = txt_model.config.hidden_size

# ----------------- Vision backbone (frozen) -----------------
class Wrapper(nn.Module):
    """Thin module that forwards its input through the wrapped body."""
    def __init__(self, body):
        super().__init__()
        self.body = body

    def forward(self, x):
        return self.body(x)


def _build_swin_tiny():
    """Return (headless Swin-T, feature width); the classifier head becomes Identity."""
    body = models.swin_t(weights=models.Swin_T_Weights.IMAGENET1K_V1)
    width = body.head.in_features
    body.head = nn.Identity()
    return body, width


def _build_convnext_tiny():
    """Return (ConvNeXt-T features + avgpool + flatten, feature width)."""
    full = models.convnext_tiny(weights=models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1)
    width = full.classifier[2].in_features
    trunk = nn.Sequential(full.features, full.avgpool, nn.Flatten())
    return Wrapper(trunk), width


_BACKBONE_BUILDERS = {
    "swin_tiny": _build_swin_tiny,
    "convnext_tiny": _build_convnext_tiny,
}
if VISION_BACKBONE not in _BACKBONE_BUILDERS:
    raise ValueError("Unsupported VISION_BACKBONE")
net, in_feat = _BACKBONE_BUILDERS[VISION_BACKBONE]()

# Projection from backbone features to EMB_IMG dims. It is created here with its
# default (random) initialisation and kept in eval mode, so the Dropout is inactive.
img_head = nn.Sequential(
    nn.Linear(in_feat, EMB_IMG),
    nn.ReLU(True),
    nn.Dropout(0.2),
)
img_head = img_head.to(DEVICE).eval()

net = net.to(DEVICE).eval()
# Backbone parameters never receive gradients.
net.requires_grad_(False)

# Per-channel normalisation statistics applied after ToTensor().
_NORM_MEAN = [0.485,0.456,0.406]
_NORM_STD = [0.229,0.224,0.225]

# Image preprocessing: fixed square resize -> tensor -> channel-wise normalisation.
img_tfms = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(_NORM_MEAN, _NORM_STD),
    ]
)

def url_to_path(u):
    """Return the cache file for URL ``u``: ``CACHE/<md5 hex digest of u>.jpg``."""
    digest = hashlib.md5(u.encode()).hexdigest()
    return CACHE / (digest + ".jpg")
def fetch_image(u, timeout=8):
    """Return the image at URL ``u`` as an RGB PIL image, using the on-disk cache.

    Lookup order:
      1. the cached JPEG at ``url_to_path(u)``; an unreadable cache file is deleted;
      2. an HTTP download (``timeout`` seconds), which is then written to the cache;
      3. a white ``IMG_SIZE x IMG_SIZE`` placeholder if the download or decode fails.

    The function is best-effort and does not raise for network/decoding problems,
    but it no longer swallows ``KeyboardInterrupt``/``SystemExit`` the way a bare
    ``except:`` does, so a long extraction run can still be interrupted.
    """
    p = url_to_path(u)
    if p.exists():
        try:
            # Context manager closes the file handle; convert() returns a loaded copy.
            with Image.open(p) as cached:
                return cached.convert("RGB")
        except Exception:
            # Corrupt cache entry: remove it and fall through to a fresh download.
            try:
                p.unlink(missing_ok=True)
            except OSError:
                pass
    try:
        req = urllib.request.Request(u, headers={"User-Agent":"Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            b = r.read()
        img = Image.open(io.BytesIO(b)).convert("RGB")
    except Exception:
        return Image.fromarray(np.full((IMG_SIZE,IMG_SIZE,3),255,np.uint8), "RGB")

    # Caching is optional: a failed write must not discard a good download.
    # Write to a per-process temp file and rename it into place, so a concurrent
    # DataLoader worker never reads a half-written JPEG.
    tmp = p.with_name(f"{p.name}.{os.getpid()}.tmp")
    try:
        img.save(tmp, format="JPEG", quality=85)
        os.replace(tmp, p)
    except Exception:
        try:
            tmp.unlink(missing_ok=True)
        except OSError:
            pass
    return img

# ----------------- Embedding dataset -----------------
class EmbedDS(Dataset):
    """Row-wise dataset yielding ``(input_ids, attention_mask, image, log_price)``.

    The frame is re-indexed from 0 on construction, so positional index ``i``
    refers to the i-th row of the frame that was passed in.
    """

    def __init__(self, frame):
        self.f = frame.reset_index(drop=True)

    def __len__(self):
        return len(self.f)

    def __getitem__(self, i):
        row = self.f.iloc[i]

        # Tokenise to a fixed MAX_LEN (padded / truncated), then drop the batch axis.
        enc = tok(
            row["catalog_content"],
            max_length=MAX_LEN,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        ids = enc["input_ids"].squeeze(0)
        mask = enc["attention_mask"].squeeze(0)

        # Download (or read from cache) and preprocess the product image.
        img = img_tfms(fetch_image(row["image_link"]))

        # Regression target as a float32 scalar tensor.
        y = torch.tensor(float(row["log_price"]), dtype=torch.float32)
        return ids, mask, img, y

# Both loaders share the same settings. shuffle=False keeps the extracted rows in
# the same order as the source frames.
_loader_kwargs = dict(batch_size=BATCH, shuffle=False, num_workers=4, pin_memory=True)
dl_tr = DataLoader(EmbedDS(df_tr), **_loader_kwargs)
dl_va = DataLoader(EmbedDS(df_va), **_loader_kwargs)

@torch.no_grad()
def extract_embeddings(dloader):
    """Run the frozen encoders over ``dloader`` and collect features as NumPy arrays.

    Each batch is expected to be ``(input_ids, attention_mask, image, target)``.

    Returns a 3-tuple ``(text_embeddings, image_embeddings, targets)``, each the
    concatenation of the per-batch results in loader order.

    Note: the text embedding is a plain mean of ``last_hidden_state`` over the
    sequence axis; the attention mask is passed to the model but not used for
    pooling, so padded positions are included in the average.
    """
    collected = {"txt": [], "img": [], "y": []}
    for ids, mask, img, y in tqdm(dloader, desc="Extracting"):
        ids = ids.to(DEVICE)
        mask = mask.to(DEVICE)
        img = img.to(DEVICE)

        # text: mean over the token axis -> [B, hidden]
        hidden = txt_model(input_ids=ids, attention_mask=mask).last_hidden_state
        txt_emb = hidden.mean(1)

        # vision: backbone features -> projection head (same path for every backbone)
        vis_emb = img_head(net(img))

        collected["txt"].append(txt_emb.cpu().numpy())
        collected["img"].append(vis_emb.cpu().numpy())
        collected["y"].append(y.numpy())

    return tuple(np.concatenate(collected[key]) for key in ("txt", "img", "y"))

# Train split: extract, then place text and image features side by side.
Xtr_txt, Xtr_img, ytr = extract_embeddings(dl_tr)
Xtr = np.concatenate((Xtr_txt, Xtr_img), axis=1)

# Validation split: same treatment.
Xva_txt, Xva_img, yva = extract_embeddings(dl_va)
Xva = np.concatenate((Xva_txt, Xva_img), axis=1)

print("Embeddings:", Xtr.shape, Xva.shape)

# ----------------- MLP regressor on embeddings -----------------
class EmbMLP(nn.Module):
    """Three-layer MLP regressor: in_dim -> hid -> 128 -> 1.

    Each hidden layer is Linear + in-place ReLU + Dropout(0.2). ``forward``
    squeezes the trailing unit dimension, returning one value per input row.
    """

    def __init__(self, in_dim, hid=MLP_HID):
        super().__init__()
        widths = (in_dim, hid, 128)
        layers = []
        # Hidden blocks are built in order, so Sequential indices are 0..5.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.extend([nn.Linear(n_in, n_out), nn.ReLU(True), nn.Dropout(0.2)])
        # Output layer sits at Sequential index 6.
        layers.append(nn.Linear(128, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(1)

# Regressor input width = number of columns of the fused embedding matrix.
in_dim = Xtr.shape[1]

mlp = EmbMLP(in_dim)
mlp = mlp.to(DEVICE)

# Only the MLP's parameters are optimised.
opt = torch.optim.AdamW(
    mlp.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
loss_fn = nn.SmoothL1Loss(beta=1.0)

def smape(y_true, y_pred, eps=1e-8):
    """Symmetric absolute percentage error between log1p-space targets and predictions.

    Both inputs are mapped back with ``expm1`` first. The score is
    ``100 * mean(|pred - true| / (|true| + |pred| + eps))``; there is no factor
    of 2 in the numerator, so the value lies in [0, 100].
    """
    actual = np.expm1(y_true)
    forecast = np.expm1(y_pred)
    abs_err = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps
    return 100 * np.mean(abs_err / scale)

# best = lowest validation score seen so far; pat = epochs since it last improved.
best, pat = 1e9, 0
for ep in range(1, EPOCHS + 1):
    mlp.train()
    t0 = time.time()

    # ---- one pass over the training embeddings in shuffled chunks of `tb` rows ----
    tb = 4096
    perm = np.random.permutation(len(Xtr))
    for idx in (perm[s:s + tb] for s in range(0, len(Xtr), tb)):
        xb = torch.from_numpy(Xtr[idx]).float().to(DEVICE)
        yb = torch.from_numpy(ytr[idx]).float().to(DEVICE)
        opt.zero_grad(set_to_none=True)
        loss = loss_fn(mlp(xb), yb)
        loss.backward()
        opt.step()

    # ---- validation: predict in chunks, score in price space ----
    mlp.eval()
    with torch.no_grad():
        yp = np.concatenate([
            mlp(torch.from_numpy(Xva[s:s + tb]).float().to(DEVICE)).cpu().numpy()
            for s in range(0, len(Xva), tb)
        ])
    sm = smape(yva, yp)
    print(f"Epoch {ep:02d} | Val SMAPE {sm:.2f} | {time.time()-t0:.1f}s")

    # ---- early stopping ----
    if not (sm < best - 1e-3):
        pat += 1
        if pat >= EARLY_STOP:
            print("Early stopping.")
            break
        continue

    # Improvement: reset patience and checkpoint. The file on disk therefore holds
    # the best epoch, while `mlp` in memory holds the most recent one.
    best, pat = sm, 0
    os.makedirs("artifacts", exist_ok=True)
    torch.save(mlp.state_dict(), "artifacts/emb_mlp.pt")

# ----------------- Save embedder config -----------------
# The checkpoint directory is normally created by the training loop; make sure it
# exists even if no epoch produced a checkpoint.
os.makedirs("artifacts", exist_ok=True)

# The image projection head is a randomly initialised Linear layer that produced
# the image half of every training embedding. It MUST be shipped with the MLP:
# a freshly constructed head at inference time has different weights, so the MLP
# would receive image features unrelated to the ones it was trained on.
torch.save(img_head.state_dict(), "artifacts/img_head.pt")

cfg = {
    "text_model": TEXT_MODEL,
    "vision_backbone": VISION_BACKBONE,
    "max_len": MAX_LEN,
    "img_size": IMG_SIZE,
    "emb_txt": EMB_TXT,
    "emb_img": EMB_IMG,
    "mlp_hid": MLP_HID,
    "lo_clip": float(lo_clip),
    "hi_clip": float(hi_clip),
    "img_head_ckpt": "img_head.pt",   # file name inside artifacts/
}
with open("artifacts/embedder_config.json","w") as f: json.dump(cfg, f, indent=2)
print("\n✅ Saved: artifacts/emb_mlp.pt & artifacts/embedder_config.json")
print("✅ Saved: artifacts/img_head.pt")

# ----------------- sample_code.py (inference) -----------------
# NOTE: this is a RAW string, so escapes are written to the file verbatim.
# "\x00" below therefore reaches sample_code.py as the escape sequence for NUL,
# matching the cleaning applied to the training text.
sample_code = r"""
import os, io, json, hashlib, pathlib, warnings, urllib.request, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import torch, torch.nn as nn
from torchvision import models, transforms
from transformers import AutoTokenizer, AutoModel

DEVICE="cpu"  # evaluator-safe
ART_DIR="artifacts"
with open(os.path.join(ART_DIR,"embedder_config.json")) as _f:
    CFG = json.load(_f)
CKPT = os.path.join(ART_DIR,"emb_mlp.pt")
HEAD_CKPT = os.path.join(ART_DIR, CFG.get("img_head_ckpt", "img_head.pt"))

# --- text/vision encoders (frozen) ---
tok = AutoTokenizer.from_pretrained(CFG["text_model"])
txt_model = AutoModel.from_pretrained(CFG["text_model"]).to(DEVICE).eval()
for p in txt_model.parameters(): p.requires_grad=False

if CFG["vision_backbone"]=="swin_tiny":
    net = models.swin_t(weights=models.Swin_T_Weights.IMAGENET1K_V1)
    in_feat = net.head.in_features
    net.head = nn.Identity()
    net = net.to(DEVICE).eval()
elif CFG["vision_backbone"]=="convnext_tiny":
    net = models.convnext_tiny(weights=models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1)
    in_feat = net.classifier[2].in_features
    headless = nn.Sequential(net.features, net.avgpool, nn.Flatten())
    class Wrap(nn.Module):
        def __init__(self,b): super().__init__(); self.b=b
        def forward(self,x): return self.b(x)
    net = Wrap(headless).to(DEVICE).eval()
else:
    raise ValueError("bad backbone")

# Image projection head: same architecture as in training, and the SAME weights.
# A freshly initialised head would feed the MLP features it was never trained on.
head = nn.Sequential(nn.Linear(in_feat, CFG["emb_img"]), nn.ReLU(True), nn.Dropout(0.2))
if not os.path.exists(HEAD_CKPT):
    raise FileNotFoundError(
        "Missing " + HEAD_CKPT + ": the image projection head used to build the "
        "training embeddings is required for inference."
    )
head.load_state_dict(torch.load(HEAD_CKPT, map_location=DEVICE), strict=True)
head = head.to(DEVICE).eval()

for p in net.parameters(): p.requires_grad=False
for p in head.parameters(): p.requires_grad=False

# --- MLP ---
class EmbMLP(nn.Module):
    def __init__(self, in_dim, hid):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hid), nn.ReLU(True), nn.Dropout(0.2),
            nn.Linear(hid, 128),    nn.ReLU(True), nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
    def forward(self,x): return self.net(x).squeeze(1)

in_dim = CFG["emb_txt"] + CFG["emb_img"]
mlp = EmbMLP(in_dim, CFG["mlp_hid"]).to(DEVICE).eval()
mlp.load_state_dict(torch.load(CKPT, map_location=DEVICE), strict=True)

# --- image utils ---
IMG_SIZE = int(CFG["img_size"])
_CACHE_DIR = pathlib.Path("_img_cache")
_CACHE_DIR.mkdir(exist_ok=True)

def _u2p(u):
    return _CACHE_DIR/(hashlib.md5(u.encode()).hexdigest()+".jpg")

def _fetch(u, timeout=8):
    # Cached copy -> download -> white placeholder, in that order.
    p = _u2p(u)
    if os.path.exists(p):
        try:
            with Image.open(p) as cached:
                return cached.convert("RGB")
        except Exception:
            try: os.remove(p)
            except OSError: pass
    try:
        req = urllib.request.Request(u, headers={"User-Agent":"Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as r: b=r.read()
        img = Image.open(io.BytesIO(b)).convert("RGB")
    except Exception:
        return Image.fromarray(np.full((IMG_SIZE,IMG_SIZE,3),255,np.uint8),"RGB")
    # Caching is best-effort: a failed write must not discard a good download.
    try:
        img.save(p, format="JPEG", quality=85)
    except Exception:
        pass
    return img

tfm = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])

def _embed_text(text):
    # "\x00" is the NUL character here, the same cleaning as at training time.
    t = tok(str(text).replace("\x00"," ").strip(), max_length=int(CFG["max_len"]),
            padding="max_length", truncation=True, return_tensors="pt")
    with torch.no_grad():
        out = txt_model(input_ids=t["input_ids"].to(DEVICE),
                        attention_mask=t["attention_mask"].to(DEVICE))
        return out.last_hidden_state.mean(1).cpu().numpy()  # [1, emb_txt]

def _embed_image(url):
    img = tfm(_fetch(str(url))).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        feat = net(img)
        return head(feat).cpu().numpy()                      # [1, emb_img]

def predictor(sample_id, catalog_content, image_link):
    te = _embed_text(catalog_content)
    ie = _embed_image(image_link)
    x = np.concatenate([te, ie], axis=1)                    # [1, emb_txt+emb_img]
    with torch.no_grad():
        pred_log = mlp(torch.from_numpy(x).float().to(DEVICE)).cpu().numpy()[0]
    price = float(np.expm1(pred_log))
    price = float(np.clip(price, CFG["lo_clip"], CFG["hi_clip"]))
    return round(price, 2)

if __name__ == "__main__":
    DATASET_FOLDER = "dataset"
    test_path = os.path.join(DATASET_FOLDER, "test.csv")
    if not os.path.exists(test_path):
        raise FileNotFoundError("Place test.csv at dataset/test.csv")
    test = pd.read_csv(test_path)
    # Same header normalisation as applied to train.csv.
    test.columns = [c.lower().strip() for c in test.columns]
    for c in ["sample_id","catalog_content","image_link"]:
        if c not in test.columns: raise ValueError(f"test.csv missing column: {c}")
    test["price"] = test.apply(lambda r: predictor(r["sample_id"], r["catalog_content"], r["image_link"]), axis=1)
    out = test[["sample_id","price"]]
    out_path = os.path.join(DATASET_FOLDER,"test_out.csv")
    out.to_csv(out_path, index=False)
    print(f"Predictions saved to {out_path}")
"""

with open("sample_code.py","w") as f:
    f.write(sample_code)

print("\n✅ Wrote sample_code.py. In Colab terminal you can run:")
print("!python sample_code.py   # (expects dataset/test.csv)")


✅ Device: cuda
Train=60000 | Valid=15000 | Clip=[0.63,329.99]


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/swin_t-704ceda3.pth" to /root/.cache/torch/hub/checkpoints/swin_t-704ceda3.pth


100%|██████████| 108M/108M [00:00<00:00, 245MB/s] 
Extracting: 100%|██████████| 469/469 [20:34<00:00,  2.63s/it]
Extracting: 100%|██████████| 118/118 [05:27<00:00,  2.77s/it]


Embeddings: (60000, 768) (15000, 768)
Epoch 01 | Val SMAPE 44.88 | 0.6s
Epoch 02 | Val SMAPE 38.08 | 0.1s
Epoch 03 | Val SMAPE 36.10 | 0.1s
Epoch 04 | Val SMAPE 34.93 | 0.1s
Epoch 05 | Val SMAPE 34.04 | 0.1s
Epoch 06 | Val SMAPE 33.50 | 0.1s
Epoch 07 | Val SMAPE 33.00 | 0.1s
Epoch 08 | Val SMAPE 32.55 | 0.1s

✅ Saved: artifacts/emb_mlp.pt & artifacts/embedder_config.json

✅ Wrote sample_code.py. In Colab terminal you can run:
!python sample_code.py   # (expects dataset/test.csv)


In [None]:
!python sample_code.py


2025-10-13 17:29:31.611448: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760376571.634091    9813 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760376571.640924    9813 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1760376571.657687    9813 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760376571.657719    9813 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1760376571.657723    9813 computation_placer.cc:177] computation placer alr