In [None]:
# ================================================================
# Multimodal Price Predictor (Text: GRU + Image: ResNet18 → MLP)
# Colab-ready: trains, validates, saves artifacts, and infers test.csv
# ================================================================

# ------------------------------
# 1) Minimal installs (Colab)
# ------------------------------
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip -q install numpy pandas scikit-learn pillow tqdm

# ------------------------------
# 2) Imports & global config
# ------------------------------
import os, re, io, time, random, urllib.request, hashlib, pathlib, warnings, math, json, pickle
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Reproducibility: fix the seed, pick the compute device, then seed every RNG
# the pipeline uses (PyTorch CPU, NumPy, Python's `random`, and all CUDA
# devices when a GPU is present).
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("Device:", DEVICE)

# ------------------------------
# 3) Hyperparameters
# ------------------------------
# Text
MAX_VOCAB   = 80_000   # upper bound on vocabulary size, incl. <pad>/<unk>
SEQ_LEN     = 160      # tokens kept per sample (truncate / right-pad)
EMB_DIM     = 192      # token embedding width
HID_DIM     = 256      # GRU hidden size per direction

# Image
IMG_SIZE    = 224      # images are resized to IMG_SIZE x IMG_SIZE
CNN_OUT     = 256      # width of the projected image feature

# Fusion head
# NOTE(review): MLP_HID is read by FusionRegressor and by the saved config but
# was never defined in this cell, which raises NameError at model construction.
# 512 is an assumed value - confirm it against the run that produced any
# existing checkpoint, since a different width will not load.
MLP_HID     = 512      # width of the first fusion-MLP layer

# Training
BASE_BATCH_GPU = 64
BASE_BATCH_CPU = 8
BATCH       = BASE_BATCH_GPU if DEVICE == "cuda" else BASE_BATCH_CPU
ACCUM_STEPS = 4 if DEVICE == "cuda" else 1   # micro-batches per optimizer step
EPOCHS      = 10
LR          = 3e-4
WEIGHT_DECAY= 1e-4
EARLY_STOP  = 3        # epochs without validation improvement before stopping
DROPOUT     = 0.2

# DataLoader perf
NUM_WORKERS = 4 if DEVICE == "cuda" else 2
PIN_MEMORY  = (DEVICE == "cuda")
PERSISTENT  = (DEVICE == "cuda")
PREFETCH    = 2

# ------------------------------
# 4) Metrics & text utils
# ------------------------------
def smape(y_true, y_pred, eps=1e-8):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|pred - true| / (|true| + |pred| + eps)``; the denominator
    is the plain sum (not halved), so the result lies in the 0-100 range.
    ``eps`` keeps the ratio finite when both values are zero.
    """
    actual = np.asarray(y_true, float)
    forecast = np.asarray(y_pred, float)
    abs_err = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps
    return 100 * np.mean(abs_err / scale)

def smape_calibrate(y_true, y_pred, lo=0.7, hi=1.3, n=121):
    """Grid-search a multiplicative scale for ``y_pred`` that minimises SMAPE.

    Tries ``n`` evenly spaced scale factors in ``[lo, hi]`` and returns
    ``(best_scale, best_smape)``. The first factor reaching the minimum wins;
    if no candidate scores below the 1e9 sentinel the result is ``(1.0, 1e9)``.
    ``y_pred`` must support multiplication by a float (e.g. a NumPy array).
    """
    best = (1.0, 1e9)
    for factor in np.linspace(lo, hi, n):
        score = smape(y_true, y_pred * factor)
        if score < best[1]:
            best = (float(factor), float(score))
    return best

# Token pattern; alternatives are tried left to right at every position:
#   1. decimal numbers ("3.5") - must come first, otherwise the alphanumeric
#      branch consumes the leading digits and the number is split into
#      "3", ".", "5" (the old trailing numeric branch was unreachable);
#   2. alphanumeric words, optionally joined once by ' or - ("don't", "x-ray",
#      "12oz");
#   3. any other single non-whitespace character.
# NOTE: this changes tokenisation of decimals, so vocabularies built with the
# previous pattern should be rebuilt together with the model.
_tok_re = re.compile(
    r"\d+\.\d+"
    r"|[A-Za-z0-9]+(?:['\-][A-Za-z0-9]+)?"
    r"|[^\s]"
)

def tokenize(text):
    """Lower-case ``str(text)`` and split it into a list of string tokens."""
    return _tok_re.findall(str(text).lower())

def build_vocab(texts, max_size=MAX_VOCAB, min_freq=2):
    """Build a frequency-ranked vocabulary from an iterable of texts.

    Index 0 is ``<pad>`` and index 1 is ``<unk>``. The remaining slots go to
    the most frequent tokens, keeping only those seen at least ``min_freq``
    times and never letting the vocabulary exceed ``max_size`` entries
    (specials included). Returns ``(stoi, itos)``.
    """
    freq = Counter(tok for doc in texts for tok in tokenize(doc))

    specials = ["<pad>", "<unk>"]
    room = max(max_size - len(specials), 0)
    # most_common() is sorted by descending count, so filtering on the
    # threshold keeps exactly the leading run of sufficiently frequent tokens.
    frequent = [tok for tok, count in freq.most_common() if count >= min_freq]

    itos = specials + frequent[:room]
    stoi = {tok: idx for idx, tok in enumerate(itos)}
    return stoi, itos

def encode_text_ids(text, stoi, seq_len=SEQ_LEN):
    """Encode ``text`` as a fixed-length int64 array of vocabulary ids.

    Tokens missing from ``stoi`` map to ``<unk>``; the sequence is truncated
    to ``seq_len`` tokens and right-padded with ``<pad>``.
    """
    pad_id, unk_id = stoi["<pad>"], stoi["<unk>"]
    tokens = tokenize(text)[:seq_len]

    # Start from an all-padding row and overwrite the leading positions.
    encoded = np.full(seq_len, pad_id, dtype=np.int64)
    if tokens:
        encoded[:len(tokens)] = [stoi.get(tok, unk_id) for tok in tokens]
    return encoded

# ------------------------------
# 5) Image helpers
# ------------------------------
# On-disk image cache, created next to the notebook on first run.
CACHE_DIR = pathlib.Path("./_img_cache")
CACHE_DIR.mkdir(exist_ok=True)

def url_to_path(u):
    """Map a URL (or any object, via ``str``) to its cache file path.

    The file name is the MD5 hex digest of the URL string plus ``.jpg``; the
    hash is only a stable file-name key here, not a security measure.
    """
    digest = hashlib.md5(str(u).encode()).hexdigest()
    return CACHE_DIR / f"{digest}.jpg"

def fetch_image(u):
    """Fetch image with on-disk cache. Returns RGB PIL.Image.

    Lookup order: cached file -> HTTP download (8 s timeout) -> white
    ``IMG_SIZE`` x ``IMG_SIZE`` placeholder when the download or decode fails.
    Never raises for a bad URL or a broken cache entry.

    The cache file is written to a per-process temp name and moved into place
    with ``os.replace``, so concurrent DataLoader workers never observe a
    half-written JPEG (which ``LOAD_TRUNCATED_IMAGES`` would silently accept).
    """
    p = url_to_path(u)
    if p.exists():
        try:
            # convert() yields a fully loaded copy, so the file handle can be
            # released as soon as the block exits.
            with Image.open(p) as cached:
                return cached.convert("RGB")
        except Exception:
            # Corrupt/unreadable cache entry: drop it and download again.
            p.unlink(missing_ok=True)

    try:
        req = urllib.request.Request(u, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=8) as r:
            b = r.read()
        img = Image.open(io.BytesIO(b)).convert("RGB")
    except Exception:
        # white placeholder if the URL fails (deliberate best-effort)
        return Image.new("RGB", (IMG_SIZE, IMG_SIZE), (255, 255, 255))

    # Caching is best-effort: a failed write must not discard a good download.
    tmp = p.with_name(f"{p.name}.{os.getpid()}.tmp")
    try:
        img.save(tmp, format="JPEG")   # explicit format: tmp name has no .jpg suffix
        os.replace(tmp, p)             # atomic on the same filesystem
    except Exception:
        tmp.unlink(missing_ok=True)
    return img

# Image preprocessing applied to every PIL image before it reaches the model:
# resize to a fixed IMG_SIZE x IMG_SIZE square, convert to a tensor, then
# normalise per channel. The mean/std values look like the standard ImageNet
# statistics, matching the ImageNet-pretrained ResNet18 weights loaded by
# ImageEncoder.
img_tfms = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std =[0.229, 0.224, 0.225]),
])

# ------------------------------
# 6) Load training data
# ------------------------------
# Colab-only upload widget; `uploaded` is keyed by file name and the first
# uploaded file is taken as the training CSV.
from google.colab import files
uploaded = files.upload()            # choose your train.csv
fname = list(uploaded.keys())[0]

df = pd.read_csv(fname)
# Fail fast if any column the pipeline reads is missing.
assert {"sample_id", "catalog_content", "image_link", "price"}.issubset(df.columns)

# Basic cleaning
# Text: force to str, replace NUL characters with a space, trim, lower-case.
# NOTE(review): astype(str) presumably turns missing text into the literal
# string "nan" - confirm that is acceptable.
df["catalog_content"] = (
    df["catalog_content"]
    .astype(str)
    .str.replace("\x00", " ")
    .str.strip()
    .str.lower()
)
# Prices that cannot be parsed are coerced to NaN and then set to 0.0, so such
# rows stay in the data with a zero target.
df["price"] = pd.to_numeric(df["price"], errors="coerce").fillna(0.0).astype(float)

# Stratified split by price bins
# Number of quantile bins: one per ~10 distinct prices, bounded to [5, 40].
q_bins = min(40, max(5, int(df["price"].nunique() // 10)))
bins = pd.qcut(df["price"], q=q_bins, duplicates="drop", labels=False)

# 80/20 train/validation split, stratified on the price bin.
df_tr, df_va = train_test_split(df, test_size=0.2, random_state=SEED, stratify=bins)
y_tr = df_tr["price"].values
y_va = df_va["price"].values

# 0.1% / 99.9% quantiles of the training prices; validation predictions are
# clipped to this range and both bounds are stored in the saved config.
lo_clip = float(np.quantile(y_tr, 0.001))
hi_clip = float(np.quantile(y_tr, 0.999))
print(f"Train={len(df_tr)} | Valid={len(df_va)} | Clip=[{lo_clip:.2f}, {hi_clip:.2f}]")

# Vocab on train only; pre-encode for speed
stoi, itos = build_vocab(df_tr["catalog_content"].tolist(), MAX_VOCAB)
PAD = stoi["<pad>"]  # padding id, also used as the embedding's padding_idx

# One row of SEQ_LEN token ids per sample, in the same row order as the frames.
df_tr_ids = np.vstack([encode_text_ids(t, stoi) for t in tqdm(df_tr["catalog_content"], desc="Encode train")])
df_va_ids = np.vstack([encode_text_ids(t, stoi) for t in tqdm(df_va["catalog_content"], desc="Encode valid")])

# ------------------------------
# 7) Dataset & DataLoaders
# ------------------------------
class PriceDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, image_tensor, price)`` triples.

    ``ids_array`` holds one pre-encoded token-id row per sample, and ``frame``
    supplies the ``image_link`` and ``price`` columns in the same row order.
    """

    def __init__(self, ids_array, frame):
        self.ids = ids_array
        self.f   = frame.reset_index(drop=True)
        # Pull the two columns read per sample out of the frame once:
        # frame.iloc[i] builds a whole row Series on every access, which is
        # needless work in the per-item hot path.
        self._links  = self.f["image_link"].tolist()
        self._prices = self.f["price"].to_numpy(dtype=np.float64)

    def __len__(self):
        return len(self.f)

    def __getitem__(self, i):
        txt_ids = torch.from_numpy(self.ids[i])
        # Image comes from the disk cache or the network, then is preprocessed.
        img     = img_tfms(fetch_image(self._links[i]))
        price   = torch.tensor(float(self._prices[i]), dtype=torch.float32)
        return txt_ids, img, price

# Loader options shared by both splits; only shuffling differs between them.
_loader_opts = dict(
    batch_size=BATCH,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    persistent_workers=PERSISTENT,
    prefetch_factor=PREFETCH,
)

# Training batches are reshuffled every epoch; validation order stays fixed.
dl_tr = DataLoader(PriceDataset(df_tr_ids, df_tr), shuffle=True, **_loader_opts)
dl_va = DataLoader(PriceDataset(df_va_ids, df_va), shuffle=False, **_loader_opts)

# ------------------------------
# 8) Model (Text: GRU, Image: ResNet18 → MLP)
# ------------------------------
class TextEncoder(nn.Module):
    """Embed token ids and summarise them with a (bi)GRU.

    ``out_dim`` is the GRU output width (``hid`` per direction). ``forward``
    returns the concatenation of a mean-pooled vector and a final-state
    vector, i.e. ``2 * out_dim`` features per sample.
    """

    def __init__(self, vocab_size, emb=EMB_DIM, hid=HID_DIM, bidir=True):
        super().__init__()
        self.emb  = nn.Embedding(vocab_size, emb, padding_idx=PAD)
        self.rnn  = nn.GRU(emb, hid, batch_first=True, bidirectional=bidir)
        self.drop = nn.Dropout(DROPOUT)
        self.out_dim = hid * (2 if bidir else 1)

    def forward(self, x):
        """Encode a batch of right-padded id rows ``x`` of shape (batch, seq)."""
        # Number of real tokens per row. Sequences are right-padded, so this is
        # also the index just past the last real token. Clamp so an all-padding
        # row still runs one step instead of failing in packing.
        mask = x.ne(self.emb.padding_idx)
        lengths = mask.sum(1).clamp(min=1)

        # Pack so the GRU never runs over padding. Previously the mean was
        # diluted by padded steps, and the "last" timestep was a pad position
        # (for the backward direction: its very first step).
        packed = nn.utils.rnn.pack_padded_sequence(
            self.emb(x), lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, h_n = self.rnn(packed)
        # Padded positions come back as zeros, so a plain sum covers only the
        # real steps.
        h, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=x.size(1)
        )

        h_mean = h.sum(1) / lengths.unsqueeze(1).to(h.dtype)
        # h_n is (directions, batch, hid): final forward state and, when
        # bidirectional, final backward state -> (batch, out_dim).
        h_last = h_n.transpose(0, 1).reshape(x.size(0), -1)
        return self.drop(torch.cat([h_mean, h_last], 1))

class ImageEncoder(nn.Module):
    """ImageNet-pretrained ResNet18 trunk followed by a small projection head.

    Every backbone parameter except the last 10 parameter tensors is frozen.
    The network's final child module is dropped, and the resulting
    [B,512,1,1] feature map is flattened and projected to ``out_dim``.
    NOTE(review): in torchvision's ResNet the final child is presumably the
    ``fc`` layer, whose weight and bias would count toward the 10 tensors left
    trainable even though it is discarded - confirm the intended cut-off.
    """

    def __init__(self, out_dim=CNN_OUT):
        super().__init__()
        resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

        # Freeze the early layers; only the trailing parameter tensors train.
        all_params = list(resnet.parameters())
        for frozen in all_params[:-10]:
            frozen.requires_grad = False

        feature_layers = list(resnet.children())[:-1]
        self.backbone = nn.Sequential(*feature_layers)  # [B,512,1,1]

        head_layers = [
            nn.Flatten(),
            nn.Linear(512, out_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(DROPOUT),
        ]
        self.proj = nn.Sequential(*head_layers)
        self.out_dim = out_dim

    def forward(self, x):
        """Return projected image features of width ``out_dim`` per sample."""
        features = self.backbone(x)
        return self.proj(features)

class FusionRegressor(nn.Module):
    """Late-fusion price regressor: text features + image features -> MLP.

    ``forward(txt_ids, imgs)`` returns one scalar per sample. In eval mode the
    output is clamped to be non-negative; in training mode the raw MLP output
    is returned so the loss can still push negative predictions upward.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.txt = TextEncoder(vocab_size)
        self.img = ImageEncoder()
        # TextEncoder.forward concatenates two pooled vectors, each of width
        # txt.out_dim, hence the factor of two.
        fused_dim = self.txt.out_dim * 2 + self.img.out_dim
        self.mlp = nn.Sequential(
            nn.Linear(fused_dim, MLP_HID), nn.ReLU(inplace=True), nn.Dropout(DROPOUT),
            nn.Linear(MLP_HID, 128),       nn.ReLU(inplace=True), nn.Dropout(DROPOUT),
            nn.Linear(128, 1)
        )

    def forward(self, txt_ids, imgs):
        t = self.txt(txt_ids)
        v = self.img(imgs)
        y = self.mlp(torch.cat([t, v], 1)).squeeze(1)
        if self.training:
            # A ReLU on the regression output has zero gradient wherever the
            # pre-activation is negative, so those samples would contribute
            # nothing to the update. Train on the raw value instead.
            return y
        return torch.relu(y)  # non-negative at inference

# Instantiate the model on the selected device; vocabulary size comes from the
# train-only vocab built above.
model   = FusionRegressor(len(itos)).to(DEVICE)
# AdamW over all parameters (frozen backbone tensors simply receive no grads).
opt     = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Loss scaling for mixed precision; disabled (pass-through) when not on CUDA.
scaler  = torch.cuda.amp.GradScaler(enabled=(DEVICE == "cuda"))
# Smooth L1 (Huber-style) regression loss on the price.
loss_fn = nn.SmoothL1Loss(beta=1.0)

# ------------------------------
# 9) Train loop (with early stopping)
# ------------------------------
def run_train():
    """Train ``model`` with gradient accumulation, AMP and early stopping.

    Uses the module-level ``model``, ``opt``, ``scaler``, ``loss_fn``,
    ``dl_tr`` and ``dl_va``. After every epoch the validation predictions are
    clipped to ``[lo_clip, hi_clip]`` and scored with SMAPE; the best weights
    are written to ``_artifacts/best_model.pt``. Training stops after
    ``EARLY_STOP`` consecutive epochs without improvement.

    Returns the best validation SMAPE (1e9 if no epoch ever improved, in
    which case no checkpoint was written).
    """
    best_smape = 1e9
    patience   = 0
    use_amp    = (DEVICE == "cuda")
    n_batches  = len(dl_tr)
    os.makedirs("_artifacts", exist_ok=True)  # hoisted: needed once, not per epoch

    opt.zero_grad(set_to_none=True)
    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss = 0.0
        t0 = time.time()

        # Train
        for step, (txt, img, price) in enumerate(dl_tr, 1):
            txt = txt.to(DEVICE, non_blocking=True)
            img = img.to(DEVICE, non_blocking=True)
            price = price.to(DEVICE, non_blocking=True)

            with torch.cuda.amp.autocast(enabled=use_amp):
                pred = model(txt, img)
                loss = loss_fn(pred, price) / ACCUM_STEPS

            scaler.scale(loss).backward()
            # Step every ACCUM_STEPS micro-batches AND on the final batch of
            # the epoch. Without the second condition, leftover micro-batches
            # (len(dl_tr) not divisible by ACCUM_STEPS) were never applied and
            # their gradients leaked into the next epoch's first update.
            # The final partial group keeps the 1/ACCUM_STEPS scaling, so its
            # update is slightly smaller than a full group's.
            if step % ACCUM_STEPS == 0 or step == n_batches:
                scaler.unscale_(opt)  # clip on true (unscaled) gradients
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(opt)
                scaler.update()
                opt.zero_grad(set_to_none=True)

            # Undo the accumulation scaling to log the per-sample loss sum.
            total_loss += loss.item() * price.size(0) * ACCUM_STEPS

        # Validate
        model.eval()
        preds, trues = [], []
        with torch.no_grad():
            for txt, img, price in dl_va:
                txt = txt.to(DEVICE, non_blocking=True)
                img = img.to(DEVICE, non_blocking=True)
                with torch.cuda.amp.autocast(enabled=use_amp):
                    p = model(txt, img).detach().cpu().numpy()
                preds.append(p)
                trues.append(price.numpy())

        yp = np.clip(np.concatenate(preds), lo_clip, hi_clip)
        yt = np.concatenate(trues)
        s  = smape(yt, yp)

        print(f"Epoch {epoch:02d} | TrainLoss {total_loss/len(df_tr):.5f} | Val SMAPE {s:.3f} | {time.time()-t0:.1f}s")

        # Early stopping on SMAPE (improvement must exceed 1e-4)
        if s + 1e-4 < best_smape:
            best_smape, patience = s, 0
            torch.save(model.state_dict(), "_artifacts/best_model.pt")
        else:
            patience += 1
            if patience >= EARLY_STOP:
                print("Early stopping.")
                break

    return best_smape

run_train()

# ------------------------------
# 10) Save artifacts (config + vocab)
# ------------------------------
# Restore the best checkpoint written during training so the in-memory model
# matches the saved artifacts.
model.load_state_dict(torch.load("_artifacts/best_model.pt", map_location=DEVICE))

# Everything needed to rebuild the model and post-process predictions.
cfg = dict(
    seed=SEED, vocab=len(itos), seq_len=SEQ_LEN,
    emb=EMB_DIM, hid=HID_DIM,
    cnn_out=CNN_OUT, mlp_hid=MLP_HID, dropout=DROPOUT,
    clip_lo=lo_clip, clip_hi=hi_clip
)

# Context manager so the config file is flushed and closed deterministically
# (the bare open() passed to json.dump was never closed).
with open("_artifacts/config.json", "w") as f:
    json.dump(cfg, f, indent=2)
# NOTE: pickle is fine for this self-produced vocab; only unpickle files you
# created yourself.
with open("_artifacts/vocab.pkl", "wb") as f:
    pickle.dump({"stoi": stoi, "itos": itos}, f)

print("✅ Saved model, vocab, config to ./_artifacts/")



Device: cuda


Saving train.csv to train (1).csv
Train=60000 | Valid=15000 | Clip=[0.63,329.99]


Encode train text: 100%|██████████| 60000/60000 [00:05<00:00, 10557.02it/s]
Encode valid text: 100%|██████████| 15000/15000 [00:01<00:00, 10912.04it/s]


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 242MB/s]


Epoch 01 | TrainLoss 14.54124 | Val SMAPE 30.225 | 1593.5s
Epoch 02 | TrainLoss 12.93052 | Val SMAPE 29.171 | 619.4s
Epoch 03 | TrainLoss 11.79212 | Val SMAPE 28.304 | 619.8s
Epoch 04 | TrainLoss 10.56964 | Val SMAPE 27.263 | 624.5s
Epoch 05 | TrainLoss 9.64634 | Val SMAPE 26.894 | 627.0s
Epoch 06 | TrainLoss 8.89018 | Val SMAPE 27.149 | 627.5s
Epoch 07 | TrainLoss 8.24322 | Val SMAPE 27.027 | 626.4s
Epoch 08 | TrainLoss 7.70333 | Val SMAPE 27.142 | 630.0s
Early stopping.
✅ Saved model, vocab, config to ./_artifacts/
✅ Predictions saved to dataset/test_out.csv
   sample_id  price
0     100179  17.86
1     245611  11.59
2     146263  31.17
3      95658   4.09
4      36806  16.09
