In [None]:
# Standard libs
import os, sys, time, math, json, inspect, argparse, random, re
from pathlib import Path

# PyData / ML
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Tokenization (no extra downloads required)
from nltk.tokenize import WordPunctTokenizer

# Hugging Face datasets
try:
    from datasets import load_dataset
except Exception as e:
    raise RuntimeError(
        "Please install 'datasets' first, e.g. `pip install datasets`"
    ) from e

# ---- Reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Applies the same ``seed`` to Python's ``random`` module, NumPy's legacy
    global generator, and PyTorch (CPU generator and all CUDA devices).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once up front so the cells below start from a known RNG state.
set_seed(42)


In [None]:
# Report whether PyTorch can see a CUDA device in this session.
print("CUDA available:", torch.cuda.is_available())
# Set TOKENIZERS_PARALLELISM for this process
# (presumably to stop Hugging Face tokenizers from parallelising/warning — TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from pathlib import Path

# Environment detection: the presence of a /kaggle directory selects the Kaggle layout.
IN_KAGGLE = Path("/kaggle").exists()

if IN_KAGGLE:
    # Kaggle layout: read-only inputs under /kaggle/input, outputs under /kaggle/working.
    INPUT_ROOT = Path("/kaggle/input")
    WORK_ROOT  = Path("/kaggle/working")

    STOPWORDS = INPUT_ROOT / "digital-music-5" / "stopwords.txt"
    PUNCTS    = INPUT_ROOT / "digital-music-5" / "punctuations.txt"
    GLOVE     = INPUT_ROOT / "glove6b100dtxt" / "glove.6B.100d.txt"

    RAW_ALL_BEAUTY = INPUT_ROOT / "all-beauty" / "All_Beauty.jsonl"
    LOCAL_ST_MODEL = INPUT_ROOT / "minilm-l6-v2-local" / "all-MiniLM-L6-v2"
else:
    # Local layout: everything is resolved relative to the current working directory.
    PROJECT_ROOT = Path.cwd()

    # if the folders are directly under the project root:
    RAW_ALL_BEAUTY = PROJECT_ROOT / "All_Beauty" / "All_Beauty.jsonl"
    GLOVE          = PROJECT_ROOT / "glove.6B.100d.txt" / "glove.6B.100d.txt"
    STOPWORDS      = PROJECT_ROOT / "Digital_Music_5" / "stopwords.txt"
    PUNCTS         = PROJECT_ROOT / "Digital_Music_5" / "punctuations.txt"
    # pointing to the directory that contains config.json, tokenizer.json, etc.
    LOCAL_ST_MODEL = PROJECT_ROOT / "all-MiniLM-L6-v2" / "all-MiniLM-L6-v2"

    ARTIFACTS = PROJECT_ROOT / "artifacts"

# Output directories as Path
# WORK_ROOT only exists in the Kaggle branch and ARTIFACTS only in the local branch,
# so the conditional below picks whichever one was defined.
WORK_DIR      = WORK_ROOT if IN_KAGGLE else ARTIFACTS
# NOTE(review): the folder is named "Amazon_Fashion" although the raw data is All_Beauty —
# confirm whether the name is intentional before relying on it elsewhere.
SAVE_DATA_DIR = WORK_DIR / "Amazon_Fashion"
MODEL_DIR     = WORK_DIR / "model"
FIG_DIR       = WORK_DIR / "fig"
ST_CACHE_DIR  = WORK_DIR / "st_cache"

# Create directories
# (parents=True / exist_ok=True make this cell safe to re-run).
for d in [SAVE_DATA_DIR, MODEL_DIR, FIG_DIR, ST_CACHE_DIR]:
    d.mkdir(parents=True, exist_ok=True)


In [None]:
# Sanity check: print, for each configured input path, whether it exists on disk.
for p in [RAW_ALL_BEAUTY, GLOVE, STOPWORDS, PUNCTS, LOCAL_ST_MODEL]:
    print(p, "exists:", p.exists())

In [None]:
import os
import pandas as pd
from datasets import load_dataset  # optional fallback
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer


def load_all_beauty_local(jsonl_path: str | Path | None = None):
    """
    Load All_Beauty.jsonl (Kaggle or local) and return a pandas DataFrame
    with standardized columns: userID, itemID, review, rating.
    """
    if jsonl_path is None:
        jsonl_path = RAW_ALL_BEAUTY
    jsonl_path = Path(jsonl_path)

    needed = ("user_id", "asin", "text", "rating")

    def _standardize_cols(df):
        alt_map = {
            "reviewText": "text",
            "overall": "rating",
            "user": "user_id",
            "item": "asin",
        }
        for old, new in alt_map.items():
            if old in df.columns and new not in df.columns:
                df[new] = df[old]

        missing = [c for c in needed if c not in df.columns]
        if missing:
            raise KeyError(
                f"Missing required columns {missing}. "
                "Make sure your JSONL has keys like: user_id, asin, text, rating."
            )

        df = df[list(needed)].copy()
        df.columns = ["userID", "itemID", "review", "rating"]
        df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
        df = df[df["rating"].notnull()]
        df = df[df["review"].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]
        df.reset_index(drop=True, inplace=True)
        return df



    try:
        df = pd.read_json(jsonl_path, lines=True)
        return _standardize_cols(df)
    except Exception as e_pd:
        try:
            ds = load_dataset("json", data_files=str(jsonl_path), split="train")
            df = ds.to_pandas()
            return _standardize_cols(df)
        except Exception as e_hf:
            raise RuntimeError(
                f"Failed to load JSONL via pandas ({type(e_pd).__name__}: {e_pd}) "
                f"and datasets ({type(e_hf).__name__}: {e_hf})."
            )


In [None]:
# --- Text-cleaning helpers ---
def _read_list(path):
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Required file not found: {path}\n"
            "Place a plain-text file with one token per line."
        )
    with open(path, encoding="utf-8") as f:
        return set(ln.strip() for ln in f if ln.strip())

def process_df_to_csv(df, stopwords_path, puncts_path, train_rate, csv_path):
    """Clean reviews, split into train/valid/test and write header-less CSVs.

    Args:
        df: DataFrame with columns ``userID``, ``itemID``, ``review``, ``rating``.
            It is copied first, so the caller's frame is left untouched.
        stopwords_path: Text file with one stop word per line.
        puncts_path: Text file with one punctuation string per line.
        train_rate: Fraction of rows for the training split; the remainder is
            halved into validation and test.
        csv_path: Output directory for ``train.csv``, ``valid.csv``, ``test.csv``.

    Returns:
        ``(train_df, valid_df, test_df)``.
    """
    # Work on a copy: the ID remapping and text cleaning below would otherwise
    # silently rewrite the DataFrame the caller passed in.
    df = df.copy()

    # Map IDs to contiguous integers
    df["userID"] = df["userID"].astype("category").cat.codes
    df["itemID"] = df["itemID"].astype("category").cat.codes

    # Load stopwords/punctuations
    stop_words = _read_list(stopwords_path)
    # Fixed order (longest first) so overlapping punctuation strings are always
    # replaced the same way instead of depending on set iteration order.
    punctuations = sorted(_read_list(puncts_path), key=lambda p: (-len(p), p))
    tok = WordPunctTokenizer()

    def clean_review(review: str) -> str:
        """Lower-case, blank out punctuation, tokenize and drop stop words."""
        rv = review.lower()
        for p in punctuations:
            rv = rv.replace(p, " ")
        return " ".join(w for w in tok.tokenize(rv) if w not in stop_words)

    print("#### Cleaning text (this can take a while on large splits)...")
    df["review"] = df["review"].apply(clean_review)

    # Train/valid/test split (fixed random_state values keep the split reproducible)
    train_df, valid_test_df = train_test_split(df, test_size=1 - train_rate, random_state=3)
    valid_df, test_df = train_test_split(valid_test_df, test_size=0.5, random_state=4)

    os.makedirs(csv_path, exist_ok=True)
    # No header / no index: the CSV reader used later expects exactly four bare columns.
    train_df.to_csv(os.path.join(csv_path, "train.csv"), index=False, header=False)
    valid_df.to_csv(os.path.join(csv_path, "valid.csv"), index=False, header=False)
    test_df.to_csv(os.path.join(csv_path, "test.csv"), index=False, header=False)

    print(f"#### Saved CSVs to {csv_path}")
    print(f"#### Split sizes: train {len(train_df)}, valid {len(valid_df)}, test {len(test_df)}")
    print(f"#### Totals: {len(df)} reviews, {df['userID'].nunique()} users, {df['itemID'].nunique()} items.")
    return train_df, valid_df, test_df


In [None]:
csv_train = SAVE_DATA_DIR / "train.csv"
# Require all three split files: an interrupted earlier run can leave train.csv
# behind without valid.csv/test.csv, and later cells read all of them.
split_csvs = [csv_train, SAVE_DATA_DIR / "valid.csv", SAVE_DATA_DIR / "test.csv"]

if not all(split_csv.exists() for split_csv in split_csvs):
    df_raw = load_all_beauty_local()  # uses Kaggle or local automatically
    _ = process_df_to_csv(
        df_raw,
        stopwords_path=STOPWORDS,
        puncts_path=PUNCTS,
        train_rate=0.8,
        csv_path=SAVE_DATA_DIR,
    )
else:
    print("CSV files already exist — skipping reprocessing.")


In [None]:
def now(f='%Y-%m-%d %H:%M:%S'):
    """Return the current local time formatted with the strftime pattern ``f``."""
    # time.strftime formats the current local time when no time tuple is given.
    return time.strftime(f)

class Config:
    """Container for hyper-parameters and file locations.

    Values are declared as class attributes; ``__init__`` copies each one onto
    the instance through an argparse round-trip that is given an empty
    argument list, so the defaults below are what an instance ends up with.
    """
    # Device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Training
    train_epochs        = 5
    batch_size          = 128
    learning_rate       = 2e-3
    l2_regularization   = 1e-6
    learning_rate_decay = 0.99
    patience            = 3

    # Files
    word2vec_file = GLOVE
    train_file    = os.path.join(SAVE_DATA_DIR, 'train.csv')
    valid_file    = os.path.join(SAVE_DATA_DIR, 'valid.csv')
    test_file     = os.path.join(SAVE_DATA_DIR, 'test.csv')
    model_file    = os.path.join(MODEL_DIR, 'best_model.pt')

    # Data shaping
    review_count         = 10    # number of reviews per side
    review_length        = 40    # tokens per review
    lowest_review_count  = 2
    PAD_WORD             = '<UNK>'

    # Model sizes
    kernel_count = 100
    kernel_size  = 3
    dropout_prob = 0.5
    cnn_out_dim  = 50

    def __init__(self):
        """Register every non-dunder attribute as an argparse option and copy the parsed values onto ``self``."""
        # Allow CLI/nb override (no-op by default)
        # getmembers is called on the instance, where methods are bound methods and
        # therefore pass the `not isfunction` predicate; only the dunder-name filter
        # below keeps __init__/__str__ out. A non-dunder method added to this class
        # would be picked up here as if it were a setting.
        attributes = inspect.getmembers(self, lambda a: not inspect.isfunction(a))
        attributes = list(filter(lambda x: not x[0].startswith('__'), attributes))
        parser = argparse.ArgumentParser(add_help=False)
        for key, val in attributes:
            parser.add_argument('--' + key, dest=key, type=type(val), default=val)
        # An explicit empty list is parsed (not sys.argv), so every option keeps its default.
        args, _ = parser.parse_known_args([])
        for key, val in args.__dict__.items():
            setattr(self, key, val)

    def __str__(self):
        """Return one ``name = value`` line per non-dunder attribute."""
        attributes = inspect.getmembers(self, lambda a: not inspect.isfunction(a))
        attributes = list(filter(lambda x: not x[0].startswith('__'), attributes))
        return "\n".join([f"{k} = {v}" for k, v in attributes])

In [None]:
# Cell 1: imports & small utils
import os, json, re, csv, hashlib
from pathlib import Path
from collections import defaultdict

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

def ensure_dir(p: Path):
    """Create directory ``p`` (and any missing parents); do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def simple_sentence_split(text: str, max_sentences: int):
    """Split ``text`` after '.', '!' or '?' followed by whitespace.

    Returns the first ``max_sentences`` non-empty, stripped pieces. Anything
    that is not a non-blank string yields an empty list.
    """
    if isinstance(text, str):
        stripped = text.strip()
        if stripped:
            kept = []
            for chunk in re.split(r'(?<=[.!?])\s+', stripped):
                chunk = chunk.strip()
                if chunk:
                    kept.append(chunk)
            return kept[:max_sentences]
    return []

def sha1(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

In [None]:
def read_split(csv_path: str):
    """Reads CSV with no header: userID,itemID,review,rating

    Returns:
        A list of ``(int userID, int itemID, str review, float rating)`` tuples,
        one per non-blank record.

    Raises:
        ValueError: if a non-blank record does not have exactly four fields or
            an ID/rating cannot be converted.
    """
    rows = []
    # newline="" hands line-ending handling to the csv module, so line breaks
    # inside quoted review text are preserved exactly as written.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        for record in csv.reader(f):
            if not record:
                continue  # csv.reader yields [] for a blank line
            user_id, item_id, review, rating = record
            rows.append((int(user_id), int(item_id), review, float(rating)))
    return rows

In [None]:
def precompute_sent_embeddings(
    train_csv: str,
    valid_csv: str,
    test_csv: str,
    out_dir: str,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    S: int = 40,
    batch_size: int = 1024,
    max_len: int = 128,
    lowercase: bool = False,
    local_model_dir: str | None = "/kaggle/input/minilm-l6-v2-local/all-MiniLM-L6-v2",  # default is a Kaggle input path
):
    """
    Notebook version of precompute_sent_embeddings.py

    Reads the three split CSVs, splits every distinct review into at most S
    sentences, encodes every distinct sentence once with a transformer encoder
    (mean-pooled over tokens) and writes a numeric cache.

    Args:
        train_csv, valid_csv, test_csv: header-less CSVs readable by read_split.
        out_dir: directory that receives the cache (created if missing).
        model_name: hub identifier used only if loading from local_model_dir fails.
        S: maximum sentences kept per review; also the padded length of each id list.
        batch_size: number of sentences encoded per forward pass.
        max_len: tokenizer truncation length.
        lowercase: lower-case each review before hashing and splitting.
        local_model_dir: directory tried first, with local_files_only=True.

    Writes:
      out_dir/
        embeddings.npy
        sentences.jsonl
        review2sent_ids.jsonl
        splits/{train,valid,test}/ (rows.jsonl, user_reviews.json, item_reviews.json)

    Note: embeddings.npy is written through np.memmap, i.e. as raw float32
    values of shape (n_sentences, hidden_size) without any array header.
    """
    out_dir = Path(out_dir)
    ensure_dir(out_dir)
    splits_dir = out_dir / "splits"
    ensure_dir(splits_dir)

    # 1) Read all splits
    split_paths = {"train": Path(train_csv), "valid": Path(valid_csv), "test": Path(test_csv)}
    splits = {k: read_split(str(v)) for k, v in split_paths.items()}

    # 2) Collect unique reviews + per-split manifests
    all_reviews = {}   # review_hash -> raw_review_text
    split_rows = {}    # split -> list[dict]
    user_reviews = {}  # split -> {userID: [review_hash,...]}
    item_reviews = {}  # split -> {itemID: [review_hash,...]}

    for split, rows in splits.items():
        s_rows = []
        u_map = defaultdict(list)
        i_map = defaultdict(list)
        for (u, it, review, rating) in rows:
            rtxt = review.lower() if lowercase else review
            # Reviews are keyed by the hash of their (optionally lower-cased) text,
            # so identical texts share a single entry across users, items and splits.
            r_hash = sha1(rtxt)
            all_reviews.setdefault(r_hash, rtxt)
            s_rows.append({"userID": u, "itemID": it, "rating": rating, "review_hash": r_hash})
            u_map[u].append(r_hash)
            i_map[it].append(r_hash)
        split_rows[split] = s_rows
        # Keys become strings because they are written out as JSON object keys below.
        user_reviews[split] = {str(k): v for k, v in u_map.items()}
        item_reviews[split] = {str(k): v for k, v in i_map.items()}

    # 3) Build deduped sentence inventory (cap to S per review for downstream speed)
    sentence_to_id = {}
    sentences = []  # index -> text
    review_to_sentids = {}  # review_hash -> fixed-length [int] of len S, with -1 as PAD

    def get_sent_id(s):
        # Assign the next free index the first time a sentence text is seen.
        if s not in sentence_to_id:
            sentence_to_id[s] = len(sentences)
            sentences.append(s)
        return sentence_to_id[s]

    # NOTE(review): sentence splitting relies on '.', '!' and '?' being present in the
    # CSV text. If an upstream cleaning step already replaced that punctuation, every
    # review collapses to a single "sentence" — confirm against the CSV contents.
    for r_hash, rtxt in all_reviews.items():
        sents = simple_sentence_split(rtxt, S)
        ids = [get_sent_id(s) for s in sents]
        # Right-pad with -1 up to S (simple_sentence_split already caps at S).
        ids = (ids + [-1] * (S - len(ids))) if len(ids) < S else ids[:S]
        review_to_sentids[r_hash] = ids

    print(f"[build] Unique reviews: {len(all_reviews):,}")
    print(f"[build] Unique sentences: {len(sentences):,}")

    # 4) Encode all unique sentences (with progress bar)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Prefer local directory (offline-safe)
    try:
        if local_model_dir is None:
            raise FileNotFoundError("No local_model_dir provided.")
        # quick sanity: only the two tokenizer files listed below are checked here;
        # missing model weights surface later, from from_pretrained.
        needed = ["tokenizer.json", "special_tokens_map.json"]
        missing = [f for f in needed if not os.path.exists(os.path.join(local_model_dir, f))]
        if missing:
            raise FileNotFoundError(f"Missing in local model dir: {missing}")
        tok = AutoTokenizer.from_pretrained(local_model_dir, local_files_only=True)
        enc = AutoModel.from_pretrained(local_model_dir, local_files_only=True).to(device)
    except Exception as e:
        # Optional: fallback to hub if you ever run with internet
        # Any failure above (not only a missing directory) ends up here.
        print("[warn] Local model load failed, attempting hub:", e)
        tok = AutoTokenizer.from_pretrained(model_name)
        enc = AutoModel.from_pretrained(model_name).to(device)


    # Inference only: eval mode and no gradients for the encoder weights.
    enc.eval()
    for p in enc.parameters():
        p.requires_grad = False

    H = enc.config.hidden_size
    N = len(sentences)
    # Raw on-disk float32 buffer of shape (N, H); rows are filled batch by batch below.
    EMB = np.memmap(out_dir / "embeddings.npy", dtype="float32", mode="w+", shape=(N, H))

    def mean_pool(last_hidden_state, attention_mask):
        # Average token vectors over positions where attention_mask is non-zero;
        # the clamp avoids dividing by zero when a row has no such position.
        mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
        summed = (last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

    with torch.no_grad():
        rng = range(0, N, batch_size)
        for start in tqdm(rng, desc="Encoding sentences", unit="batch"):
            end = min(start + batch_size, N)
            batch_texts = sentences[start:end]
            batch = tok(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt",
            ).to(device)
            out = enc(**batch)
            pooled = mean_pool(out.last_hidden_state, batch["attention_mask"]).detach().cpu().numpy().astype("float32")
            # Row index in EMB equals the sentence id assigned by get_sent_id.
            EMB[start:end, :] = pooled

    # Flush memmap
    del EMB
    print("[encode] embeddings.npy written.")

    # 5) Write metadata files
    # One JSON object per line; the line order matches the sentence ids.
    with open(out_dir / "sentences.jsonl", "w", encoding="utf-8") as f:
        for sid, txt in enumerate(sentences):
            f.write(json.dumps({"sent_id": sid, "text": txt}, ensure_ascii=False) + "\n")

    with open(out_dir / "review2sent_ids.jsonl", "w", encoding="utf-8") as f:
        for r_hash, ids in review_to_sentids.items():
            f.write(json.dumps({"review_hash": r_hash, "sent_ids": ids}) + "\n")

    # Per-split artifacts
    for split in ["train", "valid", "test"]:
        sp = splits_dir / split
        ensure_dir(sp)
        with open(sp / "rows.jsonl", "w", encoding="utf-8") as f:
            for row in split_rows[split]:
                f.write(json.dumps(row) + "\n")
        with open(sp / "user_reviews.json", "w", encoding="utf-8") as f:
            json.dump(user_reviews[split], f)
        with open(sp / "item_reviews.json", "w", encoding="utf-8") as f:
            json.dump(item_reviews[split], f)

    print(f"[done] Wrote cache to: {out_dir.resolve()}")
    print("      Files:")
    print("       - embeddings.npy")
    print("       - sentences.jsonl")
    print("       - review2sent_ids.jsonl")
    print("       - splits/*/rows.jsonl, user_reviews.json, item_reviews.json")

In [None]:
# Inputs for the embedding cache: the three split CSVs under SAVE_DATA_DIR.
TRAIN_CSV = SAVE_DATA_DIR / "train.csv"
VALID_CSV = SAVE_DATA_DIR / "valid.csv"
TEST_CSV  = SAVE_DATA_DIR / "test.csv"
# The cache is written into the sentence-transformer cache directory.
OUT_DIR   = ST_CACHE_DIR


In [None]:
# Build the sentence-embedding cache. This cell has no "already built" guard,
# so every execution re-encodes all sentences and rewrites the cache files.
precompute_sent_embeddings(
    train_csv=str(TRAIN_CSV),
    valid_csv=str(VALID_CSV),
    test_csv=str(TEST_CSV),
    out_dir=str(OUT_DIR),
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    S=40,
    batch_size=1024,
    max_len=128,
    lowercase=True,
    # LOCAL_ST_MODEL is a Path in both configuration branches; the None check is defensive.
    local_model_dir=str(LOCAL_ST_MODEL) if LOCAL_ST_MODEL is not None else None,
)


In [None]:
# Cell 4: STCachedDataset (fixed rc/S) for fast numeric I/O training
import json, numpy as np, torch
from pathlib import Path
from torch.utils.data import Dataset

class STCachedDataset(Dataset):
    """
    Loads numeric cache produced by precompute_sent_embeddings (fixed rc/S view).
    - embeddings.npy  -> (N_sentences, H) float32 (memmap, raw values without header)
    - review2sent_ids.jsonl -> review_hash -> [S] sentence ids (-1 pad)
    - splits/{split}/rows.jsonl
    - splits/{split}/user_reviews.json, item_reviews.json

    Each item is ``(user_tensor, item_tensor, rating)`` where both tensors have
    shape (rc, S, H) and ``rating`` is a float32 tensor of shape (1,).

    NOTE(review): the user/item review lists are used as stored, so they include
    the review belonging to the row being predicted — confirm this is intended.

    Raises:
        ValueError: if sentences.jsonl is empty or the size of embeddings.npy is
            not a whole multiple of the number of sentences.
    """
    def __init__(self, cache_dir, split, rc=10, S=40):
        self.cache_dir = Path(cache_dir)
        self.split = split
        self.rc = rc
        self.S = S

        # The sentence count (one JSON line per sentence) fixes the number of
        # embedding rows; the width H is inferred from the file size.
        sentences_path = self.cache_dir / "sentences.jsonl"
        with open(sentences_path, "r", encoding="utf-8") as f:
            n_sent = sum(1 for _ in f)
        if n_sent == 0:
            raise ValueError(
                f"{sentences_path} is empty; cannot infer the embedding width."
            )

        flat = np.memmap(self.cache_dir / "embeddings.npy", dtype="float32", mode="r")
        if flat.size % n_sent != 0:
            raise ValueError(
                f"embeddings.npy holds {flat.size} floats, which is not a multiple of "
                f"the {n_sent} sentences listed in sentences.jsonl; the cache is inconsistent."
            )
        self.H = flat.size // n_sent
        self.emb = flat.reshape(n_sent, self.H)

        # review -> sent_ids
        self.rev2ids = {}
        with open(self.cache_dir / "review2sent_ids.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                obj = json.loads(line)
                self.rev2ids[obj["review_hash"]] = obj["sent_ids"]

        split_dir = self.cache_dir / "splits" / split

        # rows
        with open(split_dir / "rows.jsonl", "r", encoding="utf-8") as f:
            self.rows = [json.loads(line) for line in f]

        # groupings (JSON object keys are strings; convert back to int ids)
        with open(split_dir / "user_reviews.json", "r", encoding="utf-8") as f:
            self.user_map = {int(k): v for k, v in json.load(f).items()}
        with open(split_dir / "item_reviews.json", "r", encoding="utf-8") as f:
            self.item_map = {int(k): v for k, v in json.load(f).items()}

    def _review_tensor(self, review_hashes):
        """
        Build (rc, S, H) from a list of review hashes.
        Takes the first rc; pads with zeros if fewer.
        Unknown hashes and padded (-1) sentence slots stay all-zero.
        """
        out = np.zeros((self.rc, self.S, self.H), dtype=np.float32)
        for slot, rh in enumerate(review_hashes[:self.rc]):
            ids = self.rev2ids.get(rh)
            if not ids:
                continue
            ids = np.asarray(ids[:self.S], dtype=np.int64)
            # Positions holding a real sentence id; -1 marks padding.
            positions = np.flatnonzero(ids >= 0)
            if positions.size:
                out[slot, positions] = self.emb[ids[positions]]
        return torch.from_numpy(out)

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        row = self.rows[idx]
        u, it, rating, rh = row["userID"], row["itemID"], row["rating"], row["review_hash"]
        # Fall back to the row's own review when the id is missing from the grouping.
        u_tensor = self._review_tensor(self.user_map.get(u, [rh]))
        i_tensor = self._review_tensor(self.item_map.get(it, [rh]))
        return u_tensor, i_tensor, torch.tensor([rating], dtype=torch.float32)

In [None]:
# Cell 5: quick test that cache & dataset load correctly
from torch.utils.data import DataLoader

cache_dir = OUT_DIR  # cache directory the precompute step wrote to
train_ds = STCachedDataset(cache_dir, "train", rc=10, S=40)
# Small batch, single-process loading: this cell only checks that shapes line up.
dl = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=0)

batch = next(iter(dl))
u, i, r = batch
print("User batch:", tuple(u.shape))  # (B, rc, S, H)
print("Item batch:", tuple(i.shape))  # same layout as the user batch
print("Ratings  :", tuple(r.shape))   # (B, 1)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class FactorizationMachine(nn.Module):
    """Linear term plus factorised pairwise feature interactions.

    ``forward`` maps a (B, in_dim) input to a (B, 1) output.
    """

    def __init__(self, in_dim, k):
        super().__init__()
        self.linear = nn.Linear(in_dim, 1)
        # Factor matrix of shape (in_dim, k), initialised with small random values.
        self.V = nn.Parameter(torch.randn(in_dim, k) * 0.01)

    def forward(self, x):
        first_order = self.linear(x)
        # Pairwise term via the "square of sum minus sum of squares" identity.
        square_of_sum = (x @ self.V) ** 2
        sum_of_squares = (x ** 2) @ (self.V ** 2)
        second_order = 0.5 * (square_of_sum - sum_of_squares).sum(dim=1, keepdim=True)
        return first_order + second_order

class CNNOverSentences(nn.Module):
    """1-D convolution over the sentence axis followed by pooling and dropout.

    ``pool="max"`` selects max-pooling over sentences; any other value selects
    mean-pooling.
    """

    def __init__(self, emb_dim, kernel_count=100, kernel_size=3, dropout=0.5, pool="max"):
        super().__init__()
        same_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv1d(emb_dim, kernel_count, kernel_size, padding=same_padding)
        self.act  = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.pool = pool

    def forward(self, x, mask=None):
        # x: (B*rc, S, H); `mask` is accepted but not used by this implementation.
        channels_first = x.permute(0, 2, 1)                # (B*rc, H, S)
        feats = self.act(self.conv(channels_first))        # (B*rc, K, S)
        feats = feats.transpose(1, 2)                      # (B*rc, S, K)
        pooled = feats.max(dim=1).values if self.pool == "max" else feats.mean(dim=1)
        return self.drop(pooled)                           # (B*rc, K)


class DeepCoNNCached(nn.Module):
    """Two-tower rating regressor over cached sentence embeddings.

    Each side (user, item) is a (B, rc, S, H) tensor. Every review is encoded by
    that side's CNN, the rc review vectors are concatenated and projected to
    ``cnn_out_dim``, and the two projections are fed to a factorization machine
    that produces a (B, 1) prediction.
    """

    def __init__(self, emb_dim, rc=10, kernel_count=100, cnn_out_dim=50):
        super().__init__()
        self.rc = rc
        self.K = kernel_count
        self.cnn_u = CNNOverSentences(emb_dim, kernel_count)
        self.cnn_i = CNNOverSentences(emb_dim, kernel_count)
        concat_dim = rc * kernel_count
        self.proj_u = nn.Linear(concat_dim, cnn_out_dim)
        self.proj_i = nn.Linear(concat_dim, cnn_out_dim)
        self.fm     = FactorizationMachine(cnn_out_dim * 2, 10)

    def _encode_side(self, side_tensor, cnn):
        """Encode (B, rc, S, H) into one (B, rc*K) vector per example."""
        batch, n_reviews, n_sents, dim = side_tensor.shape
        per_review = cnn(side_tensor.reshape(batch * n_reviews, n_sents, dim))  # (B*rc, K)
        return per_review.reshape(batch, n_reviews, -1).reshape(batch, -1)      # (B, rc*K)

    def forward(self, u, i):
        user_feats = self._encode_side(u.float(), self.cnn_u)
        item_feats = self._encode_side(i.float(), self.cnn_i)
        joint = torch.cat([self.proj_u(user_feats), self.proj_i(item_feats)], dim=1)
        return self.fm(joint)

In [None]:
# Cell 7: Training loop with progress bars
from tqdm.auto import tqdm, trange
import time

def mse_to_rmse(m):
    """Return the square root of ``m`` (a mean squared error) as a Python float."""
    as_float = float(m)
    return as_float ** 0.5

def predict_mse(model, dataloader, device, desc="Eval"):
    """Return the mean squared error of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradients. Squared
    errors are summed per batch and divided by the number of samples seen
    (the divisor is at least 1, so an empty loader yields 0.0).
    """
    squared_error_sum = 0.0
    samples_seen = 0
    model.eval()
    with torch.no_grad():
        for users, items, ratings in tqdm(dataloader, desc=desc, leave=False):
            users = users.to(device)
            items = items.to(device)
            ratings = ratings.to(device)
            batch_error = F.mse_loss(model(users, items), ratings, reduction="sum")
            squared_error_sum += batch_error.item()
            samples_seen += ratings.size(0)
    return squared_error_sum / max(samples_seen, 1)

def predict_mae(model, dataloader, device, desc="Eval"):
    """Return the mean absolute error of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradients. Absolute
    errors are summed per batch and divided by the number of samples seen
    (the divisor is at least 1, so an empty loader yields 0.0).
    """
    abs_error_sum = 0.0
    samples_seen = 0
    model.eval()
    with torch.no_grad():
        for users, items, ratings in tqdm(dataloader, desc=desc, leave=False):
            users = users.to(device)
            items = items.to(device)
            ratings = ratings.to(device)
            batch_error = F.l1_loss(model(users, items), ratings, reduction="sum")
            abs_error_sum += batch_error.item()
            samples_seen += ratings.size(0)
    return abs_error_sum / max(samples_seen, 1)

def train_loop(train_dl, valid_dl, model, device, epochs=5, lr=2e-3, patience=2,
               model_path="best.pt", weight_decay=1e-6):
    """Train ``model`` with Adam on summed MSE, checkpointing on validation MSE.

    Args:
        train_dl, valid_dl: loaders yielding ``(user, item, rating)`` batches.
        model: module called as ``model(user, item)``.
        device: device the batches are moved to.
        epochs: maximum number of epochs.
        lr: Adam learning rate.
        patience: stop after this many consecutive epochs without improvement.
        model_path: where the best ``state_dict`` is saved.
        weight_decay: Adam weight decay (previously fixed at 1e-6, still the default).

    Returns:
        The best validation MSE seen; ``float("inf")`` if no epoch improved,
        in which case nothing was written to ``model_path``.
    """
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_loss = float("inf")
    bad_epochs = 0
    for epoch in trange(epochs, desc="Epochs"):
        model.train()
        total_loss, total_samples = 0.0, 0
        pbar = tqdm(train_dl, desc=f"Train {epoch}", leave=False)
        for u, i, r in pbar:
            u, i, r = u.to(device), i.to(device), r.to(device)
            preds = model(u, i)
            # Summed (not averaged) loss, so total_loss / total_samples is a per-sample MSE.
            loss = F.mse_loss(preds, r, reduction="sum")
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
            total_samples += r.size(0)
            running = total_loss / max(total_samples, 1)
            pbar.set_postfix(MSE=f"{running:.4f}", RMSE=f"{mse_to_rmse(running):.4f}")
        # validation
        valid_mse = predict_mse(model, valid_dl, device, desc="Valid")
        print(f"Epoch {epoch:02d} | Train RMSE {mse_to_rmse(total_loss/max(total_samples,1)):.4f} "
              f"| Valid RMSE {mse_to_rmse(valid_mse):.4f}")
        if valid_mse < best_loss:
            # New best: checkpoint and reset the early-stopping counter.
            best_loss = valid_mse
            torch.save(model.state_dict(), model_path)
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping.")
                break
    print(f"Best valid RMSE: {mse_to_rmse(best_loss):.4f}")
    return best_loss


In [None]:
# Cell 8: train and test with cached dataset
from torch.utils.data import DataLoader

cache_dir = OUT_DIR  # from earlier
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One dataset per split, all reading the same cache directory.
train_ds = STCachedDataset(cache_dir, "train", rc=10, S=40)
valid_ds = STCachedDataset(cache_dir, "valid", rc=10, S=40)
test_ds  = STCachedDataset(cache_dir, "test",  rc=10, S=40)

# Pinned host memory is only requested when a CUDA device is available.
pin = torch.cuda.is_available()
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True,  pin_memory=pin, num_workers=0)
valid_dl = DataLoader(valid_ds, batch_size=128, shuffle=False, pin_memory=pin, num_workers=0)
test_dl  = DataLoader(test_ds,  batch_size=128, shuffle=False, pin_memory=pin, num_workers=0)

# rc here must match the rc used for the datasets above (the projection layers are sized from it).
model = DeepCoNNCached(emb_dim=train_ds.H, rc=10).to(device)
# The regression checkpoint is stored inside the cache directory.
best_path = str(Path(cache_dir)/"best_model.pt")

train_loop(train_dl, valid_dl, model, device, epochs=5, model_path=best_path)

# load best and test
# A fresh model instance receives the checkpointed weights; it is evaluated in the next cell.
best = DeepCoNNCached(emb_dim=train_ds.H, rc=10).to(device)
best.load_state_dict(torch.load(best_path, map_location=device))



In [None]:
# Evaluate the restored best checkpoint on every split.
# Each call makes its own full pass over the loader (so valid/test are traversed twice).
train_mse = predict_mse(best, train_dl, device, desc="Train")
val_mse   = predict_mse(best, valid_dl, device, desc="Valid")
val_mae   = predict_mae(best, valid_dl, device, desc="Valid")
test_mse  = predict_mse(best, test_dl,  device, desc="Test")
test_mae  = predict_mae(best, test_dl,  device, desc="Test")

print(f"Train  RMSE={mse_to_rmse(train_mse):.4f}")
print(f"Valid  RMSE={mse_to_rmse(val_mse):.4f}, MAE={val_mae:.4f}")
print(f"Test   RMSE={mse_to_rmse(test_mse):.4f}, MAE={test_mae:.4f}")

In [None]:
import os

# Base working directory (choose whatever makes sense on your machine)
# NOTE(review): this rebinds WORK_DIR and FIG_DIR, which the path-configuration cell
# earlier in the notebook defined as Path objects under the Kaggle/artifacts root, to
# plain strings under ./working (relative to the current directory). Outputs of the
# cells below therefore go to a different location than the earlier ones — confirm
# this is intended, or reuse the earlier ST_CACHE_DIR / FIG_DIR instead.
WORK_DIR = os.path.abspath("./working")

# Subdirectories
CACHE_DIR = os.path.join(WORK_DIR, "st_cache")
FIG_DIR   = os.path.join(WORK_DIR, "fig")

# Create folders if they don't exist
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(FIG_DIR, exist_ok=True)

In [None]:
# Label mapping
# Class index -> human-readable name (index 0, 1, 2).
LABEL_NAMES = ["neg", "neutral", "pos"]

@torch.no_grad()
def ratings_to_classes(r: torch.Tensor) -> torch.Tensor:
    """
    Bucket ratings into three sentiment classes.

    r: (B, 1) float tensor of ratings
    returns: (B,) long tensor where a rating <= 2.0 maps to 0 (neg),
             a rating >= 4.0 maps to 2 (pos) and everything else to 1 (neutral)
    """
    scores = r.squeeze(1)
    # "Not negative" is written as the negation of `<= 2.0` so that every value
    # failing that test, whatever it is, leaves class 0.
    not_negative = ~(scores <= 2.0)
    positive = scores >= 4.0
    # 0 for negative, 1 for the middle band, 2 once the positive flag is also set.
    return not_negative.long() + positive.long()



print("Embedding dim (H):", train_ds.H)  # width of the cached sentence-encoder embeddings (not a GloVe dimension)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm, trange
from sklearn.metrics import accuracy_score

class CNNOverSentences(nn.Module):
    """1-D convolution over the sentence axis followed by pooling and dropout.

    NOTE(review): this re-declares a class of the same name from an earlier cell,
    here with a default dropout of 0.3 and a ``forward`` that takes no ``mask``.
    Once this cell has run, the name refers to this version.
    """

    def __init__(self, emb_dim, kernel_count=100, kernel_size=3, dropout=0.3, pool="max"):
        super().__init__()
        same_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv1d(emb_dim, kernel_count, kernel_size, padding=same_padding)
        self.act  = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.pool = pool

    def forward(self, x):
        # x: (B*rc, S, H)
        channels_first = x.permute(0, 2, 1)            # (B*rc, H, S)
        feats = self.act(self.conv(channels_first))    # (B*rc, K, S)
        feats = feats.transpose(1, 2)                  # (B*rc, S, K)
        if self.pool == "max":
            pooled = feats.max(dim=1).values
        else:
            pooled = feats.mean(dim=1)
        return self.drop(pooled)                       # (B*rc, K)

class DeepCoNNCachedClassifier(nn.Module):
    """Two-tower classifier over cached sentence embeddings.

    Each side (user, item) is a (B, rc, S, H) tensor. Reviews are encoded by the
    side's CNN, concatenated, projected to ``cnn_out_dim``, and the joined
    user/item vector goes through dropout and a small MLP that returns
    ``num_classes`` logits per example.
    """

    def __init__(self, emb_dim, rc=10, kernel_count=100, cnn_out_dim=50, num_classes=3, dropout=0.3):
        super().__init__()
        self.rc = rc
        self.cnn_u = CNNOverSentences(emb_dim, kernel_count, dropout=dropout)
        self.cnn_i = CNNOverSentences(emb_dim, kernel_count, dropout=dropout)
        concat_dim = rc * kernel_count
        self.proj_u = nn.Linear(concat_dim, cnn_out_dim)
        self.proj_i = nn.Linear(concat_dim, cnn_out_dim)
        self.drop   = nn.Dropout(dropout)
        # tiny MLP head (a bit more capacity than a single Linear)
        hidden_width = 128
        self.head   = nn.Sequential(
            nn.Linear(cnn_out_dim * 2, hidden_width),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_width, num_classes),
        )

    def _encode_side(self, side_tensor, cnn):
        """Encode (B, rc, S, H) into one (B, rc*K) vector per example."""
        batch, n_reviews, n_sents, dim = side_tensor.shape
        per_review = cnn(side_tensor.reshape(batch * n_reviews, n_sents, dim))  # (B*rc, K)
        return per_review.reshape(batch, n_reviews, -1).reshape(batch, -1)      # (B, rc*K)

    def forward(self, u, i):
        user_feats = self._encode_side(u.float(), self.cnn_u)
        item_feats = self._encode_side(i.float(), self.cnn_i)
        joint = torch.cat([self.proj_u(user_feats), self.proj_i(item_feats)], dim=1)
        return self.head(self.drop(joint))  # logits (B, num_classes)

def compute_class_weights(train_dl, device, num_classes=3):
    """Return inverse-frequency class weights as a float32 tensor on ``device``.

    Makes one full pass over ``train_dl``, converts each batch of ratings to
    class ids and counts them; the weight of a class is
    ``total / (count + 1e-8)``.
    """
    class_counts = torch.zeros(num_classes, dtype=torch.float64)
    for _, _, ratings in tqdm(train_dl, desc="Class weight scan", leave=False):
        labels = ratings_to_classes(ratings).cpu()
        batch_counts = torch.bincount(labels, minlength=num_classes)
        class_counts += batch_counts.to(torch.float64)
    # inverse-frequency weights (the epsilon keeps an unseen class from dividing by zero)
    total = class_counts.sum()
    inverse_freq = total / (class_counts + 1e-8)
    return inverse_freq.to(device).to(torch.float32)

def evaluate_acc(model, dl, device):
    """Return the classification accuracy of ``model`` over ``dl`` as a float.

    Predictions are the arg-max over the model's logits; targets are derived
    from the batch ratings with ``ratings_to_classes``.
    """
    model.eval()
    targets, predictions = [], []
    with torch.no_grad():
        for users, items, ratings in dl:
            scores = model(users.to(device), items.to(device))
            predictions.append(scores.argmax(dim=1).cpu())
            targets.append(ratings_to_classes(ratings).cpu())
    all_targets = torch.cat(targets).numpy()
    all_predictions = torch.cat(predictions).numpy()
    return float(accuracy_score(all_targets, all_predictions))

# ----- train
# Build the classifier, optimiser and class-weighted loss.
# Note: `model` and `best_path` are rebound here; earlier cells used the same
# names for the regression model and its checkpoint.
emb_dim = train_ds.H
model = DeepCoNNCachedClassifier(emb_dim=emb_dim, rc=10, kernel_count=100, cnn_out_dim=50, num_classes=3, dropout=0.3).to(device)
opt = torch.optim.Adam(model.parameters(), lr=2e-3, weight_decay=1e-6)
# One full pass over the training loader just to count classes.
w = compute_class_weights(train_dl, device)
print("Class weights:", w.detach().cpu().numpy().round(3))
criterion = nn.CrossEntropyLoss(weight=w)

# Early-stopping state: best accuracy so far, epochs without improvement, and the limit.
best_acc, bad_epochs, patience = -1.0, 0, 3
best_path = os.path.join(CACHE_DIR, "best_cls.pt")

for ep in trange(5, desc="Epochs"):
    model.train()
    # `running` accumulates per-batch mean losses; `nb` counts batches.
    running, nb = 0.0, 0
    for u,i,r in tqdm(train_dl, desc=f"Train {ep}", leave=False):
        u,i = u.to(device), i.to(device)
        y   = ratings_to_classes(r).to(device)
        logits = model(u,i)
        loss = criterion(logits, y)
        opt.zero_grad(); loss.backward(); opt.step()
        running += float(loss.item()); nb += 1

    val_acc = evaluate_acc(model, valid_dl, device)
    print(f"Epoch {ep:02d} | Train loss {running/max(nb,1):.4f} | Valid Acc {val_acc:.4f}")
    if val_acc > best_acc:
        # New best validation accuracy: checkpoint and reset the counter.
        best_acc = val_acc
        torch.save(model.state_dict(), best_path)
        bad_epochs = 0
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            print("Early stopping.")
            break

print(f"Best valid accuracy: {best_acc:.4f}")

# load best for testing
# dropout=0.0 only changes the dropout modules configured from that argument;
# the restored model is also put in eval mode below.
best = DeepCoNNCachedClassifier(emb_dim=emb_dim, rc=10, kernel_count=100, cnn_out_dim=50, num_classes=3, dropout=0.0).to(device)
best.load_state_dict(torch.load(best_path, map_location=device))
best.eval()


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
from IPython.display import Image, display

# Collect predictions of the restored classifier on the test split
y_true, y_pred = [], []
with torch.no_grad():
    for u,i,r in test_dl:
        logits = best(u.to(device), i.to(device))
        y_pred.append(logits.argmax(dim=1).cpu())
        y_true.append(ratings_to_classes(r).cpu())

y_true = torch.cat(y_true).numpy()
y_pred = torch.cat(y_pred).numpy()

# Metrics
# Pass the full label set to both functions: without `labels`, classification_report
# infers the classes from the data and raises if that count differs from
# len(target_names) (e.g. when one class never occurs in y_true or y_pred).
class_ids = list(range(len(LABEL_NAMES)))
acc = accuracy_score(y_true, y_pred)
cm  = confusion_matrix(y_true, y_pred, labels=class_ids)
rep = classification_report(y_true, y_pred, labels=class_ids, target_names=LABEL_NAMES, digits=4)

print(f"Test accuracy: {acc:.4f}")
print(rep)

# Save report to a text file under FIG_DIR
rep_path = os.path.join(FIG_DIR, "sentiment_classification_report.txt")
with open(rep_path, "w", encoding="utf-8") as f:
    f.write(f"Accuracy: {acc:.4f}\n\n{rep}\n")
print("Classification report saved →", rep_path)

# Plot & save confusion matrix (single-axes, default matplotlib colours)
fig, ax = plt.subplots(figsize=(5.5, 5.0))
im = ax.imshow(cm)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_xticks(np.arange(len(LABEL_NAMES)))
ax.set_yticks(np.arange(len(LABEL_NAMES)))
ax.set_xticklabels(LABEL_NAMES)
ax.set_yticklabels(LABEL_NAMES)
# Write the raw count into every cell (rows = true class, columns = predicted class).
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, str(cm[i, j]), ha="center", va="center")
fig.tight_layout()
cm_path = os.path.join(FIG_DIR, "sentiment_confusion_matrix.png")
fig.savefig(cm_path, dpi=150)
plt.close(fig)
print("Confusion matrix saved →", cm_path)

# Display the saved image in the notebook output
display(Image(filename=cm_path))
