In [None]:
# Standard libs
import os, sys, time, math, json, inspect, argparse, random, re
from pathlib import Path

# PyData / ML
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Tokenization (no extra downloads required)
from nltk.tokenize import WordPunctTokenizer

# Hugging Face datasets
try:
    from datasets import load_dataset
except Exception as e:
    raise RuntimeError(
        "Please install 'datasets' first, e.g. `pip install datasets`"
    ) from e

# ---- Reproducibility
def set_seed(seed=42):
    """Seed every RNG this notebook relies on with the same value.

    Covers Python's ``random``, NumPy's global RNG, PyTorch's CPU RNG and
    all CUDA device RNGs, in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
set_seed(42)


In [None]:
# Report whether PyTorch can see a CUDA device in this session.
print("CUDA available:", torch.cuda.is_available())
# Set TOKENIZERS_PARALLELISM to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from pathlib import Path

# Treat the presence of a /kaggle directory as "running on Kaggle".
IN_KAGGLE = Path("/kaggle").exists()

if IN_KAGGLE:
    # Kaggle layout: inputs under /kaggle/input, outputs under /kaggle/working.
    INPUT_ROOT = Path("/kaggle/input")
    WORK_ROOT  = Path("/kaggle/working")

    STOPWORDS = INPUT_ROOT / "digital-music-5" / "stopwords.txt"
    PUNCTS    = INPUT_ROOT / "digital-music-5" / "punctuations.txt"
    GLOVE     = INPUT_ROOT / "glove6b100dtxt" / "glove.6B.100d.txt"

    RAW_ALL_BEAUTY = INPUT_ROOT / "all-beauty" / "All_Beauty.jsonl"
    LOCAL_ST_MODEL = INPUT_ROOT / "minilm-l6-v2-local" / "all-MiniLM-L6-v2"
else:
    # Local layout: everything is resolved relative to the current working directory.
    PROJECT_ROOT = Path.cwd()

    # if the folders are directly under the project root:
    RAW_ALL_BEAUTY = PROJECT_ROOT / "All_Beauty" / "All_Beauty.jsonl"
    # NOTE: the GloVe file is expected inside a folder that carries the same name as the file.
    GLOVE          = PROJECT_ROOT / "glove.6B.100d.txt" / "glove.6B.100d.txt"
    STOPWORDS      = PROJECT_ROOT / "Digital_Music_5" / "stopwords.txt"
    PUNCTS         = PROJECT_ROOT / "Digital_Music_5" / "punctuations.txt"
    # pointing to the directory that contains config.json, tokenizer.json, etc.
    LOCAL_ST_MODEL = PROJECT_ROOT / "all-MiniLM-L6-v2" / "all-MiniLM-L6-v2"

    # ARTIFACTS is only defined on the local branch; WORK_ROOT only on the Kaggle branch.
    ARTIFACTS = PROJECT_ROOT / "artifacts"

# Output directories as Path
WORK_DIR      = WORK_ROOT if IN_KAGGLE else ARTIFACTS
# NOTE(review): the processed-data folder is called "Amazon_Fashion" although the raw
# input above is All_Beauty.jsonl — confirm the name is intentional.
SAVE_DATA_DIR = WORK_DIR / "Amazon_Fashion"
MODEL_DIR     = WORK_DIR / "model"
FIG_DIR       = WORK_DIR / "fig"
ST_CACHE_DIR  = WORK_DIR / "st_cache"

# Create directories
for d in [SAVE_DATA_DIR, MODEL_DIR, FIG_DIR, ST_CACHE_DIR]:
    d.mkdir(parents=True, exist_ok=True)


In [None]:
# Sanity check: print each configured input path together with whether it exists on disk.
for p in [RAW_ALL_BEAUTY, GLOVE, STOPWORDS, PUNCTS, LOCAL_ST_MODEL]:
    print(p, "exists:", p.exists())

In [None]:
import os
import pandas as pd
from datasets import load_dataset  # optional fallback
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer


def load_all_beauty_local(jsonl_path: str | Path | None = None):
    """
    Load All_Beauty.jsonl (Kaggle or local) and return a pandas DataFrame
    with standardized columns: userID, itemID, review, rating.
    """
    if jsonl_path is None:
        jsonl_path = RAW_ALL_BEAUTY
    jsonl_path = Path(jsonl_path)

    needed = ("user_id", "asin", "text", "rating")

    def _standardize_cols(df):
        alt_map = {
            "reviewText": "text",
            "overall": "rating",
            "user": "user_id",
            "item": "asin",
        }
        for old, new in alt_map.items():
            if old in df.columns and new not in df.columns:
                df[new] = df[old]

        missing = [c for c in needed if c not in df.columns]
        if missing:
            raise KeyError(
                f"Missing required columns {missing}. "
                "Make sure your JSONL has keys like: user_id, asin, text, rating."
            )

        df = df[list(needed)].copy()
        df.columns = ["userID", "itemID", "review", "rating"]
        df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
        df = df[df["rating"].notnull()]
        df = df[df["review"].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]
        df.reset_index(drop=True, inplace=True)
        return df



    try:
        df = pd.read_json(jsonl_path, lines=True)
        return _standardize_cols(df)
    except Exception as e_pd:
        try:
            ds = load_dataset("json", data_files=str(jsonl_path), split="train")
            df = ds.to_pandas()
            return _standardize_cols(df)
        except Exception as e_hf:
            raise RuntimeError(
                f"Failed to load JSONL via pandas ({type(e_pd).__name__}: {e_pd}) "
                f"and datasets ({type(e_hf).__name__}: {e_hf})."
            )


In [None]:
# --- keep your helpers as-is ---
def _read_list(path):
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Required file not found: {path}\n"
            "Place a plain-text file with one token per line."
        )
    with open(path, encoding="utf-8") as f:
        return set(ln.strip() for ln in f if ln.strip())

def process_df_to_csv(df, stopwords_path, puncts_path, train_rate, csv_path):
    """Encode IDs, clean review text, split into train/valid/test and write CSVs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``userID``, ``itemID`` and ``review`` columns. The caller's
        frame is left untouched; all edits happen on an internal copy so the
        cell that produced ``df`` can be re-run or inspected safely.
    stopwords_path, puncts_path : path-like
        One-token-per-line files read with ``_read_list``.
    train_rate : float
        Fraction of rows kept for training; the remainder is halved into
        validation and test.
    csv_path : path-like
        Output directory (created if needed) for ``train.csv``, ``valid.csv``
        and ``test.csv``, written without header or index.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, valid_df, test_df)``.
    """
    # Do not mutate the caller's DataFrame (the ID re-encoding and text cleaning
    # below are destructive).
    df = df.copy()

    # Map IDs to contiguous integers
    df["userID"] = df["userID"].astype("category").cat.codes
    df["itemID"] = df["itemID"].astype("category").cat.codes

    # Load stopwords/punctuations
    stop_words   = _read_list(stopwords_path)
    punctuations = _read_list(puncts_path)
    tok = WordPunctTokenizer()

    def clean_review(review: str) -> str:
        # Lowercase, blank out every listed punctuation string, tokenize, drop stopwords.
        rv = review.lower()
        for p in punctuations:
            rv = rv.replace(p, " ")
        toks = tok.tokenize(rv)
        toks = [w for w in toks if w not in stop_words]
        return " ".join(toks)

    print("#### Cleaning text (this can take a while on large splits)...")
    df["review"] = df["review"].apply(clean_review)

    # Train/valid/test split (fixed random_state values keep the split reproducible)
    train_df, valid_test_df = train_test_split(df, test_size=1 - train_rate, random_state=3)
    valid_df, test_df = train_test_split(valid_test_df, test_size=0.5, random_state=4)

    os.makedirs(csv_path, exist_ok=True)
    train_df.to_csv(os.path.join(csv_path, "train.csv"), index=False, header=False)
    valid_df.to_csv(os.path.join(csv_path, "valid.csv"), index=False, header=False)
    test_df .to_csv(os.path.join(csv_path, "test.csv"),  index=False, header=False)

    print(f"#### Saved CSVs to {csv_path}")
    print(f"#### Split sizes: train {len(train_df)}, valid {len(valid_df)}, test {len(test_df)}")
    print(f"#### Totals: {len(df)} reviews, {df['userID'].nunique()} users, {df['itemID'].nunique()} items.")
    return train_df, valid_df, test_df

In [None]:
csv_train = SAVE_DATA_DIR / "train.csv"
# All three split files are required downstream; checking only train.csv would
# treat an interrupted earlier run (train.csv written, others missing) as complete.
split_csvs = [SAVE_DATA_DIR / f"{split_name}.csv" for split_name in ("train", "valid", "test")]

if not all(split_csv.exists() for split_csv in split_csvs):
    df_raw = load_all_beauty_local()  # uses Kaggle or local automatically
    _ = process_df_to_csv(
        df_raw,
        stopwords_path=STOPWORDS,
        puncts_path=PUNCTS,
        train_rate=0.8,
        csv_path=SAVE_DATA_DIR,
    )
else:
    print("CSV files already exist — skipping reprocessing.")


In [None]:
def now(f='%Y-%m-%d %H:%M:%S'):
    """Return the current local time formatted with the strftime pattern ``f``."""
    current = time.localtime()
    return time.strftime(f, current)

class Config:
    """Hyper-parameters and file locations, exposed as attributes.

    Every non-dunder, non-function attribute is registered as an argparse
    option in ``__init__``; because an empty argument list is parsed, the
    instance simply receives the class-level defaults.
    """

    # Device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Training
    train_epochs        = 5
    batch_size          = 128
    learning_rate       = 2e-3
    l2_regularization   = 1e-6
    learning_rate_decay = 0.99
    patience            = 3

    # Files
    word2vec_file = GLOVE
    train_file    = os.path.join(SAVE_DATA_DIR, 'train.csv')
    valid_file    = os.path.join(SAVE_DATA_DIR, 'valid.csv')
    test_file     = os.path.join(SAVE_DATA_DIR, 'test.csv')
    model_file    = os.path.join(MODEL_DIR, 'best_model.pt')

    # Data shaping
    review_count         = 10    # number of reviews per side
    review_length        = 40    # tokens per review
    lowest_review_count  = 2
    PAD_WORD             = '<UNK>'

    # Model sizes
    kernel_count = 100
    kernel_size  = 3
    dropout_prob = 0.5
    cnn_out_dim  = 50

    def __init__(self):
        # Allow CLI/nb override (no-op by default)
        members = inspect.getmembers(self, lambda member: not inspect.isfunction(member))
        parser = argparse.ArgumentParser(add_help=False)
        for name, default in members:
            if name.startswith('__'):
                continue
            parser.add_argument('--' + name, dest=name, type=type(default), default=default)
        # An explicit empty argv keeps notebook/kernel arguments out of the parse.
        parsed, _unknown = parser.parse_known_args([])
        for name, value in vars(parsed).items():
            setattr(self, name, value)

    def __str__(self):
        members = inspect.getmembers(self, lambda member: not inspect.isfunction(member))
        return "\n".join(
            f"{name} = {value}"
            for name, value in members
            if not name.startswith('__')
        )

In [None]:
# Cell 1: imports & small utils
import os, json, re, csv, hashlib
from pathlib import Path
from collections import defaultdict

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

def ensure_dir(p: Path):
    """Create directory ``p`` (and missing parents); do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

def simple_sentence_split(text: str, max_sentences: int):
    """Split ``text`` at whitespace that follows '.', '!' or '?'.

    Returns at most ``max_sentences`` stripped, non-empty pieces; non-string
    or blank input yields an empty list.
    """
    if not isinstance(text, str):
        return []
    stripped = text.strip()
    if not stripped:
        return []

    kept = []
    for piece in re.split(r'(?<=[.!?])\s+', stripped):
        piece = piece.strip()
        if piece:
            kept.append(piece)
    return kept[:max_sentences]

def sha1(s: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()


In [None]:
def read_split(csv_path: str):
    """Reads CSV with no header: userID,itemID,review,rating

    Returns a list of ``(int userID, int itemID, str review, float rating)``
    tuples in file order. Blank lines are skipped.
    """
    rows = []
    # newline="" hands raw line endings to the csv module (as its documentation
    # requires), so line breaks inside quoted review text are preserved verbatim.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        for record in csv.reader(f):
            if not record:
                # csv.reader yields [] for a blank line; unpacking it would raise.
                continue
            user_id, item_id, review, rating = record
            rows.append((int(user_id), int(item_id), review, float(rating)))
    return rows

In [None]:
def load_embedding(word2vec_file):
    """
    Loads GloVe text file into:
    - word_emb: np.ndarray shape (V, D), row 0 is <UNK> = 0-vector
    - word_dict: {token -> idx}

    Each non-blank line is ``token v1 v2 ... vD`` separated by single spaces.
    Lines with a token but no numbers are skipped. The dimension D is taken
    from the first vector actually parsed (not from file line 0, which may be
    blank). A token that appears more than once — or that collides with the
    reserved ``<UNK>`` entry — keeps its first index, so ``word_dict`` values
    always point at the matching row of ``word_emb``.

    Raises
    ------
    FileNotFoundError
        If ``word2vec_file`` does not exist.
    ValueError
        If a vector's length differs from the first parsed vector's length.
    """
    if not os.path.exists(word2vec_file):
        raise FileNotFoundError(
            f"GloVe file not found at {word2vec_file}.\n"
            "Place 'glove.6B.100d.txt' under assets/ or Kaggle input."
        )
    word_dict = {'<UNK>': 0}
    vectors = []          # rows 1..V-1 of the final matrix
    dim = None            # set by the first parsed vector
    with open(word2vec_file, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            toks = line.split(' ')
            word, vec = toks[0], toks[1:]
            if not vec:
                continue
            if word in word_dict:
                # Appending a row without a new dict entry would shift every later index.
                continue
            vec = [float(x) for x in vec]
            if dim is None:
                dim = len(vec)
            elif len(vec) != dim:
                raise ValueError(
                    f"Inconsistent vector length on line {line_no} of {word2vec_file}: "
                    f"expected {dim}, got {len(vec)}."
                )
            vectors.append(vec)
            word_dict[word] = len(vectors)  # row 0 is reserved for <UNK>
    # With no vectors at all, keep the 1-dimensional placeholder row.
    unk_row = [0.0] * (dim if dim is not None else 1)
    return np.array([unk_row] + vectors, dtype=np.float32), word_dict


In [None]:
from pathlib import Path
import json, csv, re, hashlib
import numpy as np
import torch
from nltk.tokenize import WordPunctTokenizer

def ensure_dir(p: Path):
    """Make sure directory ``p`` exists, creating parents as needed.

    NOTE: this name is also defined in an earlier cell with the same body;
    this later definition is the one in effect after both cells have run.
    """
    p.mkdir(exist_ok=True, parents=True)

def sha1(s: str) -> str:
    """Hex SHA-1 of the UTF-8 bytes of ``s`` (used as a stable key for review text)."""
    encoded = s.encode("utf-8")
    digest = hashlib.sha1(encoded)
    return digest.hexdigest()

def simple_sentence_split(text: str, max_sentences: int):
    """Return up to ``max_sentences`` sentences from ``text``.

    A sentence boundary is any whitespace run preceded by '.', '!' or '?'.
    Non-string or blank input returns ``[]``.
    """
    usable = isinstance(text, str) and bool(text.strip())
    if not usable:
        return []
    pieces = (chunk.strip() for chunk in re.split(r'(?<=[.!?])\s+', text.strip()))
    sentences = [chunk for chunk in pieces if chunk]
    return sentences[:max_sentences]

def read_split(csv_path: str):
    """Read a header-less CSV whose columns are userID,itemID,review,rating.

    Returns a list of ``(int userID, int itemID, str review, float rating)``
    tuples in file order. Blank lines are skipped.
    """
    rows = []
    # The csv module expects files opened with newline="" so that it, not the
    # text layer, interprets line endings (keeps quoted multi-line fields intact).
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        for record in csv.reader(f):
            if not record:
                # A blank line is reported as an empty list; skip it instead of
                # failing on the 4-way unpack below.
                continue
            user_id, item_id, review, rating = record
            rows.append((int(user_id), int(item_id), review, float(rating)))
    return rows

def precompute_sent_embeddings_glove(
    train_csv: str,
    valid_csv: str,
    test_csv: str,
    out_dir: str,
    word2vec_file: str,
    S: int = 40,
    lowercase: bool = True,
    batch_write: int = 5000,  # how many sentences to aggregate before writing (memmap is fine, but this keeps RAM low)
):
    """
    GloVe version of precompute:
      - Build unique sentence inventory (cap S per review)
      - Embed each sentence by mean of GloVe word vectors (UNK=0 ignored if no known tokens)
      - Write the same cache layout your STCachedDataset expects

    Parameters
    ----------
    train_csv, valid_csv, test_csv : path-like
        Header-less split files readable by ``read_split``.
    out_dir : path-like
        Cache directory (created if missing).
    word2vec_file : path-like
        GloVe text file passed to ``load_embedding``.
    S : int
        Maximum number of sentences kept per review; shorter reviews are
        padded with sentence id -1.
    lowercase : bool
        Lowercase review text before hashing and splitting.
    batch_write : int
        Number of sentence vectors buffered in memory before each write.

    Files written under ``out_dir``
    -------------------------------
    - ``embeddings.npy``: float32 matrix of shape (N_sentences, D) written
      through ``np.memmap`` (a raw buffer; it is read back with ``np.memmap``).
    - ``sentences.jsonl``: one ``{"sent_id", "text"}`` object per sentence.
    - ``review2sent_ids.jsonl``: one ``{"review_hash", "sent_ids"}`` per review.
    - ``splits/<split>/rows.jsonl``, ``user_reviews.json``, ``item_reviews.json``.

    Raises
    ------
    ValueError
        If the splits contain no sentence at all (nothing could be embedded
        and the resulting cache would be unusable).
    """
    out_dir = Path(out_dir)
    ensure_dir(out_dir)
    splits_dir = out_dir / "splits"
    ensure_dir(splits_dir)

    # 1) Read all splits
    split_paths = {"train": Path(train_csv), "valid": Path(valid_csv), "test": Path(test_csv)}
    splits = {k: read_split(str(v)) for k, v in split_paths.items()}

    # 2) Collect unique reviews + per-split manifests
    all_reviews = {}          # review_hash -> raw_review_text
    split_rows = {}           # split -> list[dict]
    user_reviews = {}         # split -> {userID: [review_hash,...]}
    item_reviews = {}         # split -> {itemID: [review_hash,...]}

    for split, rows in splits.items():
        s_rows = []
        u_map = {}
        i_map = {}
        for (u, it, review, rating) in rows:
            rtxt = review.lower() if lowercase else review
            r_hash = sha1(rtxt)
            if r_hash not in all_reviews:
                all_reviews[r_hash] = rtxt
            s_rows.append({"userID": u, "itemID": it, "rating": rating, "review_hash": r_hash})
            u_map.setdefault(u, []).append(r_hash)
            i_map.setdefault(it, []).append(r_hash)
        split_rows[split] = s_rows
        # JSON object keys must be strings, hence the str() on the integer IDs.
        user_reviews[split] = {str(k): v for k, v in u_map.items()}
        item_reviews[split] = {str(k): v for k, v in i_map.items()}

    # 3) Build deduped sentence inventory
    sentence_to_id = {}
    sentences = []  # index -> text
    review_to_sentids = {}  # review_hash -> [S] sentence ids (pad with -1)

    for r_hash, rtxt in all_reviews.items():
        sents = simple_sentence_split(rtxt, S)
        ids = []
        for s in sents:
            if s not in sentence_to_id:
                sentence_to_id[s] = len(sentences)
                sentences.append(s)
            ids.append(sentence_to_id[s])
        ids = (ids + [-1]*(S - len(ids))) if len(ids) < S else ids[:S]
        review_to_sentids[r_hash] = ids

    print(f"[build] Unique reviews: {len(all_reviews):,}")
    print(f"[build] Unique sentences: {len(sentences):,}")

    N = len(sentences)
    if N == 0:
        # Fail early with a clear message instead of an opaque error from
        # creating a zero-length memory map further down.
        raise ValueError(
            "No sentences found in the provided splits; nothing to embed."
        )

    # 4) Load GloVe
    word_emb, word_dict = load_embedding(word2vec_file)
    D = int(word_emb.shape[1])
    print(f"[glove] Vocab: {word_emb.shape[0]:,}, Dim: {D}")

    # 5) Encode all unique sentences (mean of token vectors)
    tok = WordPunctTokenizer()
    EMB = np.memmap(out_dir / "embeddings.npy", dtype="float32", mode="w+", shape=(N, D))

    def sent_vec(text: str):
        tokens = tok.tokenize(text.lower())
        # Index 0 is <UNK>; drop it so unseen words do not pull the mean towards zero.
        real = [i for i in (word_dict.get(w, 0) for w in tokens) if i != 0]
        if not real:
            # No tokens, or no known tokens: represent the sentence as a zero vector.
            return np.zeros((D,), dtype=np.float32)
        vecs = word_emb[np.array(real)]
        return vecs.mean(axis=0).astype(np.float32)

    # Write in chunks: `start` is the row at which the current buffer begins.
    buf = []
    start = 0
    for s in sentences:
        buf.append(sent_vec(s))
        if len(buf) >= batch_write:
            EMB[start:start + len(buf), :] = np.stack(buf, axis=0)
            start += len(buf)
            buf.clear()
    if buf:
        EMB[start:start + len(buf), :] = np.stack(buf, axis=0)
        buf.clear()
    EMB.flush()
    del EMB
    print("[encode] embeddings.npy written.")

    # 6) Metadata files
    with open(out_dir / "sentences.jsonl", "w", encoding="utf-8") as f:
        for sid, txt in enumerate(sentences):
            f.write(json.dumps({"sent_id": sid, "text": txt}, ensure_ascii=False) + "\n")

    with open(out_dir / "review2sent_ids.jsonl", "w", encoding="utf-8") as f:
        for r_hash, ids in review_to_sentids.items():
            f.write(json.dumps({"review_hash": r_hash, "sent_ids": ids}) + "\n")

    # Per-split artifacts
    for split in ["train", "valid", "test"]:
        sp = splits_dir / split
        ensure_dir(sp)
        with open(sp / "rows.jsonl", "w", encoding="utf-8") as f:
            for row in split_rows[split]:
                f.write(json.dumps(row) + "\n")
        with open(sp / "user_reviews.json", "w", encoding="utf-8") as f:
            json.dump(user_reviews[split], f)
        with open(sp / "item_reviews.json", "w", encoding="utf-8") as f:
            json.dump(item_reviews[split], f)

    print(f"[done] Wrote cache to: {out_dir.resolve()}")
    print("      Files:")
    print("       - embeddings.npy  (shape: ", (N, D), ")")
    print("       - sentences.jsonl")
    print("       - review2sent_ids.jsonl")
    print("       - splits/*/rows.jsonl, user_reviews.json, item_reviews.json")


In [None]:
# Cell 3: run precompute — EDIT THESE PATHS
# The split CSVs are read from SAVE_DATA_DIR; the cache is written to ST_CACHE_DIR.
TRAIN_CSV = SAVE_DATA_DIR / "train.csv"
VALID_CSV = SAVE_DATA_DIR / "valid.csv"
TEST_CSV  = SAVE_DATA_DIR / "test.csv"
OUT_DIR   = ST_CACHE_DIR

# NOTE: this call is unconditional, so the cache is rebuilt every time the cell runs.
precompute_sent_embeddings_glove(
    train_csv=TRAIN_CSV,
    valid_csv=VALID_CSV,
    test_csv=TEST_CSV,
    out_dir=OUT_DIR,
    word2vec_file=GLOVE,   # e.g. ".../glove.6B.300d.txt"
    S=40,
    lowercase=True,
)


In [None]:
# Cell 4: STCachedDataset (fixed rc/S) for fast numeric I/O training
import json, numpy as np, torch
from pathlib import Path
from torch.utils.data import Dataset

class STCachedDataset(Dataset):
    """
    Map-style dataset over the on-disk cache (fixed rc/S view).

    Files read from ``cache_dir``:
    - embeddings.npy  -> float32 memmap, reshaped to (n_sentences, H)
    - sentences.jsonl -> only its line count is used (gives n_sentences)
    - review2sent_ids.jsonl -> review_hash -> list of sentence ids (-1 = pad)
    - splits/{split}/rows.jsonl
    - splits/{split}/user_reviews.json, item_reviews.json

    Each item is ``(user_tensor, item_tensor, rating)`` with the two tensors
    shaped (rc, S, H) and rating a float32 tensor of shape (1,).
    """
    def __init__(self, cache_dir, split, rc=10, S=40):
        self.cache_dir = Path(cache_dir)
        self.split = split
        self.rc = rc
        self.S = S

        root = self.cache_dir
        split_dir = root / "splits" / split

        # Flat memmap first; the embedding width H is inferred from the number
        # of sentences recorded in sentences.jsonl.
        flat = np.memmap(root / "embeddings.npy", dtype="float32", mode="r")
        n_sent = 0
        with open(root / "sentences.jsonl", "r", encoding="utf-8") as fh:
            for _line in fh:
                n_sent += 1
        self.H = flat.size // n_sent
        self.emb = flat.reshape(n_sent, self.H)

        # review hash -> sentence id list
        self.rev2ids = {}
        with open(root / "review2sent_ids.jsonl", "r", encoding="utf-8") as fh:
            for record in map(json.loads, fh):
                self.rev2ids[record["review_hash"]] = record["sent_ids"]

        # one dict per rating row of this split
        with open(split_dir / "rows.jsonl", "r", encoding="utf-8") as fh:
            self.rows = [json.loads(line) for line in fh]

        def _load_grouping(filename):
            # JSON keys are strings; convert them back to integer IDs.
            with open(split_dir / filename, "r", encoding="utf-8") as fh:
                return {int(key): hashes for key, hashes in json.load(fh).items()}

        self.user_map = _load_grouping("user_reviews.json")
        self.item_map = _load_grouping("item_reviews.json")

    def _review_tensor(self, review_hashes):
        """
        Stack up to ``rc`` reviews into a (rc, S, H) float32 tensor.

        Only the first ``rc`` hashes are used; unused review slots, padded
        sentence positions (id -1) and unknown hashes stay all-zero.
        """
        block = np.zeros((self.rc, self.S, self.H), dtype=np.float32)
        for slot, review_hash in enumerate(review_hashes[:self.rc]):
            if review_hash == "<PAD>":
                continue
            stored = self.rev2ids.get(review_hash, [-1]*self.S)
            # Truncate or right-pad the id list to exactly S entries.
            if len(stored) >= self.S:
                sent_ids = np.array(stored[:self.S])
            else:
                sent_ids = np.array(stored + [-1]*(self.S - len(stored)))
            present = sent_ids >= 0
            if present.any():
                block[slot, present, :] = self.emb[sent_ids[present]]
        return torch.from_numpy(block)

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        record = self.rows[idx]
        own_review = record["review_hash"]
        # Fall back to the row's own review when the user/item has no grouping entry.
        user_hashes = self.user_map.get(record["userID"], [own_review])
        item_hashes = self.item_map.get(record["itemID"], [own_review])
        target = torch.tensor([record["rating"]], dtype=torch.float32)
        return self._review_tensor(user_hashes), self._review_tensor(item_hashes), target


In [None]:
# Cell 5: quick test that cache & dataset load correctly
from torch.utils.data import DataLoader

# Smoke test: build the training dataset from the cache and pull a single batch.
cache_dir = OUT_DIR  # from Cell 3
train_ds = STCachedDataset(cache_dir, "train", rc=10, S=40)
dl = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=0)

# Only the first batch is drawn; the printed shapes confirm the tensor layout.
batch = next(iter(dl))
u, i, r = batch
print("User batch:", tuple(u.shape))  # (B, rc, S, H)
print("Item batch:", tuple(i.shape))
print("Ratings  :", tuple(r.shape))


In [None]:
import torch.nn as nn
import torch.nn.functional as F

class FactorizationMachine(nn.Module):
    """Second-order factorization machine head.

    Output = linear(x) + 0.5 * sum_f [ (x V)_f^2 - (x^2 V^2)_f ], returned
    with shape (batch, 1) for input of shape (batch, in_dim).
    """
    def __init__(self, in_dim, k):
        super().__init__()
        self.linear = nn.Linear(in_dim, 1)
        # Small random init for the (in_dim, k) factor matrix.
        self.V = nn.Parameter(0.01 * torch.randn(in_dim, k))

    def forward(self, x):
        projected = torch.matmul(x, self.V)                     # (batch, k)
        squared_terms = torch.matmul(x.pow(2), self.V.pow(2))   # (batch, k)
        interaction = (projected.pow(2) - squared_terms).sum(dim=1, keepdim=True)
        return self.linear(x) + 0.5 * interaction

In [None]:
from transformers import AutoModel, AutoConfig, BertConfig, BertModel

class HFSeqEncoder(nn.Module):
    """
    Encode a sequence of sentence vectors with a Hugging Face backbone.

    Tries to load the pretrained ``backbone``; with ``offline_only=True`` only
    local files are used (no networking). If loading fails for any reason,
    falls back to a randomly-initialised tiny BERT.

    Parameters
    ----------
    glove_dim : int
        Width of the incoming sentence vectors.
    backbone : str
        Model name or local path given to ``from_pretrained``.
    dropout : float
        Dropout applied to the pooled output (and used inside the fallback BERT).
    freeze_backbone : bool
        Freeze backbone weights — only applied when a pretrained backbone was
        actually loaded (a random fallback is never frozen).
    offline_only : bool
        Passed as ``local_files_only``; also requests transformers' offline mode.
    fallback_hidden, fallback_layers, fallback_heads : int
        Geometry of the fallback BERT.
    """
    def __init__(self, glove_dim: int,
                 backbone: str = "prajjwal1/bert-tiny",
                 dropout: float = 0.1,
                 freeze_backbone: bool = False,
                 offline_only: bool = True,   # key flag: avoid network calls
                 fallback_hidden: int = 128,  # used if we build random tiny BERT
                 fallback_layers: int = 2,
                 fallback_heads: int = 2):
        super().__init__()

        # Ask for offline mode only when the caller wants it: the variable is
        # process-wide, so setting it unconditionally would contradict
        # offline_only=False. setdefault keeps any value the user already exported.
        if offline_only:
            os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")

        loaded = False
        try:
            # Try to use a locally cached pretrained model (no internet when offline_only)
            self.config = AutoConfig.from_pretrained(backbone, local_files_only=offline_only)
            self.backbone = AutoModel.from_pretrained(backbone, local_files_only=offline_only)
            loaded = True
        except Exception as e:
            print(f"[HFSeqEncoder] Could not load '{backbone}' locally: {type(e).__name__}: {e}")
            print("[HFSeqEncoder] Falling back to a randomly initialised tiny BERT.")
            # Build a small BERT from scratch (random init) so shapes are sane
            self.config = BertConfig(
                hidden_size=fallback_hidden,
                num_hidden_layers=fallback_layers,
                num_attention_heads=fallback_heads,
                intermediate_size=fallback_hidden * 4,
                max_position_embeddings=512,
                vocab_size=30522,
                hidden_dropout_prob=dropout,
                attention_probs_dropout_prob=dropout,
            )
            self.backbone = BertModel(self.config)

        if freeze_backbone and loaded:
            for p in self.backbone.parameters():
                p.requires_grad = False

        # Project sentence vectors to the backbone's hidden width.
        self.in_proj = nn.Linear(glove_dim, self.config.hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):  # x: (B*rc, S, H_glove)
        """Return one pooled vector per sequence: (B*rc, hidden)."""
        # Build attention mask: positions whose input vector is exactly zero
        # are treated as padding (this includes any real sentence whose cached
        # vector happens to be all zeros).
        attn_mask = (x.abs().sum(dim=-1) > 0)  # bool mask (B*rc, S)

        x = self.in_proj(x)  # (B*rc, S, hidden)
        out = self.backbone(inputs_embeds=x, attention_mask=attn_mask).last_hidden_state  # (B*rc, S, hidden)

        # masked mean over S
        mask = attn_mask.unsqueeze(-1)                        # (B*rc, S, 1)
        summed = (out * mask).sum(dim=1)                      # (B*rc, hidden)
        # clamp avoids 0/0 for sequences that are entirely padding
        denom = mask.sum(dim=1).clamp(min=1).float()          # (B*rc, 1)
        pooled = summed / denom
        return self.dropout(pooled)


In [None]:
# ----- Drop this cell before the training loop -----
import torch
import torch.nn as nn
import torch.nn.functional as F

class DeepCoNNTransformer(nn.Module):
    """
    GloVe -> projection -> (tiny) BERT encoder per review-sentence block,
    pooled across sentences and across rc reviews, then MLP to a scalar.

    Inputs to ``forward`` are the (B, rc, S, H_glove) user and item tensors
    produced by STCachedDataset; the output is a (B, 1) rating prediction.
    """
    def __init__(
        self,
        glove_dim: int,          # must equal train_ds.H (e.g., 100 for glove.6B.100d)
        rc: int = 10,            # number of reviews per side (matches STCachedDataset)
        backbone: str = "prajjwal1/bert-tiny",
        proj_dim: int = 128,     # hidden size to use if we build our own tiny BERT
        dropout: float = 0.2,
        freeze_backbone: bool = False,
        offline_only: bool = True
    ):
        super().__init__()
        self.rc = rc

        # Two identical encoders (weights NOT shared by default in DeepCoNN literature).
        # If you want to share, reuse the same module instance for both.
        self.user_enc = HFSeqEncoder(
            glove_dim=glove_dim,
            backbone=backbone,
            dropout=dropout,
            freeze_backbone=freeze_backbone,
            offline_only=offline_only,
            fallback_hidden=proj_dim,
        )
        self.item_enc = HFSeqEncoder(
            glove_dim=glove_dim,
            backbone=backbone,
            dropout=dropout,
            freeze_backbone=freeze_backbone,
            offline_only=offline_only,
            fallback_hidden=proj_dim,
        )

        hidden = self.user_enc.config.hidden_size  # from the backbone (or fallback config)
        self.out = nn.Sequential(
            nn.Linear(hidden * 2, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1)
        )

    def _encode_stack(self, x, route="user"):
        """
        x: (B, rc, S, H_glove)
        route: "user" selects ``user_enc``; any other value selects ``item_enc``.
        returns pooled: (B, hidden) by encoding each review then averaging over rc

        The encoder is chosen by argument rather than by a mutable attribute on
        the module, so this method can be called on its own and ``forward``
        keeps no per-call state.
        """
        B, rc, S, H = x.shape
        # reshape (not view) also accepts non-contiguous inputs
        x = x.reshape(B * rc, S, H)             # (B*rc, S, H)
        encoder = self.user_enc if route == "user" else self.item_enc
        rep = encoder(x)                        # (B*rc, hidden)
        rep = rep.view(B, rc, -1).mean(dim=1)   # (B, hidden), mean over rc
        return rep

    def forward(self, u, i):
        """
        u, i: (B, rc, S, H_glove) float tensors from STCachedDataset
        returns: (B, 1) predicted ratings
        """
        u_rep = self._encode_stack(u, route="user")
        i_rep = self._encode_stack(i, route="item")
        # concat and predict
        x = torch.cat([u_rep, i_rep], dim=-1)  # (B, 2*hidden)
        return self.out(x)


In [None]:
# Cell 7: Training loop with progress bars
from tqdm.auto import tqdm, trange
import time

def mse_to_rmse(m):
    """Convert a mean-squared-error value to its root (RMSE) as a Python float."""
    as_float = float(m)
    return as_float ** 0.5

def predict_mse(model, dataloader, device, desc="Eval"):
    """Mean squared error of ``model`` over every sample in ``dataloader``.

    Puts the model in eval mode (and leaves it there), runs without gradients,
    and divides the summed squared error by the number of samples seen
    (by 1 if the loader was empty).
    """
    model.eval()
    squared_error_sum = 0.0
    seen = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc, leave=False):
            u, i, r = (tensor.to(device) for tensor in batch)
            squared_error_sum += F.mse_loss(model(u, i), r, reduction="sum").item()
            seen += r.size(0)
    return squared_error_sum / max(seen, 1)

def predict_mae(model, dataloader, device, desc="Eval"):
    """Mean absolute error of ``model`` over every sample in ``dataloader``.

    Puts the model in eval mode (and leaves it there), runs without gradients,
    and divides the summed absolute error by the number of samples seen
    (by 1 if the loader was empty).
    """
    model.eval()
    absolute_error_sum = 0.0
    seen = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=desc, leave=False):
            u, i, r = (tensor.to(device) for tensor in batch)
            absolute_error_sum += F.l1_loss(model(u, i), r, reduction="sum").item()
            seen += r.size(0)
    return absolute_error_sum / max(seen, 1)

In [None]:
def train_loop(train_dl, valid_dl, model, device, epochs=5, lr=2e-3,
               patience=2, model_path="best.pt", weight_decay=1e-6):
    """Train ``model`` with Adam on summed MSE, with early stopping on validation MSE.

    Parameters
    ----------
    train_dl, valid_dl : iterable of (u, i, r) batches
    model : torch.nn.Module called as ``model(u, i)``
    device : torch.device the batches are moved to
    epochs : maximum number of epochs
    lr : Adam learning rate
    patience : stop after this many consecutive epochs without improvement
    model_path : where the best ``state_dict`` is saved
    weight_decay : Adam weight decay (L2); the default matches the previously
        hard-coded value.

    Returns
    -------
    float
        Best validation MSE observed (``inf`` if no epoch improved on it).
        The weights at ``model_path`` correspond to that epoch; the in-memory
        model holds the weights of the last epoch run.
    """
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_mse = float("inf")
    bad_epochs = 0

    for epoch in trange(epochs, desc="Epochs"):
        model.train()  # predict_* below switch to eval mode, so re-enable every epoch
        total_loss, total_samples = 0.0, 0
        pbar = tqdm(train_dl, desc=f"Train {epoch}", leave=False)

        for u, i, r in pbar:
            u, i, r = u.to(device), i.to(device), r.to(device)
            preds = model(u, i)
            # Summed (not averaged) loss: per-sample totals are accumulated for reporting.
            loss = F.mse_loss(preds, r, reduction="sum")

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss.item()
            total_samples += r.size(0)
            running_mse = total_loss / max(total_samples, 1)
            pbar.set_postfix(MSE=f"{running_mse:.4f}",
                             RMSE=f"{running_mse**0.5:.4f}")

        # ---- Validation phase ----
        valid_mse = predict_mse(model, valid_dl, device, desc="Valid")
        valid_mae = predict_mae(model, valid_dl, device, desc="Valid")

        train_mse = total_loss / max(total_samples, 1)
        train_rmse = train_mse ** 0.5
        valid_rmse = valid_mse ** 0.5

        print(
            f"Epoch {epoch:02d} | "
            f"Train RMSE {train_rmse:.4f} | "
            f"Valid RMSE {valid_rmse:.4f} | "
            f"Valid MAE {valid_mae:.4f}"
        )

        # ---- Early stopping ----
        if valid_mse < best_mse:
            best_mse = valid_mse
            torch.save(model.state_dict(), model_path)
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping.")
                break

    print(f"Best valid RMSE: {best_mse**0.5:.4f}")
    return best_mse


In [None]:
# ===== Replace your model init + training block with this =====
from torch.utils.data import DataLoader

cache_dir = OUT_DIR
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_ds = STCachedDataset(cache_dir, "train", rc=10, S=40)
valid_ds = STCachedDataset(cache_dir, "valid", rc=10, S=40)
test_ds  = STCachedDataset(cache_dir, "test",  rc=10, S=40)

pin = torch.cuda.is_available()
# Dedicated seeded generator for the training shuffle, so the batch order does not
# depend on how much of the global RNG other cells have consumed.
g = torch.Generator(); g.manual_seed(42)

# The generator must be handed to the DataLoader to have any effect on shuffling.
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True,  pin_memory=pin, num_workers=0, generator=g)
valid_dl = DataLoader(valid_ds, batch_size=128, shuffle=False, pin_memory=pin, num_workers=0)
test_dl  = DataLoader(test_ds,  batch_size=128, shuffle=False, pin_memory=pin, num_workers=0)

# IMPORTANT: glove_dim must match your cached sentence-embedding dim (= train_ds.H)
model = DeepCoNNTransformer(
    glove_dim=train_ds.H,
    rc=10,
    backbone="prajjwal1/bert-tiny",   # or 'sentence-transformers/all-MiniLM-L6-v2' if cached
    proj_dim=128,
    dropout=0.2,
    freeze_backbone=False
).to(device)





In [None]:
# The best checkpoint is stored inside the cache directory (not MODEL_DIR).
best_path = str(Path(cache_dir)/"best_model.pt")
# patience and lr are left at train_loop's defaults.
train_loop(train_dl, valid_dl, model, device, epochs=5, model_path=best_path)



In [None]:
# Rebuild the architecture and load the checkpoint saved by train_loop.
# NOTE(review): the constructor arguments here rely on defaults (proj_dim, dropout);
# they presumably need to match the trained model for load_state_dict to succeed — confirm.
best = DeepCoNNTransformer(glove_dim=train_ds.H, rc=10, backbone="prajjwal1/bert-tiny").to(device)
best.load_state_dict(torch.load(best_path, map_location=device))
# Two separate passes over the test loader: one for MSE, one for MAE.
mse = predict_mse(best, test_dl, device, desc="Test")
mae = predict_mae(best, test_dl, device, desc="Test")
print(f"Test RMSE={mse**0.5:.4f}, MAE={mae:.4f}")

In [None]:
# ---- Cell: rating -> class bins (0,1,2)
def rating_to_class(r):
    """Bucket a numeric rating into a sentiment class.

    r <= 2 -> 0 (negative), r >= 4 -> 2 (positive), anything else -> 1 (neutral).
    """
    if r <= 2.0:
        return 0
    return 2 if r >= 4.0 else 1


In [None]:
# ---- Cell: classification dataset wrapper
class STCachedDatasetCls(STCachedDataset):
    """Classification view of STCachedDataset.

    Yields the same user/item tensors as the parent class, but replaces the
    float rating with an integer class label (via ``rating_to_class``) stored
    in a 0-dim ``torch.long`` tensor.
    """
    def __getitem__(self, idx):
        user_block, item_block, rating_tensor = super().__getitem__(idx)  # rating_tensor: float tensor [1]
        label = rating_to_class(float(rating_tensor.item()))
        return user_block, item_block, torch.tensor(label, dtype=torch.long)


In [None]:
# ---- Cell: classifier model
class DeepCoNNClassifier(nn.Module):
    """Two-tower classifier over stacked user and item review embeddings.

    Each tower is an `HFSeqEncoder`. The per-review representations of a tower
    are averaged over the review axis, the two tower outputs are concatenated,
    and a small MLP head maps the result to `num_classes` logits.

    Args:
        glove_dim: Width of the cached input embeddings (forwarded to HFSeqEncoder).
        rc: Stored on the instance as `self.rc`; the forward pass reads the
            review count from the input shape instead.
        backbone: Backbone identifier forwarded to HFSeqEncoder.
        proj_dim: Forwarded to HFSeqEncoder.
        dropout: Dropout probability, used by the encoders and by the head.
        freeze_backbone: Forwarded to HFSeqEncoder.
        offline_only: Forwarded to HFSeqEncoder.
        num_classes: Number of output logits.
        share_backbone: When True, the user and item towers are the very same
            encoder object (shared weights); otherwise two separate encoders
            are created.
    """

    def __init__(
        self,
        glove_dim: int,
        rc: int = 10,
        backbone: str = "prajjwal1/bert-tiny",
        proj_dim: int = 128,
        dropout: float = 0.2,
        freeze_backbone: bool = False,
        offline_only: bool = True,
        num_classes: int = 3,
        share_backbone: bool = False,  # set True if you want to share encoders
    ):
        super().__init__()
        self.rc = rc

        if share_backbone:
            shared = HFSeqEncoder(glove_dim, backbone, dropout, freeze_backbone, offline_only, proj_dim)
            self.user_enc = shared
            self.item_enc = shared
        else:
            self.user_enc = HFSeqEncoder(glove_dim, backbone, dropout, freeze_backbone, offline_only, proj_dim)
            self.item_enc = HFSeqEncoder(glove_dim, backbone, dropout, freeze_backbone, offline_only, proj_dim)

        # The head is sized from the encoder's reported hidden size; it receives
        # the user and item representations concatenated, hence `hidden * 2`.
        hidden = self.user_enc.config.hidden_size
        self.out = nn.Sequential(
            nn.Linear(hidden * 2, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, num_classes)
        )

    def _encode_stack(self, x, route="user"):
        """Encode a (B, rc, S, H) stack and average over the rc axis.

        Args:
            x: 4-D tensor; its dimensions are unpacked as (B, rc, S, H).
            route: "user" selects `user_enc`; any other value selects `item_enc`.

        Returns:
            Tensor of shape (B, D), where D is whatever width the encoder
            emits per sequence.
        """
        B, rc, S, H = x.shape
        # reshape (rather than view) so non-contiguous inputs, e.g. the result
        # of a permute or a slice, are accepted instead of raising.
        flat = x.reshape(B * rc, S, H)            # (B*rc, S, H)
        encoder = self.user_enc if route == "user" else self.item_enc
        rep = encoder(flat)
        rep = rep.reshape(B, rc, -1).mean(dim=1)  # (B, D), mean over rc
        return rep

    def forward(self, u, i):
        """Return class logits of shape (B, num_classes) for user stack `u` and item stack `i`."""
        u_rep = self._encode_stack(u, route="user")
        i_rep = self._encode_stack(i, route="item")
        x = torch.cat([u_rep, i_rep], dim=-1)   # (B, 2*D)
        logits = self.out(x)                    # (B, num_classes)
        return logits


In [None]:
# ---- Cell: train/eval for classification
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import os

def evaluate_cls(model, dataloader, device, desc="Eval"):
    """Compute classification accuracy of `model` over `dataloader`.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is expected to unpack as (u, i, y); predictions are the argmax
    of the logits along dim 1.

    Returns:
        (acc, y_true, y_pred): accuracy as reported by `accuracy_score`
        (0.0 when the loader yields no samples) and the concatenated
        ground-truth / predicted label arrays.
    """
    model.eval()
    truth_chunks = []
    pred_chunks = []

    with torch.no_grad():
        batches = tqdm(dataloader, desc=desc, leave=False)
        for u, i, y in batches:
            u = u.to(device)
            i = i.to(device)
            y = y.to(device)
            preds = torch.argmax(model(u, i), dim=1)
            truth_chunks.append(y.cpu().numpy())
            pred_chunks.append(preds.cpu().numpy())

    # An empty loader produces empty arrays rather than a concatenate error.
    if truth_chunks:
        y_true = np.concatenate(truth_chunks)
    else:
        y_true = np.array([])
    if pred_chunks:
        y_pred = np.concatenate(pred_chunks)
    else:
        y_pred = np.array([])

    if y_true.size:
        acc = accuracy_score(y_true, y_pred)
    else:
        acc = 0.0
    return acc, y_true, y_pred

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', fname=None):
    """
    Draw a confusion matrix with pure matplotlib (no seaborn).

    Args:
        cm: 2-D confusion matrix (array-like); rows are true labels, columns
            are predicted labels.
        classes: Tick labels, used for both axes.
        normalize: If True, every row is divided by its sum (rows summing to
            zero are divided by 1 instead, so they stay all-zero).
        title: Figure title.
        fname: If provided, the figure is saved to this path. The parent
            directory is created when the path has one.

    The figure is always closed before returning, even if drawing or saving
    raises, so repeated calls do not accumulate open figures.
    """
    cm = np.asarray(cm)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1, keepdims=True).clip(min=1)

    # Integer counts are printed as integers; anything else (normalized or
    # float-valued matrices) is printed with two decimals instead of crashing
    # on the integer format code.
    as_int = (not normalize) and np.issubdtype(cm.dtype, np.integer)
    fmtr = "{:d}" if as_int else "{:.2f}"
    thresh = cm.max() / 2. if cm.size else 0.5

    fig, ax = plt.subplots(figsize=(5, 4), dpi=150)
    try:
        im = ax.imshow(cm, interpolation='nearest')
        ax.figure.colorbar(im, ax=ax)
        ax.set(
            xticks=np.arange(len(classes)),
            yticks=np.arange(len(classes)),
            xticklabels=classes, yticklabels=classes,
            ylabel='True label',
            title=title,
            xlabel='Predicted label',
        )
        # annotate cells; the text colour flips above half of the maximum value
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, fmtr.format(cm[i, j]),
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")
        fig.tight_layout()
        if fname:
            # A bare file name has an empty dirname, which os.makedirs rejects.
            out_dir = os.path.dirname(fname)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            # Save this figure explicitly rather than pyplot's "current" one.
            fig.savefig(fname, bbox_inches='tight')
    finally:
        plt.close(fig)

def train_loop_cls(train_dl, valid_dl, model, device, epochs=5, lr=2e-3,
                   patience=2, model_path="best_cls.pt", class_weights=None):
    """Train a classifier with cross-entropy and early stopping on validation accuracy.

    Args:
        train_dl: Training loader yielding (u, i, y) batches.
        valid_dl: Validation loader yielding (u, i, y) batches.
        model: Module called as `model(u, i)` and returning class logits.
        device: Device the model and every batch are moved to.
        epochs: Maximum number of epochs.
        lr: Adam learning rate.
        patience: Number of consecutive epochs without a validation-accuracy
            improvement after which training stops.
        model_path: Where the best `state_dict` is written. Its parent
            directory is created if it does not exist yet.
        class_weights: Optional per-class weight tensor for the loss.

    The checkpoint is overwritten every time validation accuracy strictly
    improves. Nothing is returned; the model keeps the weights of the last
    epoch that ran, so reload `model_path` to get the best ones.
    """
    model = model.to(device)

    # Make sure the checkpoint can be written before spending an epoch on
    # training; a bare file name has no directory component to create.
    ckpt_dir = os.path.dirname(model_path)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)

    # optional class weighting for imbalance
    loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights.to(device) if class_weights is not None else None)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-6)

    best_acc = -1.0
    bad_epochs = 0

    for epoch in trange(epochs, desc="Epochs"):
        model.train()
        running_loss, seen = 0.0, 0
        pbar = tqdm(train_dl, desc=f"Train {epoch}", leave=False)
        for u, i, y in pbar:
            u, i, y = u.to(device), i.to(device), y.to(device)
            logits = model(u, i)
            loss = loss_fn(logits, y)

            opt.zero_grad()
            loss.backward()
            opt.step()

            # Sample-weighted running mean of the batch losses for the progress bar.
            running_loss += loss.item() * y.size(0)
            seen += y.size(0)
            pbar.set_postfix(LOSS=f"{running_loss/max(seen,1):.4f}")

        # Evaluate (evaluate_cls switches to eval mode; model.train() above
        # restores training mode at the start of the next epoch)
        train_acc, _, _ = evaluate_cls(model, train_dl, device, desc="Train Eval")
        valid_acc, _, _ = evaluate_cls(model, valid_dl, device, desc="Valid Eval")

        print(f"Epoch {epoch:02d} | Train Acc {train_acc:.4f} | Valid Acc {valid_acc:.4f}")

        # Early stopping on accuracy
        if valid_acc > best_acc:
            best_acc = valid_acc
            torch.save(model.state_dict(), model_path)
            bad_epochs = 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                print("Early stopping.")
                break

    print(f"Best Valid Accuracy: {best_acc:.4f}")


In [None]:
# ---- Cell: data loaders for classification
# Same cached splits as the regression setup, wrapped so the target is a class id.
train_ds_c = STCachedDatasetCls(cache_dir, "train", rc=10, S=40)
valid_ds_c = STCachedDatasetCls(cache_dir, "valid", rc=10, S=40)
test_ds_c  = STCachedDatasetCls(cache_dir, "test",  rc=10, S=40)

# Pinned host memory is only requested when CUDA is available.
pin = torch.cuda.is_available()

# Seeded RNG for the training shuffle; it must be passed to the DataLoader,
# otherwise it is never used and the batch order follows the global RNG state.
g = torch.Generator()
g.manual_seed(42)

train_dl_c = DataLoader(
    train_ds_c, batch_size=128, shuffle=True,
    pin_memory=pin, num_workers=0, generator=g,
)
valid_dl_c = DataLoader(valid_ds_c, batch_size=128, shuffle=False, pin_memory=pin, num_workers=0)
test_dl_c  = DataLoader(test_ds_c,  batch_size=128, shuffle=False, pin_memory=pin, num_workers=0)

# Optional: class weights for imbalance (computed from training labels)
from collections import Counter
# Collect every training label in one unshuffled pass over the dataset.
labels_train = []
label_loader = DataLoader(train_ds_c, batch_size=1024, shuffle=False, num_workers=0)
for _, _, y in label_loader:
    labels_train += y.numpy().tolist()

cnt = Counter(labels_train)
num_classes = 3
total = sum(cnt.values())

# Inverse-frequency weight per class id; a class that never occurs is treated
# as if it had been seen once so the division stays defined.
inv_freq = [total / max(cnt.get(c, 1), 1) for c in range(num_classes)]
weights = torch.tensor(inv_freq, dtype=torch.float32)
print("Class counts:", cnt, " -> weights:", weights.tolist())

# ---- Cell: init and train
# Prefer the GPU when one is visible.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Constructor arguments for the classifier, gathered in one place.
clf_kwargs = dict(
    glove_dim=train_ds_c.H,  # MUST equal the cached embedding width
    rc=10,
    backbone="prajjwal1/bert-tiny",
    proj_dim=128,
    dropout=0.2,
    freeze_backbone=False,
    offline_only=True,
    num_classes=3,
    share_backbone=False,
)
clf = DeepCoNNClassifier(**clf_kwargs).to(device)

# Best checkpoint (by validation accuracy) is written here by train_loop_cls.
best_cls_path = os.path.join(MODEL_DIR, "best_model_cls.pt")
train_loop_cls(
    train_dl=train_dl_c,
    valid_dl=valid_dl_c,
    model=clf,
    device=device,
    epochs=5,
    lr=2e-3,
    patience=2,
    model_path=best_cls_path,
    class_weights=weights,
)

# ---- Cell: load best, evaluate on test, print report, plot confusion matrix
# Rebuild the classifier with the training configuration and restore the best weights.
best_clf = DeepCoNNClassifier(
    glove_dim=train_ds_c.H, rc=10, backbone="prajjwal1/bert-tiny",
    proj_dim=128, dropout=0.2, freeze_backbone=False, offline_only=True,
    num_classes=3, share_backbone=False
).to(device)
best_clf.load_state_dict(torch.load(best_cls_path, map_location=device))

test_acc, y_true, y_pred = evaluate_cls(best_clf, test_dl_c, device, desc="Test")
print(f"Test Accuracy = {test_acc:.4f}")

# Classification report
# The label ids are pinned explicitly: without `labels`, the report infers the
# classes from the data and raises when one of the three classes is absent
# from both y_true and y_pred (its count no longer matches target_names).
target_names = ["negative","neutral","positive"]
class_ids = list(range(len(target_names)))
report = classification_report(
    y_true, y_pred,
    labels=class_ids,
    target_names=target_names,
    digits=4,
    zero_division=0,
)
print(report)

# Output locations
report_path = os.path.join(FIG_DIR, "classification_report.txt")
cm_counts_path = os.path.join(FIG_DIR, "confusion_matrix_counts.png")
cm_norm_path = os.path.join(FIG_DIR, "confusion_matrix_normalized.png")

# Save report
os.makedirs(FIG_DIR, exist_ok=True)
with open(report_path, "w", encoding="utf-8") as f:
    f.write(f"Test Accuracy: {test_acc:.4f}\n\n")
    f.write(report)

# Confusion matrix (raw and normalized) + save figures
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plot_confusion_matrix(cm, classes=target_names, normalize=False,
                      title="Confusion Matrix (Counts)",
                      fname=cm_counts_path)
plot_confusion_matrix(cm, classes=target_names, normalize=True,
                      title="Confusion Matrix (Row-Normalized)",
                      fname=cm_norm_path)

print("Saved:",
      report_path, ",",
      cm_counts_path, ",",
      cm_norm_path)
