In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error

# Experiment configuration: every seed is combined with every train ratio.
train_ratios = [0.5, 0.7, 0.9]  # fraction of each user's ratings assigned to training
seeds = range(0, 20)
output_path = "./vae_cvae_ncf_results"
# Make sure the results directory is available before anything is written to it.
if not os.path.isdir(output_path):
    os.makedirs(output_path, exist_ok=True)

# Load MovieLens 100k
def load_movielens_100k():
    """Load MovieLens 100k as a dense user-by-item rating matrix.

    Returns:
        A tuple ``(matrix, user_features, n_users, n_items)``. ``matrix`` holds
        the rating at ``[user, item]`` and 0 where no rating was loaded;
        ``user_features`` is the ``n_users`` x ``n_users`` identity matrix.
    """
    # Imported locally under an alias so it cannot be confused with the
    # torch `Dataset` imported at module level.
    from surprise import Dataset as SurpriseDataset

    raw_ratings = SurpriseDataset.load_builtin('ml-100k').raw_ratings
    df = pd.DataFrame(raw_ratings, columns=['user', 'item', 'rating', 'timestamp'])

    # Replace the raw ids with contiguous integer codes usable as matrix indices.
    for column in ('user', 'item'):
        df[column] = df[column].astype('category').cat.codes

    n_users, n_items = df['user'].nunique(), df['item'].nunique()
    matrix = torch.zeros((n_users, n_items))
    for user_idx, item_idx, rating in zip(df['user'], df['item'], df['rating']):
        matrix[user_idx, item_idx] = rating
    return matrix, torch.eye(n_users), n_users, n_items

# Dataset classes
class RatingDataset(Dataset):
    """Row-wise dataset over a rating matrix and its mask.

    Each sample is ``(X[idx], mask[idx])``; when ``U`` is supplied the sample
    additionally carries ``U[idx]`` as a third element.
    """

    def __init__(self, X, mask, U=None):
        self.X, self.mask, self.U = X, mask, U

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.mask[idx])
        if self.U is not None:
            sample += (self.U[idx],)
        return sample

class NCFDataset(Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return (
            torch.tensor(row['user_id'], dtype=torch.long),
            torch.tensor(row['item_id'], dtype=torch.long),
            torch.tensor(row['rating'], dtype=torch.float)
        )

# Models
class VAE(nn.Module):
    """Variational autoencoder over a user's rating vector.

    The encoder maps ``n_items`` inputs through a 200-unit hidden layer to
    ``2 * dim`` values (mean and log-variance); the decoder maps a ``dim``
    latent back to ``n_items`` outputs.

    Args:
        n_items: length of the input/output rating vector.
        dim: latent dimensionality.
    """

    def __init__(self, n_items, dim=50):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(n_items, 200), nn.ReLU(), nn.Linear(200, dim * 2))
        self.decoder = nn.Sequential(nn.Linear(dim, 200), nn.ReLU(), nn.Linear(200, n_items), nn.Sigmoid())

    def forward(self, x):
        """Return ``(x_recon, mu, logvar)`` for the batch ``x``."""
        h = self.encoder(x)
        mu, logvar = h.chunk(2, dim=-1)
        if self.training:
            # Reparameterisation trick: sample z while keeping gradients w.r.t. mu/logvar.
            std = torch.exp(0.5 * logvar)
            z = mu + std * torch.randn_like(std)
        else:
            # In eval mode decode the posterior mean so predictions are deterministic
            # and reported metrics do not depend on sampling noise.
            z = mu
        x_recon = self.decoder(z) * 4 + 1  # Rescale [0,1] → [1,5]
        return x_recon, mu, logvar

class CVAE(nn.Module):
    """Conditional VAE: both encoder and decoder also receive a condition vector.

    The encoder sees the rating vector concatenated with the condition
    (``n_items + user_dim`` inputs); the decoder sees the latent concatenated
    with the same condition (``dim + user_dim`` inputs).

    Args:
        n_items: length of the input/output rating vector.
        user_dim: length of the per-row condition vector ``u``.
        dim: latent dimensionality.
    """

    def __init__(self, n_items, user_dim, dim=50):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(n_items + user_dim, 200), nn.ReLU(), nn.Linear(200, dim * 2))
        self.decoder = nn.Sequential(nn.Linear(dim + user_dim, 200), nn.ReLU(), nn.Linear(200, n_items), nn.Sigmoid())

    def forward(self, x, u):
        """Return ``(x_recon, mu, logvar)`` for ratings ``x`` conditioned on ``u``."""
        h = self.encoder(torch.cat([x, u], dim=1))
        mu, logvar = h.chunk(2, dim=-1)
        if self.training:
            # Reparameterisation trick: sample z while keeping gradients w.r.t. mu/logvar.
            std = torch.exp(0.5 * logvar)
            z = mu + std * torch.randn_like(std)
        else:
            # In eval mode decode the posterior mean so predictions are deterministic
            # and reported metrics do not depend on sampling noise.
            z = mu
        x_recon = self.decoder(torch.cat([z, u], dim=1)) * 4 + 1  # sigmoid output rescaled to [1, 5]
        return x_recon, mu, logvar

class NCF(nn.Module):
    """Neural collaborative filtering: user/item embeddings fed to a small MLP.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        dim: embedding size of each table.
    """

    def __init__(self, n_users, n_items, dim=50):
        super().__init__()
        self.u = nn.Embedding(n_users, dim)
        self.i = nn.Embedding(n_items, dim)
        self.mlp = nn.Sequential(nn.Linear(2*dim, 64), nn.ReLU(), nn.Linear(64, 1))

    def forward(self, user, item):
        """Return one predicted rating per (user, item) pair, shape ``(batch,)``."""
        features = torch.cat([self.u(user), self.i(item)], dim=1)
        # Drop only the trailing size-1 output dim. A bare squeeze() would also
        # collapse a batch of size 1 to a 0-dim tensor and mismatch the targets.
        return self.mlp(features).squeeze(-1)

# Train/test split
def split_train_test(X, ratio):
    """Randomly split each row's observed (> 0) entries into train and test.

    Rows with fewer than two observed entries are left out of both splits.
    For every other row, ``int(count * ratio)`` randomly chosen observed
    entries go to train and the remainder to test.

    Args:
        X: 2-D rating matrix; entries > 0 count as observed.
        ratio: fraction of each row's observed entries assigned to train.

    Returns:
        ``(X_train, X_test, mask_train, mask_test)`` where the masks are boolean
        tensors shaped like ``X`` and ``X_train``/``X_test`` are ``X`` multiplied
        by the respective mask.
    """
    observed = X > 0
    mask_train = torch.zeros_like(X, dtype=torch.bool)
    mask_test = torch.zeros_like(X, dtype=torch.bool)

    for row, row_observed in enumerate(observed):
        rated = row_observed.nonzero(as_tuple=True)[0]
        n_rated = rated.numel()
        if n_rated >= 2:
            n_train = int(n_rated * ratio)
            shuffled = rated[torch.randperm(n_rated)]
            mask_train[row, shuffled[:n_train]] = True
            mask_test[row, shuffled[n_train:]] = True

    return X * mask_train, X * mask_test, mask_train, mask_test

# VAE training
def train_vae(model, X, mask, optimizer, epochs=20):
    """Train a VAE with a masked reconstruction loss plus a small KL penalty.

    Args:
        model: module whose ``forward(x)`` returns ``(pred, mu, logvar)``.
        X: rating matrix, one row per sample.
        mask: tensor shaped like ``X`` marking the entries that count towards
            the reconstruction loss.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over the data.
    """
    loader = DataLoader(RatingDataset(X, mask), batch_size=64, shuffle=True)
    model.train()
    for _ in range(epochs):
        for x, m in loader:
            pred, mu, logvar = model(x)
            # Mean squared error over observed entries only. The clamp keeps the
            # loss finite (0 instead of NaN) for a batch with no observed entry,
            # which would otherwise poison the weights on the next step.
            n_observed = m.sum().clamp(min=1)
            recon = ((pred - x) ** 2 * m).sum() / n_observed
            # KL divergence to a standard normal prior, averaged over the batch.
            kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / x.size(0)
            loss = recon + 1e-4 * kl
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# CVAE training
def train_cvae(model, X, mask, U, optimizer, epochs=20):
    """Train a conditional VAE with a masked reconstruction loss plus a small KL penalty.

    Args:
        model: module whose ``forward(x, u)`` returns ``(pred, mu, logvar)``.
        X: rating matrix, one row per sample.
        mask: tensor shaped like ``X`` marking the entries that count towards
            the reconstruction loss.
        U: condition matrix with one row per row of ``X``.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over the data.
    """
    loader = DataLoader(RatingDataset(X, mask, U), batch_size=64, shuffle=True)
    model.train()
    for _ in range(epochs):
        for x, m, u in loader:
            pred, mu, logvar = model(x, u)
            # Mean squared error over observed entries only. The clamp keeps the
            # loss finite (0 instead of NaN) for a batch with no observed entry,
            # which would otherwise poison the weights on the next step.
            n_observed = m.sum().clamp(min=1)
            recon = ((pred - x) ** 2 * m).sum() / n_observed
            # KL divergence to a standard normal prior, averaged over the batch.
            kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) / x.size(0)
            loss = recon + 1e-4 * kl
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# NCF training
def train_ncf(model, loader, optimizer, epochs=10):
    """Fit ``model`` to the ratings yielded by ``loader`` using an MSE loss.

    Args:
        model: callable module mapping ``(users, items)`` to predicted ratings.
        loader: iterable of ``(users, items, ratings)`` batches.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over ``loader``.
    """
    model.train()
    for _epoch in range(epochs):
        for users, items, ratings in loader:
            optimizer.zero_grad()
            predictions = model(users, items)
            batch_loss = nn.functional.mse_loss(predictions, ratings)
            batch_loss.backward()
            optimizer.step()

# VAE / CVAE evaluation
def eval_vae(model, X_test, mask_test, U=None):
    """Return the RMSE of the model's reconstruction over the masked entries.

    Args:
        model: VAE-style module returning ``(pred, mu, logvar)``; called as
            ``model(X_test)`` when ``U`` is None, else ``model(X_test, U)``.
        X_test: matrix used both as the model input and as the target.
        mask_test: tensor shaped like ``X_test`` selecting the scored entries.
        U: optional condition matrix passed as the second model argument.

    Returns:
        The root mean squared error as a Python float.

    NOTE(review): the model receives ``X_test`` itself as input, so the
    ratings being scored are visible to the encoder — confirm this is the
    intended evaluation protocol.
    """
    model.eval()
    with torch.no_grad():
        if U is None:
            outputs = model(X_test)
        else:
            outputs = model(X_test, U)
        recon, _mu, _logvar = outputs
        squared_error = (recon - X_test).pow(2) * mask_test
        rmse = torch.sqrt(squared_error.sum() / mask_test.sum())
    return rmse.item()

# NCF evaluation
def eval_ncf(model, df):
    """Return the RMSE of ``model`` on the ratings listed in ``df``.

    Args:
        model: module mapping ``(users, items)`` index tensors to predictions.
        df: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
    """
    model.eval()
    user_ids = torch.tensor(df['user_id'].values, dtype=torch.long)
    item_ids = torch.tensor(df['item_id'].values, dtype=torch.long)
    targets = torch.tensor(df['rating'].values, dtype=torch.float)
    # All rows are scored in a single forward pass with gradients disabled.
    with torch.no_grad():
        predictions = model(user_ids, item_ids)
    mse = mean_squared_error(targets.numpy(), predictions.numpy())
    return np.sqrt(mse)

# Run full experiment
def _ratings_frame(values, mask):
    """Return a (user_id, item_id, rating) DataFrame for the entries selected by `mask`."""
    # torch.nonzero lists (row, col) pairs in row-major order, the same order in
    # which boolean indexing returns the selected values, so the columns line up.
    positions = torch.nonzero(mask)
    return pd.DataFrame({
        'user_id': positions[:, 0].numpy(),
        'item_id': positions[:, 1].numpy(),
        'rating': values[mask].double().numpy(),
    })


all_results = []
X, U, n_users, n_items = load_movielens_100k()

for seed in seeds:
    # Seeded once per seed; the three ratios below share one RNG stream.
    torch.manual_seed(seed)
    np.random.seed(seed)

    for ratio in train_ratios:
        Xtr, Xte, mtr, mte = split_train_test(X, ratio)

        # NOTE(review): eval_vae is handed Xte, which it also uses as the model
        # input, so the held-out ratings are visible to the (c)VAE encoder at
        # evaluation time — confirm this is the intended protocol.

        # VAE
        vae = VAE(n_items)
        opt = optim.Adam(vae.parameters(), lr=1e-3)
        train_vae(vae, Xtr, mtr, opt)
        rmse_vae = eval_vae(vae, Xte, mte)

        # cVAE
        cvae = CVAE(n_items, U.shape[1])
        opt = optim.Adam(cvae.parameters(), lr=1e-3)
        train_cvae(cvae, Xtr, mtr, U, opt)
        rmse_cvae = eval_vae(cvae, Xte, mte, U)

        # NCF: flatten the masked matrices into (user, item, rating) rows in one
        # vectorised step instead of a Python loop over every observed rating.
        df_train = _ratings_frame(Xtr, mtr)
        df_test = _ratings_frame(Xte, mte)
        ncf = NCF(n_users, n_items)
        opt = optim.Adam(ncf.parameters(), lr=1e-3)
        train_ncf(ncf, DataLoader(NCFDataset(df_train), batch_size=256, shuffle=True), opt)
        rmse_ncf = eval_ncf(ncf, df_test)

        all_results.append({
            'seed': seed, 'train_ratio': ratio,
            'VAE_RMSE': rmse_vae,
            'cVAE_RMSE': rmse_cvae,
            'NCF_RMSE': rmse_ncf
        })
        print(f"Seed={seed}, Ratio={ratio} -> VAE={rmse_vae:.4f}, cVAE={rmse_cvae:.4f}, NCF={rmse_ncf:.4f}")

# Save results
df_results = pd.DataFrame(all_results)
df_results.to_csv(os.path.join(output_path, "benchmark_results.csv"), index=False)
print("\n✅ Results saved to:", os.path.join(output_path, "benchmark_results.csv"))


Seed=0, Ratio=0.5 -> VAE=1.1139, cVAE=1.0797, NCF=0.9803
