## Q4 NAM

In [None]:
import pandas as pd

In [None]:
# Load the pre-processed feature tables; each label file contributes only
# its 'HeartDisease' column.
X_train = pd.read_csv('data/processed_data/X_train_processed.csv')
y_train = pd.read_csv(
    'data/processed_data/y_train_processed.csv'
)['HeartDisease']
X_test = pd.read_csv('data/processed_data/X_test_processed.csv')
y_test = pd.read_csv(
    'data/processed_data/y_test.csv'
)['HeartDisease']

# Report the (rows, columns) shape of both feature tables, one per line.
print(
    f"Training data shape: {X_train.shape}",
    f"Testing data shape: {X_test.shape}",
    sep="\n",
)

# List the feature names (header, then the column list on its own line).
print("\nFeatures in the dataset:", X_train.columns.tolist(), sep="\n")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
import pandas as pd

# 1) Define a Dataset that wraps X (DataFrame) and y (Series)
class HeartDiseaseDataset(Dataset):
    """In-memory ``Dataset`` pairing each feature row with its target.

    Args:
        X: Feature table; every column must be convertible to float32.
        y: Targets, one per row of ``X``.
        regression: If true, targets are stored as ``torch.float32``;
            otherwise as ``torch.long``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """

    def __init__(self, X: pd.DataFrame, y: pd.Series, regression: bool):
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        # Cast to float32 while converting to numpy. A frame that mixes
        # e.g. bool one-hot columns with float columns yields an
        # object-dtype array from `.values`, which torch.tensor rejects.
        self.X = torch.tensor(
            X.to_numpy(dtype="float32"), dtype=torch.float32
        )
        target_dtype = torch.float32 if regression else torch.long
        self.y = torch.tensor(y.to_numpy(), dtype=target_dtype)

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]


def build_dataloaders(config, *, data_dir='data/processed_data', random_state=42):
    """Build per-fold (train, val) loaders plus a test loader from CSV files.

    Args:
        config: Object exposing ``n_folds``, ``batch_size``, ``num_workers``
            and ``regression``.
        data_dir: Directory containing ``X_train_processed.csv``,
            ``y_train_processed.csv``, ``X_test_processed.csv`` and
            ``y_test.csv``. Each label file must have a ``HeartDisease``
            column.
        random_state: Seed passed to ``KFold`` for the shuffled split.

    Returns:
        A tuple ``(dataloaders, test_loader)`` where ``dataloaders`` is a
        list with one ``(train_loader, val_loader)`` pair per fold.
    """
    # Local import so this cell does not depend on imports from other cells.
    from pathlib import Path

    data_dir = Path(data_dir)

    def _make_loader(features, targets, *, shuffle):
        # Wrap one feature/target split in a Dataset and DataLoader.
        dataset = HeartDiseaseDataset(
            features, targets, regression=config.regression
        )
        return DataLoader(
            dataset,
            batch_size=config.batch_size,
            shuffle=shuffle,
            num_workers=config.num_workers,
        )

    # 2) Load your processed CSVs
    X = pd.read_csv(data_dir / 'X_train_processed.csv')
    y = pd.read_csv(data_dir / 'y_train_processed.csv')['HeartDisease']

    # 3) Prepare K-Fold splits
    # NOTE(review): plain KFold does not preserve class balance per fold;
    # for classification StratifiedKFold may be preferable - confirm.
    kf = KFold(
        n_splits=config.n_folds,
        shuffle=True,
        random_state=random_state,
    )

    # Only the training loader is shuffled; validation order stays fixed.
    dataloaders = [
        (
            _make_loader(X.iloc[train_idx], y.iloc[train_idx], shuffle=True),
            _make_loader(X.iloc[val_idx], y.iloc[val_idx], shuffle=False),
        )
        for train_idx, val_idx in kf.split(X)
    ]

    # 4) Build the test loader (not shuffled)
    X_t = pd.read_csv(data_dir / 'X_test_processed.csv')
    y_t = pd.read_csv(data_dir / 'y_test.csv')['HeartDisease']
    test_loader = _make_loader(X_t, y_t, shuffle=False)

    return dataloaders, test_loader


In [None]:
import os
import torch
from torch.utils.tensorboard import SummaryWriter
from nam_trainer_pytorch import NAMTrainer  # wherever you put it

import copy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataloaders, test_loader = build_dataloaders(config)

# Snapshot the starting weights. The same `model` object is used for every
# fold, so without a reset fold k would continue from fold k-1's weights and
# be validated on rows it has already been trained on.
initial_state = copy.deepcopy(model.state_dict())

for fold, (train_loader, val_loader) in enumerate(dataloaders):
    # 0. start every fold from the same initial weights
    model.load_state_dict(initial_state)

    # 1. set up TensorBoard writer
    log_dir = os.path.join(
        config.logdir,
        model.name,
        f'fold_{fold + 1}'
    )
    os.makedirs(log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=log_dir)

    # try/finally so the writer is closed even if training raises
    try:
        # 2. initialize our pure-PyTorch trainer
        trainer = NAMTrainer(config, model, device=device)

        # 3. keep track of top-k checkpoints by val loss
        best_ckpts = []  # list of (val_loss, checkpoint_path)

        # 4. training loop
        for epoch in range(1, config.num_epochs + 1):
            print(f"=== Fold {fold + 1}, Epoch {epoch}/{config.num_epochs} ===")
            train_loss, train_metric = trainer.train_epoch(train_loader)
            val_loss,   val_metric   = trainer.validate_epoch(val_loader)

            # 5. log scalars to TensorBoard
            writer.add_scalars('Loss', {
                'train': train_loss,
                'val':   val_loss
            }, epoch)
            writer.add_scalars(trainer.metric_name, {
                'train': train_metric,
                'val':   val_metric
            }, epoch)

            # 6. checkpoint every epoch, keep best `save_top_k`
            ckpt_filename = f'epoch_{epoch:02d}-val_{val_loss:.4f}.pt'
            ckpt_path     = os.path.join(log_dir, ckpt_filename)
            torch.save({
                'epoch':                  epoch,
                'model_state_dict':       model.state_dict(),
                'optimizer_state_dict':   trainer.optimizer.state_dict()
            }, ckpt_path)

            # track & prune: sort ascending by val loss so the worst is last
            best_ckpts.append((val_loss, ckpt_path))
            best_ckpts.sort(key=lambda entry: entry[0])
            if len(best_ckpts) > config.save_top_k:
                # remove worst
                worst_loss, worst_path = best_ckpts.pop()
                try:
                    os.remove(worst_path)
                except FileNotFoundError:
                    # already gone; nothing to clean up
                    pass
    finally:
        writer.close()



In [None]:
# Run the trainer's test routine on the test loader.
# NOTE(review): `trainer` is the object left bound by the last iteration of
# the fold loop, and no checkpoint is reloaded before this call, so this
# presumably evaluates the last fold's final-epoch weights rather than the
# best saved checkpoint - confirm this is intended.
trainer.test(test_loader)