In [21]:
import pytorch_lightning as L 
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F


In [22]:
data_path = Path("/kaggle/input/playground-series-s5e12")
# data_path = Path("./data/diabetes_prediction")

In [23]:
train_path = data_path / "train.csv"
test_path = data_path / "test.csv"
submission_path = data_path / "sample_submission.csv"

In [24]:
import pandas as pd

class SimpleDataset(torch.utils.data.Dataset):

    def __init__(self, 
                 X_num: pd.DataFrame,
                 X_cat: pd.DataFrame,
                 y: pd.Series | None = None):
        
        self.X_num = X_num.to_numpy()
        self.X_cat = X_cat.to_numpy()

        self.y = y
        if y is not None: 
            self.y = y.to_numpy().reshape(-1, 1)
        

    def __len__(self):
        return len(self.X_num)

    def __getitem__(self, idx):
        x_num = torch.from_numpy(self.X_num[idx]).to(torch.float32)
        x_cat = torch.from_numpy(self.X_cat[idx]).to(torch.long)
        if self.y is not None:
            y = torch.from_numpy(self.y[idx])
            return x_num, x_cat, y
        return x_num, x_cat

In [25]:
train_data = pd.read_csv(train_path)

In [28]:
from typing import List
from sklearn.metrics import roc_auc_score


class RankingNetwork(L.LightningModule):

    def __init__(self,
                 num_numerical_features: int,
                 num_categorical_features: int,
                 categorical_sizes: List[int],
                 embedding_sizes: List[int],
                 hidden_size_1: int = 256,
                 hidden_size_2: int = 64,
                 margin: float = 0.3,
                 margin_loss_weight: float = 0.1,
                 lr=1e-4):
        
        super().__init__()
        self.lr = lr
        self.margin_loss_weight = margin_loss_weight
        categorical_contribution = sum(embedding_sizes)


        self.embedding = nn.ModuleList([
            nn.Embedding(num_embeddings=categorical_sizes[i], embedding_dim=embedding_sizes[i])
            for i in range(num_categorical_features)
        ])

        self.model = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(num_numerical_features + categorical_contribution, hidden_size_1),
            nn.BatchNorm1d(hidden_size_1),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size_1, hidden_size_2),
            nn.BatchNorm1d(hidden_size_2),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size_2, 1)
        )

        self.margin_loss = nn.MarginRankingLoss(margin=margin)
        self.binary_loss = nn.BCELoss()
        self.auc_loss = AUCMLoss(margin=0.2, version='v2', device=self.device, imratio=436307/263693)

    def forward(self, x_num: torch.Tensor, x_cat: torch.Tensor):
        embeddings = []
        for i, emb_layer in enumerate(self.embedding):
            embeddings.append(emb_layer(x_cat[:, i]))
        x = torch.hstack([x_num, *embeddings])
        out = self.model(x)
        return out
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)
    
    def training_step(self, batch, batch_idx):
        x_num, x_cat, y = batch
        y_hat: torch.Tensor = self(x_num, x_cat)

        negative_examples = y_hat[y == 0]
        positive_examples = y_hat[y == 1]
        dup_negative_examples = negative_examples.repeat(len(positive_examples))
        dup_positive_examples = positive_examples.repeat_interleave(len(negative_examples))
        margin_loss = self.margin_loss(dup_positive_examples, dup_negative_examples, torch.ones_like(dup_negative_examples))
        bin_loss = self.binary_loss(F.sigmoid(y_hat), y.to(torch.float32))

        self.log("train_margin_loss", margin_loss, prog_bar=True)
        # self.log("train_binary_loss", bin_loss, prog_bar=True)
        # return bin_loss

        # auc_loss = self.auc_loss(y_hat, y)
        # self.log("train_auc_loss", auc_loss, prog_bar=True)
        return bin_loss
    
    def validation_step(self, batch, batch_idx):
        x_num, x_cat, y = batch
        y_hat = self(x_num, x_cat)
        auc_score = roc_auc_score(y.squeeze().cpu().numpy(), y_hat.squeeze().detach().cpu().numpy())
        self.log("auc", auc_score, prog_bar=True)
        return auc_score
    
    def predict_step(self, batch, batch_idx):
        x_num, x_cat = batch
        y_hat = self(x_num, x_cat)
        return y_hat.squeeze()

In [29]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from pytorch_lightning.trainer import Trainer
from pytorch_lightning.callbacks import LearningRateFinder, StochasticWeightAveraging

In [30]:
import numpy as np


class DataModule(L.LightningDataModule):

    def __init__(self):
        super().__init__()

    def train_dataloader(self):
        return train_loader
    
    def val_dataloader(self):
        return val_loader
    
    def test_dataloader(self):
        return test_loader



kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

test_data = pd.read_csv(test_path).drop(columns=["id"])
y = train_data.pop("diagnosed_diabetes")
X = train_data.drop(columns=["id"])
final_test_preds = np.zeros(len(test_data))


for (train_idx, val_idx) in kfold.split(X, y):

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # preprocessing without leakage

    X_num_train = X_train.select_dtypes(include=["number"])
    X_cat_train = X_train.select_dtypes(include=["object"])

    X_num_val = X_val.select_dtypes(include=["number"])
    X_cat_val = X_val.select_dtypes(include=["object"])

    X_num_test = test_data.select_dtypes(include=["number"])
    X_cat_test = test_data.select_dtypes(include=["object"])

    scaler = StandardScaler()
    ordinaliser = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    
    X_num_train = pd.DataFrame(scaler.fit_transform(X_num_train), columns=X_num_train.columns)
    X_num_val = pd.DataFrame(scaler.transform(X_num_val), columns=X_num_val.columns)

    X_cat_train = pd.DataFrame(ordinaliser.fit_transform(X_cat_train), columns=X_cat_train.columns)
    X_cat_val = pd.DataFrame(ordinaliser.transform(X_cat_val), columns=X_cat_val.columns)

    X_num_test = pd.DataFrame(scaler.transform(X_num_test), columns=X_num_test.columns)
    X_cat_test = pd.DataFrame(ordinaliser.transform(X_cat_test), columns=X_cat_test.columns)

    train_loader = torch.utils.data.DataLoader(SimpleDataset(X_num_train, X_cat_train, y_train), batch_size=128, shuffle=True)
    val_loader = torch.utils.data.DataLoader(SimpleDataset(X_num_val, X_cat_val, y_val), batch_size=256, shuffle=False)
    test_loader = torch.utils.data.DataLoader(SimpleDataset(X_num_test, X_cat_test), batch_size=256, shuffle=False)

    trainer = Trainer(max_epochs=10, accelerator="auto", callbacks=[StochasticWeightAveraging(1e-4)])

    ranker = RankingNetwork(
        num_numerical_features=X_num_train.shape[1],
        num_categorical_features=X_cat_train.shape[1],
        categorical_sizes=[int(X_cat_train[col].max()) + 1 for col in X_cat_train.columns],
        embedding_sizes=[10 for col in X_cat_train.columns],
    )

    trainer.fit(
        ranker,
        DataModule()
    )

    test_preds = trainer.predict(ranker, test_loader)
    np_test_preds = torch.cat(test_preds).numpy()
    final_test_preds += np_test_preds / kfold.n_splits

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | embedding   | ModuleList        | 240    | train
1 | model       | Sequential        | 37.4 K | train
2 | margin_loss | MarginRankingLoss | 0      | train
3 | binary_loss | BCELoss           | 0      | train
4 | auc_loss    | AUCMLoss          | 0      | train
----------------------------------------------------------
37.6 K    Trainable params
0         Non-trainable params
37.6 K    Total params
0.150     Total estimated model params size (MB)
21        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=11` reached.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | embedding   | ModuleList        | 240    | train
1 | model       | Sequential        | 37.4 K | train
2 | margin_loss | MarginRankingLoss | 0      | train
3 | binary_loss | BCELoss           | 0      | train
4 | auc_loss    | AUCMLoss          | 0      | train
----------------------------------------------------------
37.6 K    Trainable params
0         Non-trainable params
37.6 K    Total params
0.150     Total estimated model params size (MB)
21        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/call.py", line 49, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/trainer.py", line 598, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/trainer.py", line 1011, in _run
    results = self._run_stage()
              ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/trainer.py", line 1055, in _run_stage
    self.fit_loop.run()
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loops/fit_loop.py", line 216, in run
    self.advance()
  File "/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loops/fit_loop.py", line 458, in advance
    self.epoch_loop.run(self._data_fetcher)
  File "/usr/local/lib/python3.11

TypeError: object of type 'NoneType' has no len()

In [None]:
sample_sub = pd.read_csv(submission_path, index_col=0)
sample_sub["diagnosed_diabetes"] = final_test_preds
sample_sub.to_csv("/kaggle/working/diabetes_prediction_rankingnn_submission.csv")