In [1]:
# Development convenience: with autoreload mode 2, IPython re-imports edited
# modules before running each cell, so changes to the project's helper files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from os import path
import os

# Make the sibling ``utils`` and ``models`` directories importable. The paths
# are resolved relative to the current working directory.
sys.path.extend(
    os.path.abspath(rel_dir) for rel_dir in ("../utils", "../models")
)

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.decomposition import PCA
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from simple_nn_lightning import *
from model_utils import *
from simple_nn_utils import *


import wandb

from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from pytorch_lightning import Trainer, seed_everything

from pl_bolts.callbacks import TrainingDataMonitor

from torch.optim.lr_scheduler import MultiStepLR

In [4]:
# Size of the synthetic problem: number of input features and number of labels.
# Both are fed to make_multilabel_classification when the fake data is built.
INPUT_DIM: int = 20
N_CLASSES: int = 5

In [5]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
# Specification of the synthetic multilabel dataset. The fixed random_state
# keeps the generated samples identical between runs; allow_unlabeled=False
# asks sklearn not to emit samples that carry no label at all.
synthetic_spec = dict(
    n_samples=100,
    n_features=INPUT_DIM,
    n_classes=N_CLASSES,
    allow_unlabeled=False,
    random_state=314,
)
fake_data, fake_targets = make_multilabel_classification(**synthetic_spec)

In [7]:
# Row positions are split together with the data so every train/test row can be
# traced back to its position in ``fake_data``.
indices = range(len(fake_data))

# No ``stratify=train_test_stratify`` here: don't know how stratification works
# with multilabel targets.
(
    X_train,
    X_test,
    y_train,
    y_test,
    idxs_train,
    idxs_test,
) = train_test_split(
    fake_data, fake_targets, indices, test_size=0.2, random_state=314
)

# Tiny hand-written fixture, kept (disabled) for manual debugging:
# X_train = torch.tensor([[1,3,5],[50,100,10]])
# y_train = torch.tensor([[0,1],[1,1]])
# X_test = torch.tensor([[1,3,5],[50,100,10]])
# y_test = torch.tensor([[0,1],[1,1]])

In [8]:
# Quick visual check of the split: the cell's last expression is this 4-tuple,
# so the notebook renders the repr of all four arrays.
X_train, X_test, y_train, y_test

(array([[1., 4., 1., ..., 1., 2., 0.],
        [2., 2., 0., ..., 0., 1., 2.],
        [3., 3., 1., ..., 6., 3., 3.],
        ...,
        [4., 6., 5., ..., 0., 3., 0.],
        [4., 5., 4., ..., 0., 0., 3.],
        [4., 2., 4., ..., 0., 1., 1.]]),
 array([[ 5.,  3.,  3.,  4.,  3.,  5.,  2.,  5.,  4.,  4.,  0.,  1.,  1.,
          3.,  3.,  1.,  4.,  0.,  3.,  0.],
        [ 2.,  2.,  1.,  3.,  0.,  2.,  0.,  3.,  3.,  2.,  1.,  3.,  4.,
          5.,  2.,  4.,  3.,  2.,  2.,  2.],
        [ 1.,  4.,  2.,  4.,  0.,  1.,  2.,  3.,  3.,  1.,  2.,  0.,  7.,
          1.,  2.,  4.,  3.,  1.,  3.,  1.],
        [ 1.,  3.,  5.,  4.,  0.,  0.,  5.,  2.,  1.,  3.,  2.,  0.,  2.,
          3.,  3.,  4.,  3.,  2.,  3.,  3.],
        [ 3.,  2.,  5.,  0.,  4.,  4.,  4.,  1.,  1.,  5.,  4.,  1.,  1.,
         11.,  2.,  1.,  4.,  1.,  5.,  1.],
        [ 3.,  3.,  1.,  6.,  2.,  2.,  2.,  1.,  2.,  2.,  3.,  1.,  3.,
          4.,  1.,  1.,  2.,  3.,  1.,  5.],
        [ 3.,  1.,  3.,  1.,  4.,  0.

In [9]:
# The NN model definition needs the actual feature and label widths, so read
# them from the split arrays (this overwrites the constants assigned earlier).
INPUT_DIM, N_CLASSES = X_train.shape[1], y_train.shape[1]


In [10]:
def _float_dataset(features, labels):
    """Wrap a feature array and a label array as a TensorDataset of float32 tensors."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )


train_dataset = _float_dataset(X_train, y_train)
val_dataset = _float_dataset(X_test, y_test)

# Both loaders share the same settings. ``collate_fn=collate_wrapper`` is not
# passed, so PyTorch's default collation applies.
loader_kwargs = dict(batch_size=5, pin_memory=True)

train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

In [11]:
# Path of the YAML file handed to wandb.init(config=...). It defaults to the
# original location; set the DEEP_ED_CONFIG environment variable to run this
# notebook from another machine or checkout without editing the cell.
config_fp = os.environ.get(
    "DEEP_ED_CONFIG",
    "/home/vs428/Documents/deep-ed-diags/configs/dev_test_config.yaml",
)

# Forwarded to Trainer(fast_dev_run=...) in the training cell.
fast_dev_run = False
# Run validation every `eval_freq` epochs (Trainer.check_val_every_n_epoch).
# Early stopping patience counts validation checks, so this also impacts how
# quickly early stopping can trigger.
eval_freq = 2

# Start the W&B run. allow_val_change=True lets later cells overwrite config
# values that came from the YAML file.
# NOTE(review): the saved output of this cell reports that W&B could not detect
# the notebook name; set WANDB_NOTEBOOK_NAME in the environment if code saving
# (save_code=True) is actually wanted.
wandb.init(
    project="test-project",
    entity="decile",
    config=config_fp,
    allow_val_change=True,
    save_code=True,
)

# Reused as the checkpoint file name in the training cell.
WANDB_RUN_NAME = wandb.run.name


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvsocrates[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.10 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [12]:

# Reconcile conditional hyper-parameters after the config has been loaded.
# Both updates pass allow_val_change=True explicitly: overwriting a key that the
# YAML config already set is only tolerated by default inside notebooks, so
# being explicit keeps this cell working if the code is moved into a script.

# Class weighting does not apply when the focal loss is selected.
if wandb.config["loss_fn"] == "focal":
    wandb.config.update({"class_weight_type": None}, allow_val_change=True)
# Quick smoke-test runs reset drop_sparse_cols to 0.
if fast_dev_run:
    wandb.config.update({"drop_sparse_cols": 0}, allow_val_change=True)


In [13]:
# No per-class weighting for the synthetic run; this value is passed as
# ``pos_weight`` to BCEWithLogitsLoss in the training cell.
class_weights = None

In [14]:
# --- Reproducibility ---------------------------------------------------------
# Seed the Python, NumPy and PyTorch RNGs so that the model's weight
# initialisation and the random subset drawn for PredictionLogger below are the
# same on every "Restart & Run All". 314 matches the random_state used for data
# generation and splitting.
seed_everything(314)

# --- Callbacks ---------------------------------------------------------------
lr_monitor = LearningRateMonitor(logging_interval="epoch")
# log the histograms of input data sent to LightningModule.training_step
training_data_monitor = TrainingDataMonitor(log_every_n_steps=25)
print_callback = PrintCallback()

# Stop once validation_loss has failed to improve by at least min_delta for
# `patience` consecutive validation checks.
early_stopping = EarlyStopping(
    monitor="validation_loss",
    min_delta=0.00001,
    patience=2,
    verbose=True,
)

# Checkpoints are named after the W&B run and selected on validation_loss.
CHECKPOINT_DIR = "/gpfs/milgram/project/rtaylor/shared/ABDPain_EarlyDiags/models"
checkpoint_callback = ModelCheckpoint(
    dirpath=CHECKPOINT_DIR,
    filename=f"{WANDB_RUN_NAME}.model",
    monitor="validation_loss",
)

# Predict after trainer callback using 20% of the validation dataset.
# The sample size is clamped to at least one row (when the validation set is
# non-empty) so a small validation set cannot produce an empty sample.
n_val = len(val_dataset)
n_logged = min(n_val, max(1, int(n_val * 0.2)))
logged_idxs = np.random.choice(n_val, n_logged, replace=False)
after_train_dataset = val_dataset[logged_idxs]
val_preds_logger = PredictionLogger(after_train_dataset)

callbacks = [
    lr_monitor,
    training_data_monitor,
    print_callback,
    early_stopping,
    checkpoint_callback,
    val_preds_logger,
]

# --- Logger and trainer ------------------------------------------------------
wandb_logger = WandbLogger(project="test-project")

trainer = Trainer(
    logger=wandb_logger,
    callbacks=callbacks,
    check_val_every_n_epoch=eval_freq,
    devices="auto",
    accelerator="auto",
    fast_dev_run=fast_dev_run,
)

# --- Loss --------------------------------------------------------------------
# "focal" selects the multilabel focal loss; "bce" and any other value use
# BCE-with-logits (the two non-focal branches were identical, so they are merged).
if wandb.config["loss_fn"] == "focal":
    loss = MultilabelFocalLoss(N_CLASSES, gamma=wandb.config["focal_loss_gamma"])
else:
    loss = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)

# --- Model -------------------------------------------------------------------
mlp_system = LitAbdPainPredictionMLP(
    INPUT_DIM,
    N_CLASSES,
    config=wandb.config,
    loss_fn=loss,
    layer_size=wandb.config["layer_size"],
    dropout=wandb.config["dropout"],
)

print(mlp_system, flush=True)

# --- Train -------------------------------------------------------------------
trainer.fit(mlp_system, train_loader, val_loader)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


LitAbdPainPredictionMLP(
  (fc1): Linear(in_features=20, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=5, bias=True)
  (bn1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (loss): MultilabelFocalLoss()
)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name    | Type                | Params
------------------------------------------------
0 | fc1     | Linear              | 336   
1 | fc2     | Linear              | 272   
2 | fc4     | Linear              | 85    
3 | bn1     | BatchNorm1d         | 32    
4 | bn2     | BatchNorm1d         | 32    
5 | dropout | Dropout             | 0     
6 | loss    | MultilabelFocalLoss | 0     
------------------------------------------------
757       Trainable params
0         Non-trainable params
757       Total params
0.003     Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

mean:  tensor(0.3528, device='cuda:0')
mean:  tensor(0.3479, device='cuda:0')
Validation Precision/macro: 0.5454545617103577
Validation Recall/macro: 0.6000000238418579
Training is started!


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: 0it [00:00, ?it/s]

  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"


mean:  tensor(0.4170, device='cuda:0', grad_fn=<MeanBackward0>)
# correct?:
 tensor(13, device='cuda:0')
tensor(2.0848, device='cuda:0', grad_fn=<MulBackward0>)
mean:  tensor(0.4294, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3980, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3517, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3532, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3739, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3268, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3119, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3910, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2980, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3212, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3360, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.3503, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2852, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  

Validating: 0it [00:00, ?it/s]

Metric validation_loss improved. New best score: 1.511


mean:  tensor(0.2646, device='cuda:0')
mean:  tensor(0.3858, device='cuda:0')
mean:  tensor(0.2809, device='cuda:0')
mean:  tensor(0.2774, device='cuda:0')
Validation Precision/macro: 0.75
Validation Recall/macro: 0.6000000238418579
mean:  tensor(0.2254, device='cuda:0', grad_fn=<MeanBackward0>)
# correct?:
 tensor(21, device='cuda:0')
tensor(1.1270, device='cuda:0', grad_fn=<MulBackward0>)
mean:  tensor(0.2204, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2544, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1902, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2151, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2632, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2797, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1819, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2677, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2750, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2061, device='

Validating: 0it [00:00, ?it/s]

Metric validation_loss improved by 0.176 >= min_delta = 1e-05. New best score: 1.335


mean:  tensor(0.2210, device='cuda:0')
mean:  tensor(0.4045, device='cuda:0')
mean:  tensor(0.1785, device='cuda:0')
mean:  tensor(0.2638, device='cuda:0')
Validation Precision/macro: 0.7272727489471436
Validation Recall/macro: 0.800000011920929
mean:  tensor(0.1863, device='cuda:0', grad_fn=<MeanBackward0>)
# correct?:
 tensor(20, device='cuda:0')
tensor(0.9314, device='cuda:0', grad_fn=<MulBackward0>)
mean:  tensor(0.1749, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2177, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1329, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1712, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1730, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1895, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1453, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2158, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2195, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1

Validating: 0it [00:00, ?it/s]

mean:  tensor(0.2055, device='cuda:0')
mean:  tensor(0.3835, device='cuda:0')
mean:  tensor(0.1917, device='cuda:0')
mean:  tensor(0.2940, device='cuda:0')
Validation Precision/macro: 0.7272727489471436
Validation Recall/macro: 0.800000011920929
mean:  tensor(0.1556, device='cuda:0', grad_fn=<MeanBackward0>)
# correct?:
 tensor(22, device='cuda:0')
tensor(0.7779, device='cuda:0', grad_fn=<MulBackward0>)
mean:  tensor(0.1602, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1473, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1238, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1241, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1483, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1316, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1306, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.1676, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.2058, device='cuda:0', grad_fn=<MeanBackward0>)
mean:  tensor(0.0

Validating: 0it [00:00, ?it/s]

Monitored metric validation_loss did not improve in the last 2 records. Best score: 1.335. Signaling Trainer to stop.


mean:  tensor(0.1897, device='cuda:0')
mean:  tensor(0.4541, device='cuda:0')
mean:  tensor(0.2178, device='cuda:0')
mean:  tensor(0.3142, device='cuda:0')
Validation Precision/macro: 0.800000011920929
Validation Recall/macro: 0.800000011920929
Training is done.


In [15]:
# Testing seems to work on a synthetic dataset