In [1]:
# import pandas as pd
# import torch.utils.data

In [2]:
# Put the parent directory on the import path so the local `lemoncake` package
# can be imported as-is, without copying or modifying its code.
import sys

PACKAGE_PARENT_DIR = '../'
sys.path.append(PACKAGE_PARENT_DIR)

In [3]:
from lemoncake.data import get_datasets, get_dataloaders

In [4]:
# Build the train / validation / test datasets (called with no arguments).
train_ds, val_ds, test_ds = get_datasets()

In [5]:
# Per-label counts for the training split (presumably positive samples per label).
train_ds.label_counts

{'Atelectasis': 14336.0,
 'Cardiomegaly': 15279.0,
 'Consolidation': 3558.0,
 'Edema': 10310.0,
 'Enlarged Cardiomediastinum': 2309.0,
 'Fracture': 508.0,
 'Lung Lesion': 860.0,
 'Lung Opacity': 13235.0,
 'No Finding': 5013.0,
 'Pleural Effusion': 20322.0,
 'Pleural Other': 305.0,
 'Pneumonia': 3730.0,
 'Pneumothorax': 2902.0}

In [6]:
# Number of samples in the training split.
len(train_ds)

43738

In [7]:
# Positive-class weights for each split (train, validation, test).
# NOTE(review): the validation tensor in the saved output contains `inf`
# (presumably a label with no positive samples in that split). An infinite
# weight can turn a weighted BCE loss into NaN — confirm how it is handled.
train_ds.get_pos_weights(), val_ds.get_pos_weights(), test_ds.get_pos_weights()

(tensor([  2.,   2.,  11.,   3.,  18.,  85.,  50.,   2.,   8.,   1., 142.,  11.,
          14.]),
 tensor([ 3.,  2., 20.,  3., 22., inf, 39.,  2.,  8.,  2., 52.,  9., 10.]),
 tensor([ 1.,  2., 16.,  2., 14., 75., 46.,  2., 13.,  1., 82., 10., 19.]))

In [8]:
# One dataloader per split, all sharing the same batch size.
BATCH_SIZE = 32
datasets_by_split = {'train': train_ds, 'valid': val_ds, 'test': test_ds}
dls = get_dataloaders(datasets_by_split, batch_size=BATCH_SIZE)

In [9]:
# Pull the three loaders out of the dict under short names.
train_dl, valid_dl, test_dl = (dls[split] for split in ('train', 'valid', 'test'))

In [10]:
# batch = next(iter(train_dl))
# x, y = batch['x'], batch['y']
# x.shape, y.shape

# Lightning Models 

In [11]:
# NOTE(review): the wildcard import is presumably what brings `MultimodalBERT`
# and `nn` into scope for the cells below — explicit imports would make the
# origin of those names visible. Confirm what else the notebook relies on.
from lemoncake.model import *
from pytorch_lightning import Trainer, seed_everything

```python
model = MultimodalBERT(
    train_ds.get_pos_weights(),
    val_ds.get_pos_weights(),
    hidden=384,
    n_layers=6,
    attn_heads=6,
)
```

In [12]:
# Per-label positive-class weights, presumably used as `pos_weight` by the
# model's training and validation BCEWithLogitsLoss.
train_pos_weights = train_ds.get_pos_weights()

# The validation weights contain `inf` (see the `get_pos_weights()` output),
# presumably for a label with no positive samples in that split. In
# BCEWithLogitsLoss an infinite pos_weight multiplied by a zero target gives
# NaN, which would poison the whole validation loss. Replace +inf with the
# neutral weight 1.0; finite entries are left unchanged.
valid_pos_weights = val_ds.get_pos_weights().nan_to_num(posinf=1.0)

# Deliberately small configuration (hidden=32, 2 layers, 1 head) for fast runs.
model = MultimodalBERT(
    train_pos_weights,
    valid_pos_weights,
    hidden=32,
    n_layers=2,
    attn_heads=1,
)

In [13]:
# Slice the first ten items straight from the dataset (bypassing the dataloader).
# NOTE: `batch` is reassigned from `train_dl` a couple of cells below.
batch = train_ds[0:10]

In [14]:
# x, y = batch["x"], batch["y"]
# x.shape, y.shape, x.device, y.device

In [15]:
# Take a single mini-batch from the training loader and inspect tensor shapes.
batch = next(iter(train_dl))
x = batch['x']
y = batch['y']
x.shape, y.shape

(torch.Size([32, 4041]), torch.Size([32, 13]))

In [42]:
# Put the model on the same device as the sample batch, then show where it lives.
batch_device = x.device
model.to(batch_device)
model.device

device(type='cpu')

In [43]:
# Forward pass on the sample batch; the output shape should match the target shape.
y_hat = model(x)
y_hat.shape

torch.Size([32, 13])

In [44]:
# Target shape, for comparison with the prediction shape.
y.shape  # , y

torch.Size([32, 13])

In [45]:
# Prediction shape, for comparison with the target shape.
y_hat.shape  # , y_hat

torch.Size([32, 13])

In [46]:
# Sanity check: weighted multi-label BCE on the sample batch, using the
# training-split positive-class weights.
criterion = nn.BCEWithLogitsLoss(pos_weight=train_ds.get_pos_weights())
criterion(y_hat, y)

tensor(1.3340, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

## Initializing Weights

- Karpathy's references
    - [Comment about init in the README](https://github.com/karpathy/ng-video-lecture)
    - [nanoGPT init](https://github.com/karpathy/nanoGPT/blob/a82b33b525ca9855d705656387698e13eb8e8d4b/model.py#L147)
- StackExchange - [Is there a proper initialization technique for the weight matrices in multi-head attention?](https://ai.stackexchange.com/questions/30491/is-there-a-proper-initialization-technique-for-the-weight-matrices-in-multi-head)

In [13]:
# Show the module tree.
# NOTE(review): the saved output below lists a 6-layer encoder with 384-dim
# layers, so it appears to come from the larger configuration rather than the
# `hidden=32, n_layers=2` model built in the code cell — re-run to refresh.
model

MultimodalBERT(
  (train_loss_fn): BCEWithLogitsLoss()
  (valid_loss_fn): BCEWithLogitsLoss()
  (preprocessor): VectorPreProcessor(
    (linear): Linear(in_features=4041, out_features=98304, bias=True)
  )
  (bert): BERT(
    (pos_encoder): PositionalEncoding1(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=384, out_features=384, bias=True)
          )
          (linear1): Linear(in_features=384, out_features=1536, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1536, out_features=384, bias=True)
          (norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): 

### fastai Method

In [35]:
# Notebook-level accumulator for the messages recorded by `init_multimodalbert`.
thelist = []

In [40]:
def init_multimodalbert(m, initrange, zero_bn=False, *, apply=False, log=None):
    """Walk a module tree and record (optionally perform) ``nn.Linear`` init.

    For every ``nn.Linear`` in ``m`` and its descendants, one message per
    parameter is appended to ``log``: weights are reported as
    ``kaiming_normal_()`` and biases as ``constant_(0.0)``.

    By default this is a dry run that only records the messages and leaves
    all weights untouched. Pass ``apply=True`` to run the initializers in place.

    Args:
        m: Module to visit; its children are visited recursively.
        initrange: Intended range for a uniform embedding init. Currently
            unused because the embedding branch is disabled; kept so existing
            calls keep working.
        zero_bn: Intended switch for zero-initializing batch-norm weights.
            Currently unused because the batch-norm branch is disabled.
        apply: If True, call ``nn.init.kaiming_normal_`` on weights and
            ``nn.init.constant_(param, 0.0)`` on biases. If False, only log.
        log: List that receives the messages. When None, the notebook-level
            ``thelist`` is used, as before.

    Returns:
        None. Results are observed through ``log`` and, with ``apply=True``,
        through the modified parameters.
    """
    if log is None:
        # Fall back to the notebook-global accumulator so existing calls keep working.
        log = thelist

    # NOTE: the embedding branch (uniform_(-initrange, initrange) for
    # nn.Embedding / nn.EmbeddingBag) and the batch-norm branch
    # (constant 0.0 if zero_bn else 1.0 for nn.BatchNorm1d) are disabled.
    # NOTE(review): only nn.Linear instances are handled. Parameters owned
    # directly by another module type (e.g. an attention layer's packed input
    # projection, if it is not an nn.Linear) are presumably skipped — confirm.
    if isinstance(m, nn.Linear):
        for name, param in m.named_parameters():
            if "bias" in name:
                if apply:
                    nn.init.constant_(param, 0.0)
                log.append(f"Initialized {name} with constant_(0.0)")
            elif "weight" in name:
                if apply:
                    nn.init.kaiming_normal_(param)
                log.append(f"Initialized {name} with kaiming_normal_()")

    # Recurse so nested containers (encoder layers, etc.) are covered too.
    for child in m.children():
        init_multimodalbert(child, initrange, zero_bn, apply=apply, log=log)



In [41]:
# Walk the whole model. As called here this only records messages in `thelist`;
# no weights are modified.
init_multimodalbert(model, 0.02, zero_bn=True)

In [42]:
# Number of messages recorded so far (one per weight/bias of each nn.Linear visited).
len(thelist)

152

### Karpathy Method

In [46]:
    # def _init_weights(self, module):
    #         if isinstance(module, nn.Linear):
    #             # torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    #             nn.init.kaiming_normal_(module.weight)
    #             if module.bias is not None:
    #                 torch.nn.init.zeros_(module.bias)
    #         # elif isinstance(module, nn.Embedding):
    #         #     torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

# List every parameter whose name ends in 'proj.weight', together with its shape.
for param_name, tensor in model.named_parameters():
    if not param_name.endswith('proj.weight'):
        continue
    print(param_name, tensor.shape)

bert.encoder.layers.0.self_attn.out_proj.weight torch.Size([384, 384])
bert.encoder.layers.1.self_attn.out_proj.weight torch.Size([384, 384])
bert.encoder.layers.2.self_attn.out_proj.weight torch.Size([384, 384])
bert.encoder.layers.3.self_attn.out_proj.weight torch.Size([384, 384])
bert.encoder.layers.4.self_attn.out_proj.weight torch.Size([384, 384])
bert.encoder.layers.5.self_attn.out_proj.weight torch.Size([384, 384])


## Train

In [13]:
# Fix the global RNG seed (dataloader workers included) and configure a short
# mixed-precision training run.
# NOTE(review): `model` is constructed in an earlier cell, before this seed is
# set, so its initial weights are presumably not covered by the seed — confirm.
SEED = 42
MAX_EPOCHS = 2

seed_everything(SEED, workers=True)
trainer = Trainer(max_epochs=MAX_EPOCHS, precision='16-mixed')

Global seed set to 42
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
# Train `model` on `train_dl`, validating on `valid_dl`, with the trainer configured above.
trainer.fit(model, train_dl, valid_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                | Params
------------------------------------------------------
0 | train_loss_fn | BCEWithLogitsLoss   | 0     
1 | valid_loss_fn | BCEWithLogitsLoss   | 0     
2 | preprocessor  | VectorPreProcessor  | 33.1 M
3 | bert          | BERT                | 25.4 K
4 | predictor     | MultiLabelPredictor | 429   
5 | train_metrics | MetricCollection    | 0     
6 | valid_metrics | MetricCollection    | 0     
7 | test_metrics  | MetricCollection    | 0     
------------------------------------------------------
33.1 M    Trainable params
0         Non-trainable params
33.1 M    Total params
132.552   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


#### Big Model

In [None]:
# Same fit call. The saved summary below reports 407 M trainable parameters,
# i.e. `model` was built with a larger configuration for this run.
trainer.fit(model, train_dl, valid_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                | Params
------------------------------------------------------
0 | train_loss_fn | BCEWithLogitsLoss   | 0     
1 | valid_loss_fn | BCEWithLogitsLoss   | 0     
2 | preprocessor  | VectorPreProcessor  | 397 M 
3 | bert          | BERT                | 10.6 M
4 | predictor     | MultiLabelPredictor | 5.0 K 
------------------------------------------------------
407 M     Trainable params
0         Non-trainable params
407 M     Total params
1,631.986 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [15]:
# Load the TensorBoard notebook extension so the `%tensorboard` magic is available.
%load_ext tensorboard

In [16]:
# Open TensorBoard inline on the local `lightning_logs` directory.
%tensorboard --logdir "./lightning_logs/"

# Open Issues / Questions

1. train, valid, test split
2. What to do about "-1" in labels
    - currently doing `y = y.fillna(0).replace(-1, 0)`
    - i.e. replacing `NaN` and `-1` with zeros
    - According to the HAIM paper, `-1` means "not determined", and they eliminated everything other than 1 and 0 from their training.
3. Model size options