# Multimodal BERT

- From scratch code from KAIST Pr4AI - ai504_12_bert_sol.ipynb
- Remove embedding matrix, vocab etc.

In [1]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.nn import LayerNorm
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.optim import Adam

import math
import tqdm
import random
import pickle
from collections import Counter

import warnings
warnings.filterwarnings(action='ignore')

### Resources

- https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
- https://coaxsoft.com/blog/building-bert-with-pytorch-from-scratch


# Transformer Dims Math

### # of attention heads & hidden width
- `hidden` or `d_model` mod `att_heads` must be 0, i.e. `d_model` must be a multiple of `# of heads`
- `hidden % attn_heads (12) = 0`
- `hidden` or `d_model` is the width of the former embedding vector. Each token will go into the Transformer as vector of this width.

In [2]:
12 * 32, 12 * 64, 12 * 128      # 12 heads and hidden size as multiples of 32, 64, 128
#GPT-1 = 12 layers and 12 heads, 64 dimensional states = 768 hidden size

(384, 768, 1536)

In [3]:
6 * 32, 6 * 64, 6 * 128        # 6 heads and hidden size as multiples of 32, 64, 128

(192, 384, 768)

- A typical language model has `seq_len` (# of words in each inout) of 512
- So typical sizes for each input in batch would be:
    - for 512 tokens each 768 wide - input to Transformer = `512 * 768 = 393,216` 
    - for 256 tokens each 768 wide - `512 * 768 = 196,608` 
- Overall input dim = `bs x seq_len x emb_width` 

## BERT From Scratch

In [2]:
class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention
    """

    def forward(self, query, key, value, mask=None, dropout=None):                         # query, key, value: (B, h, seq_len, d_k) eg.(B, 12, seq_len, 64)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))    # torch.transpose(input, dim0, dim1): The given dimensions dim0 and dim1 are swapped.
                                                                                           # scores: (B, h, seq_len, seq_len) 
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)                        # masked_fill(mask, value): Fills elements of self tensor with value where mask is True. 
                                                                               
        p_attn = F.softmax(scores, dim=-1)                                      # p_attn: (B, h, seq_len, seq_len) 

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn                              # torch.matmul(p_attn, value): (B, h, seq_len, d_k), p_attn: (B, h, seq_len, seq_len)
                                                                                

class MultiHeadedAttention(nn.Module):
    """
    Take in model size and number of heads.(d_model, h)
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0                                                 

        # We assume d_v always equals d_k
        self.d_k = d_model // h                                                 # eg. d_model = 768, h = 12, d_k = 64
        self.h = h

        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])  # 3 linear layer. WQ, WK, WV   eg.(768, 768(64 x 12))
        self.output_linear = nn.Linear(d_model, d_model)                        # 768->768
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):                            # input query, key, value will be x with (B, seq_len, d_model). 
        batch_size = query.size(0)

        # 1) Do all the linear projections in batch from d_model -> h x d_k     eg. 768 -> (12, 64)
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)  # l(x): (B, seq_len, d_model) -> (B, seq_len, h, d_k) -> (B, h, seq_len, d_k) 
                             for l, x in zip(self.linear_layers, (query, key, value))]    # output: (B, h, seq_len, d_k)  eg.(B, 12, seq_len, 64)

        # 2) Apply attention on all the projected vectors in batch.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)     # x:(B, h, N, d_k) eg.(B, 12, seq_len, 64), attn:(B, h, seq_len, seq_len) eg.(B, 12, seq_len, seq_len)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)    # x:(B, h, seq_len, d_k) -> (B, seq_len, h, d_k)  -> (B, seq_len, h * d_k)  eg.(B, seq_len, 768)
                                                                                      # When you call contiguous(), it actually makes a copy of tensor so the order of elements would be same as if tensor of same shape created from scratch. Normally you don't need to worry about this. 
                                                                                      # https://stackoverflow.com/questions/48915810/pytorch-contiguous
        return self.output_linear(x)                                                  # final output: (B ,seq_len, d_model) eg.(B, seq_len, 768) 

In [3]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):                                          #  size :  eg. 768
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):                                             # the input 'sublayer' will be Multihead attention or FF
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))                         

In [4]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)                                     # d_ff: d_model x 4,  eg. 768 x 4 = 3072
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()

    def forward(self, x):                                                       # x: (B, seq_len, d_model)
        return self.w_2(self.dropout(self.activation(self.w_1(x))))             # (B, seq_len, d_model)

In [5]:
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: head sizes of multi-head attention
        :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size
        :param dropout: dropout rate
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):           # x: (B, seq_len, hidden)   eg.(B, seq_len, 768)
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))    
        x = self.output_sublayer(x, self.feed_forward)                                            
        return self.dropout(x)            # (B, seq_len, hidden)      

#### Old NLP BERT - Delete

```python
class TokenEmbedding(nn.Embedding):                                             # torch.nn.Embedding(vocab_size, embedding_dim, padding_idx)
    def __init__(self, vocab_size, embed_size=512):                             # padding_idx=0 means 0th row in the embeding matrix is 0 vector.
        super().__init__(vocab_size, embed_size, padding_idx=0)


class PositionalEmbedding(nn.Module):

    def __init__(self, d_model, max_len=512):
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()                              # pe: (max_len, d_model)
        pe.require_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)                                  # (max_len, 1)  eg. (512, 1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()   # (d_model//2,) eg. (384,)
        a = position * div_term                                                                   # (max_len, d_model//2)  eg. (512, 384)
        pe[:, 0::2] = torch.sin(a)                            # seq = L[start:stop:step]    0::2 start 0, step_size = 2  
        pe[:, 1::2] = torch.cos(a)                            

        pe = pe.unsqueeze(0)                                                    # pe: (1, max_len, d_model)
        self.register_buffer('pe', pe)                                          # Buffers won’t be returned in model.parameters(), so that the optimizer won’t have a change to update them.
                                                                                # register_buffer(name, tensor)
    def forward(self, x):                         # x: (B, seq_len)
        return self.pe[:, :x.size(1)]             # (1, seq_len, d_model)


class SegmentEmbedding(nn.Embedding):
    def __init__(self, embed_size=512):
        super().__init__(3, embed_size, padding_idx=0)                          #0(padding), 1(first sentence), 2(second sentence)


class BERTEmbedding(nn.Module):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        2. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)
        sum of all these features are output of BERTEmbedding
    """

    def __init__(self, vocab_size, embed_size, dropout=0.1):                    # embed_size: eg. 768     
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param dropout: dropout rate
        """
        super().__init__()
        self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size)
        self.position = PositionalEmbedding(d_model=self.token.embedding_dim)
        self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence, segment_label):                                             # sequence: (B, seq_len), segment_label: (B, seq_len)
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)    # (B, seq_len, hidden), (1, seq_len, hidden), (B, seq_len, hidden), respectively. Broadcasting;  
        return self.dropout(x)
```


```python
class BERT(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """

        super().__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads

        # paper noted they used 4*hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = hidden * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)

        # multi-layers transformer blocks, deep network
        print(f"h (attn_heads): {attn_heads}, d_model (hidden): {hidden}, d_model%h = {hidden%attn_heads}")
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])

    def forward(self, x, segment_info, skip_emb=False):                                         # x: (B, seq_len),      segment_info: (B, seq_len)
        
        # attention masking for padded token
        # attention mask size: torch.ByteTensor([batch_size, 1, seq_len, seq_len])  Why? Because of 'scores' shape in Attention class: (B, h, seq_len, seq_len)
        # mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)        # (B, seq_len) -> (B, 1, seq_len) -> (B, seq_len, seq_len) -> (B, 1, seq_len, seq_len)

        # embedding the indexed sequence to sequence of vectors
        if not skip_emb:
            x = self.embedding(x, segment_info)

        print('x.shape: ', x.shape)
        # running over multiple transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask=None)

        return x
```

### BERT From Scratch (without embedding)

In [6]:
class BERTFromScratch(nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        """
        :param vocab_size: vocab_size of total words
        :param hidden: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param attn_heads: number of attention heads
        :param dropout: dropout rate
        """

        super().__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads

        # paper noted they used 4*hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = hidden * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        # self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden)

        # multi-layers transformer blocks, deep network
        print(f"h (attn_heads): {attn_heads}, d_model (hidden): {hidden}, d_model%h = {hidden%attn_heads}")
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)])

    def forward(self, x):                                         # x: (B, seq_len),      segment_info: (B, seq_len)
        
        # attention masking for padded token
        # attention mask size: torch.ByteTensor([batch_size, 1, seq_len, seq_len])  Why? Because of 'scores' shape in Attention class: (B, h, seq_len, seq_len)
        # mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1)        # (B, seq_len) -> (B, 1, seq_len) -> (B, seq_len, seq_len) -> (B, 1, seq_len, seq_len)

        print(f'x.shape from BERT: {x.shape}')
        # running over multiple transformer blocks
        for transformer in self.transformer_blocks:
            x = transformer.forward(x, mask=None)

        return x

In [7]:
bert = BERTFromScratch(hidden=768)
x = torch.randn(2, 10, 768)
# x = torch.tensor([[3., 19, 4, 2, 7, 7, 2, 0, 0, 0],               # (2, 10) 2=Batch_size, 10=seq_len 
#                   [3, 8, 7, 4, 2, 5, 6, 8, 2, 0]])
# segment_info = torch.randint(0, 3, (2, 10, 768))
# segment_info = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 0, 0, 0],     # (2, 10)          
#                              [1, 1, 1, 1, 1, 2, 2, 2, 2, 0]])
y = bert(x)
print('shape of input x:',x.size())
print('shape of output y:', y.size())

h (attn_heads): 12, d_model (hidden): 768, d_model%h = 0
x.shape from BERT: torch.Size([2, 10, 768])
shape of input x: torch.Size([2, 10, 768])
shape of output y: torch.Size([2, 10, 768])


In [8]:
### Total Parameters
params = sum([p.nelement() for p in bert.parameters()])
print(f"Total Parameters: , {params: ,}")      # nelement(): Alias for numel() 

Total Parameters: ,  85,054,464


## BERT PyTorch (Not Scratch) (without embedding)

In [5]:
# Create a BERT model class in PyTorch with 12 encoder layers, 12 attention heads, and hidden size 786
class BERT(nn.Module):
    def __init__(self, hidden=768, n_layers=12, attn_heads=12, dropout=0.1):
        super(BERT, self).__init__()
        self.hidden = hidden
        self.n_layers = n_layers
        self.attn_heads = attn_heads
        self.dropout = dropout

        # BERT consists of a stack of 12 identical encoder layers
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden,
            nhead=attn_heads,
            dim_feedforward=hidden * 4,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

    def forward(self, x):
        # x: [batch_size, seq_len, hidden]

        print(f"x.shape from BERT: {x.shape}")
        x = self.encoder(x)
        return x

## Pretrained feature vectors to BERT input 

### Realigning the vectors

In [10]:
x_test = torch.tensor([[0, 1., 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],           
                       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]])
x_test.shape

torch.Size([2, 15])

In [11]:
# x_test.view(2, 3, 5)
x_test.view(2, 3, 5)

tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.,  9.],
         [10., 11., 12., 13., 14.]],

        [[ 0.,  1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.,  9.],
         [10., 11., 12., 13., 14.]]])

### HAIM pretrained vectors to BERT input

In [13]:
modalities = {
    'Modality': ['Demographics', 'Chart events', 'Lab events', 'Procedure events', 'ECG notes', 'Echo notes', 'X-ray dense', 'X-ray multiple dense'],
    'col': ['de', 'ts_ce', 'ts_le', 'ts_pe', 'n_ecg', 'n_ech', 'vd', 'vmd'],
    'width': [6, 99, 242, 110, 768, 768, 1024, 1024]
}

modalities_df = pd.DataFrame(modalities)
modalities_df['percentage'] = (modalities_df.width / modalities_df.width.sum()) * 100
modalities_df

Unnamed: 0,Modality,col,width,percentage
0,Demographics,de,6,0.148478
1,Chart events,ts_ce,99,2.449889
2,Lab events,ts_le,242,5.988617
3,Procedure events,ts_pe,110,2.722098
4,ECG notes,n_ecg,768,19.005197
5,Echo notes,n_ech,768,19.005197
6,X-ray dense,vd,1024,25.340262
7,X-ray multiple dense,vmd,1024,25.340262


- HAIM pretrained vector width = **4,041**
- Number of modalities = **8**

**Options**
1. Consider each modality as a single token and convert it (with individual FF layers) to `emb_width` say `768` or `1536`
    - This will result in `8 * 768 = 6144` or `8 * 1536 = 12,288` per input patient
    - Model only has a seq_len of 8
2. Proportionally scale each modality (with individual FF layers) to add up to a max of `512 * 768 = 393,216` or `768 * 256 = 196,608`
    - Model still only has a seq_len of 8 but the emb_widths add up to look like a normal Transformer
    - Model could be made to use 512 or 256 tokens but will that look similar to option 3?
3. Scale the 4,041 with a FFN to 393,216 or 196,608, then chop that vector into 512 or 256 "tokens"

In [10]:
small_x = pd.read_csv('../dataset/mimiciv/mit_pretrained/small_x.csv')
small_y = pd.read_csv('../dataset/mimiciv/mit_pretrained/small_y.csv')

In [11]:
# pick first 3840 columns from small_x
x_df = small_x.iloc[:, :-1]
x_df

Unnamed: 0,de_0,de_1,de_2,de_3,de_4,de_5,vd_0,vd_1,vd_2,vd_3,...,n_ech_758,n_ech_759,n_ech_760,n_ech_761,n_ech_762,n_ech_763,n_ech_764,n_ech_765,n_ech_766,n_ech_767
0,72.0,1.0,6.0,-1.0,1.0,1.0,0.004280,0.026188,0.263595,0.000158,...,0.008949,-0.101050,0.210530,-0.131684,-0.138715,-0.245273,0.038294,0.984190,-0.270763,0.999857
1,82.0,1.0,7.0,1.0,1.0,1.0,0.012419,0.003249,0.116324,0.000791,...,0.038696,-0.130439,0.102237,0.025674,-0.135884,-0.178948,-0.064230,0.998583,-0.202867,0.999945
2,69.0,0.0,7.0,1.0,1.0,1.0,0.000000,0.020264,0.444775,0.032252,...,0.039694,-0.141062,-0.005484,-0.036408,-0.018071,-0.271337,0.008223,0.972547,-0.268833,0.999649
3,70.0,1.0,7.0,1.0,1.0,2.0,0.000000,0.080409,0.463351,0.000000,...,0.003432,-0.111446,0.232872,-0.162072,-0.080258,-0.307298,-0.025907,0.960139,-0.339089,0.999677
4,73.0,0.0,7.0,1.0,1.0,1.0,0.000000,0.086402,0.225096,0.003878,...,0.064002,-0.164014,-0.018618,-0.031740,-0.092928,-0.242412,0.084629,0.995608,-0.322973,0.999862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,70.0,1.0,2.0,1.0,1.0,1.0,0.000000,0.023259,0.323456,0.002986,...,0.146055,-0.193611,0.021812,-0.045600,-0.126873,-0.155373,-0.026748,0.996968,-0.111226,0.999883
66,56.0,1.0,7.0,2.0,1.0,1.0,0.000354,0.152932,0.241619,0.011037,...,0.104565,-0.132154,0.054910,0.014190,-0.041957,-0.229654,-0.031166,0.998506,-0.178801,0.999936
67,55.0,0.0,2.0,2.0,1.0,1.0,0.002057,0.002398,0.132481,0.013952,...,0.134478,-0.053509,0.013095,-0.077658,-0.158625,-0.060205,-0.057243,0.987958,-0.210916,0.999612
68,52.0,0.0,2.0,2.0,1.0,2.0,0.000727,0.006700,0.110596,0.004311,...,-0.013853,-0.159797,0.033823,-0.014833,-0.040662,-0.189048,-0.094771,0.986687,-0.243513,0.999846


In [12]:
x = torch.FloatTensor(x_df.loc[:5].values)
x.shape

torch.Size([6, 4041])

## Multimodal BERT Class

In [24]:
class MultimodalBERT(nn.Module):
    """
    Multimodal BERT
    Next Sentence Prediction Model + Masked Language Model
    """

    def __init__(
        self, vector_width=4041, seq_len=256, hidden=768, n_layers=12, attn_heads=12, dropout=0.1
    ):
        """
        :param bert: BERT model which should be trained
        """

        super().__init__()
        self.preprocessor = VectorPreProcessor(vector_width=vector_width, hidden=hidden, seq_len=seq_len)
        self.bert = BERT(hidden=hidden, n_layers=n_layers, attn_heads=attn_heads, dropout=dropout)
        self.predictor = MultiLabelPredictor(hidden=hidden)

    def forward(self, x):  # x: (B, 4041)
        x = self.preprocessor(x)  # x: (B, 256, 768)
        x = self.bert(x)  # x: (B, seq_len, hidden)
        x = self.predictor(x)  # x: (B, 14)
        return x  # x: (B, 14)


class VectorPreProcessor(nn.Module):
    """
    Preprocesses pretrained vectors before sending to BERT
    """

    def __init__(self, vector_width=4041, hidden=768, seq_len=256):
        """
        :param vector_width: pretrained vector width
        :param hidden: BERT model hidden size
        :param seq_len: BERT model sequence length
        """
        super().__init__()
        self.hidden = hidden
        self.seq_len = seq_len
        self.linear = nn.Linear(
            vector_width, hidden * seq_len
        )  # 4041 -> 768*256 = 196,608

    def forward(self, x):
        print(f"x.shape from VectorPreProcessor: {x.shape}")
        x = self.linear(x)  # 4041 -> 768*256 = 196,608
        return x.view(x.shape[0], self.seq_len, self.hidden)  # 196,608 -> (B, 256, 768)


class MultiLabelPredictor(nn.Module):
    """
    14-class multi label classification model on top of BERT
    """

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super().__init__()
        self.linear = nn.Linear(hidden, 13)
        # self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        print(f"x.shape from MultiLabelPredictor: {x.shape}")
        return self.linear(x[:, 0])  # x[:, 0]: (B, hidden) -> (B, 14)

#### BERT From Scratch

In [36]:
bert_scratch = BERTFromScratch()                           # hidden=768, n_layers=12, attn_heads=12, dropout=0.1
mm_bert_1 = MultimodalBERT(bert_scratch).cuda()
x = x.cuda()

%time y = mm_bert_1(x)

print('shape of input x:',x.size())
print('shape of output y:', y.size())

h (attn_heads): 12, d_model (hidden): 768, d_model%h = 0
x.shape from VectorPreProcessor: torch.Size([6, 4041])
x.shape from BERT: torch.Size([6, 256, 768])
x.shape from MultiLabelPredictor: torch.Size([6, 256, 768])
CPU times: user 12.6 ms, sys: 206 µs, total: 12.8 ms
Wall time: 12.7 ms
shape of input x: torch.Size([6, 4041])
shape of output y: torch.Size([6, 14])


In [37]:
print(f"{torch.cuda.memory_allocated():,}")

5,428,692,992


In [38]:
mm_bert_1.cpu()

MultimodalBERT(
  (preprocessor): VectorPreProcessor(
    (linear): Linear(in_features=4041, out_features=196608, bias=True)
  )
  (bert): BERTFromScratch(
    (transformer_blocks): ModuleList(
      (0-11): 12 x TransformerBlock(
        (attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0-2): 3 x Linear(in_features=768, out_features=768, bias=True)
          )
          (output_linear): Linear(in_features=768, out_features=768, bias=True)
          (attention): Attention()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=768, out_features=3072, bias=True)
          (w_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation): GELU(approximate='none')
        )
        (input_sublayer): SublayerConnection(
          (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    

In [39]:
print(f"{torch.cuda.memory_allocated(): ,}")

 2,249,455,104


#### BERT 

In [40]:
bert = BERT()                                       # hidden=768, n_layers=12, attn_heads=12, dropout=0.1
mm_bert_2 = MultimodalBERT(bert).to("cuda")                    
x = x.to("cuda")

%time y = mm_bert_2(x)

print('shape of input x:',x.size())
print('shape of output y:', y.size())

x.shape from VectorPreProcessor: torch.Size([6, 4041])
x.shape from BERT: torch.Size([6, 256, 768])
x.shape from MultiLabelPredictor: torch.Size([6, 256, 768])
CPU times: user 9.31 ms, sys: 458 µs, total: 9.77 ms
Wall time: 9.64 ms
shape of input x: torch.Size([6, 4041])
shape of output y: torch.Size([6, 14])


In [22]:
def count_parameters(bert, mm_bert):
    ### Count parameters and difference between BERT and Multimodal BERT
    bert_params = sum([p.nelement() for p in bert.parameters()])
    mm_bert_params = sum([p.nelement() for p in mm_bert.parameters()])
    print(f"Total Parameters in BERT: , {bert_params: ,}")      # nelement(): Alias for numel() 
    print(f"Total Parameters in Multimodal BERT: , {mm_bert_params: ,}")
    print(f"Difference: {mm_bert_params - bert_params: ,} or {mm_bert_params // bert_params}X more params")

In [42]:
count_parameters(bert_scratch, mm_bert_1)

Total Parameters in BERT: ,  85,054,464
Total Parameters in Multimodal BERT: ,  879,754,766
Difference:  794,700,302 or 10X more params


In [43]:
count_parameters(bert, mm_bert_2)

Total Parameters in BERT: ,  85,054,464
Total Parameters in Multimodal BERT: ,  879,754,766
Difference:  794,700,302 or 10X more params


In [44]:
print(f"PreProcessor: {sum([p.nelement() for p in mm_bert_2.preprocessor.parameters()]): ,}")

PreProcessor:  794,689,536


# Next Steps

**Fusion Options**
- Each modality = 1 token - so seq_len = 8 tokens 
- Proportional scaling
	- Convolutional encoding
	- PCA
- Scale and chop

**Training**
- W&B hyperparam sweep
	- \# of heads
	- seq length
	- \# of encoder layers
	- Fusion option
- Early stopping

In [2]:
import sys
sys.path.append('../')


In [3]:
from lemoncake.data import *

In [4]:
train_ds, val_ds, test_ds = get_datasets()

In [5]:
train_ds.x.shape, val_ds.x.shape, test_ds.x.shape

((43738, 4041), (321, 4041), (991, 4041))

In [6]:
train_ds.y.shape, val_ds.y.shape, test_ds.y.shape

((43738, 13), (321, 13), (991, 13))

In [7]:
dls = get_dataloaders({'train': train_ds, 'val': val_ds, 'test': test_ds}, batch_size=32)

In [8]:
train_dl, val_dl, test_dl = dls['train'], dls['val'], dls['test']

In [9]:
batch = next(iter(train_dl))
x, y = batch['x'], batch['y']

In [10]:
x.shape, y.shape

(torch.Size([32, 4041]), torch.Size([32, 13]))

In [25]:
mimic_bert = MultimodalBERT(           # vector_width=4041, seq_len=256, dropout=0.1
    hidden=384, 
    n_layers=6, 
    attn_heads=6
).to("cuda")

x = x.to("cuda")

In [26]:
%time y = mimic_bert(x)

print('shape of input x:',x.size())
print('shape of output y:', y.size())

x.shape from VectorPreProcessor: torch.Size([32, 4041])
x.shape from BERT: torch.Size([32, 256, 384])
x.shape from MultiLabelPredictor: torch.Size([32, 256, 384])
CPU times: user 19.8 ms, sys: 3.97 ms, total: 23.8 ms
Wall time: 22.8 ms
shape of input x: torch.Size([32, 4041])
shape of output y: torch.Size([32, 13])


In [27]:
count_parameters(mimic_bert.bert, mimic_bert)

Total Parameters in BERT: ,  10,646,784
Total Parameters in Multimodal BERT: ,  407,996,557
Difference:  397,349,773 or 38X more params


# Lightning Models 

In [11]:
from lemoncake.model import *
from pytorch_lightning import Trainer, seed_everything

In [17]:
model = MultimodalBERT(hidden=192, 
    n_layers=6, 
    attn_heads=6)

In [18]:
seed_everything(42, workers=True)
trainer = Trainer(max_epochs=1, precision=16)

Global seed set to 42
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
trainer.fit(model, train_dl, val_dl)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                | Params
-----------------------------------------------------
0 | preprocessor | VectorPreProcessor  | 198 M 
1 | bert         | BERT                | 2.7 M 
2 | predictor    | MultiLabelPredictor | 2.5 K 
-----------------------------------------------------
201 M     Trainable params
0         Non-trainable params
201 M     Total params
805.376   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


# References
- [Sentiment Analysis with BERT and Transformers by Hugging Face using PyTorch and Python](https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/)
- [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
- [L11 Language Models - Alec Radford (OpenAI)](https://www.youtube.com/watch?v=BnpB3GrpsfM)
- [The Illustrated BERT, ELMo, and co.](https://jalammar.github.io/illustrated-bert/)
- [BERT Fine-Tuning Tutorial with PyTorch](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)
- [How to Fine-Tune BERT for Text Classification?](https://arxiv.org/pdf/1905.05583.pdf)
- [Huggingface Transformers](https://huggingface.co/transformers/)
- [BERT Explained: State of the art language model for NLP](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270)
- [BERT implementation](https://github.com/codertimo/BERT-pytorch)