# Bert

### Bert 소개

* Transformer의 encoder 부분만 활용

* NLP 분야에 Fine-Tuning 개념 도입
* Masked Language Model[MLM] 뿐만아니라 Next Sentence Prediction[NSP]를 통해 학습

## JointEmbedding 
Bert Embedding 종류는 세가지

* Token Embeddings : token을 indices로 변경

* Segment Embeddings : 2개 문장의 단어를 구분하기 위해 0,1로 표시 ex) [0,0,0, ... 1,1,1]

* Position Embeddings : 전체 단어의 순번 

  <img alt='img0' src='./img/img0.png' style="width : 400px">

In [32]:
import torch
from torch import nn

class JointEmbedding(nn.Module) : 

    def __init__(self, vocab_size, size, device='cpu') :
        super().__init__()
        self.size = size
        self.device = device

        self.token_emb = nn.Embedding(vocab_size, size)
        self.segment_emb = nn.Embedding(vocab_size, size)

        self.norm =  nn.LayerNorm(size)

    def forward(self,input_tensor) : 
        # positional embbeding
        pos_tensor = self.attention_position(self.size, input_tensor)
        # segment embedding
        segment_tensor = torch.zeros_like(input_tensor).to(self.device)

        # embedding size의 반은 0 반은 1임
        sentence_size = input_tensor.size(-1)
        segment_tensor[:, sentence_size // 2 + 1:] = 1

        output = self.token_emb(input_tensor) + self.segment_emb(segment_tensor) + pos_tensor
        return self.norm(output)

    def attention_position(self,dim,input_tensor) :
        '''
        ????
        '''
        # input_tensor row 크기 
        batch_size = input_tensor.size(0)

        # 문장 길이
        sentence_size = input_tensor(-1)

        # pos 정의 longtype = int64
        pos = torch.arange(sentence_size, dtype=torch.long).to(self.device)

        # d = sentence 내 허용 token 개수
        d = torch.arange(dim, dtype=torch.long).to(self.device)
        d = (2*d /dim)

        # unsqueeze 공부해야할듯..
        pos = pos.unsqueeze(1)
        pos = pos / (1e4**d)

        pos[:, ::2] = torch.sin(pos[:, ::2])
        pos[:, 1::2] = torch.cos(pos[:, 1::2])

        # *pos는 처음 보는 방식인데
        return pos.expand(batch_size, *pos.size())

# 
    def numeric_position(self,dim,input_tensor) : 
        pos_tensor = torch.arange(dim,dtype=torch.long).to(self.device)
        return pos_tensor.expand_as(input_tensor)


    



### Bert 논문 기본 parameter
1. Encoder = 12
2. heads = 12
3. Hidden Layer(=embedding size) = 768
4. word piece = 30522(30522개 단어라는 말)
5. Parameter = 110M


### 110M parameter 계산하기 
* 30522*768 = 24M(embedding 단어)
* 12 encoder = 84M
    - 1 encoder = 7M
    - 세부사항은 상세링크 보기
* Dense Weight Matrix and Bias [768, 768] = 589824, [768] = 768, (589824 + 768 = 590592)
= 110M

    <a href='https://stackoverflow.com/questions/64485777/how-is-the-number-of-parameters-be-calculated-in-bert-model'>상세 링크</a>


In [33]:
import torch.nn

class Bert(nn.Module) : 
    def __init__(self,vocab_size,dim_input,dim_output, attention_heads = 12) -> None:
        '''
        vocab_size : input vocab total
        dim_input : (=hidden_layer= embedding_size) 768 
        dim_output : (=hidden_layer= embedding_size) 768
        '''
        super().__init__()
        self.embedding = JointEmbedding(vocab_size,dim_input)
        self.transformerEndoerLayer = nn.TransformerEncoderLayer(d_model=dim_input,nhead=attention_heads,activation='gelu')
        # bert Base 12 layer 
        self.transformerEncoder = nn.TransformerEncoder(self.transformerEndoerLayer,12)
        self.token_prediction_layer = nn.Linear(dim_input,vocab_size)
        
        self.softmax = nn.LogSoftmax(dim=-1)

        # 0 or 1 classification으로 변환
        self.classification_layer = nn.Linear(dim_input,2)

    def forward(self, input_tensor, attention_mask) : 
        embedded = self.embedding(input_tensor)
        encoded = self.transformerEncoder(input_tensor,attention_mask)

        token_predictions = self.token_prediction_layer(encoded)

        # 모든 행의 첫번째 단어(embedding)
        first_word = encoded[:, 0, :]

        return self.softmax(token_predictions), self.classification_layer(first_word)

    

In [34]:
### First_word 표현 방식 이해용
import torch.nn
import torch

a = torch.arange(0,100).reshape((4,5,5))


a[:,0,:]


tensor([[ 0,  1,  2,  3,  4],
        [25, 26, 27, 28, 29],
        [50, 51, 52, 53, 54],
        [75, 76, 77, 78, 79]])

## 모델 훈련 

In [35]:
from multiprocessing.sharedctypes import Value
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
import time
# dataset_load
import dataset

class BertTrainer : 
    def __init__(
        self, 
        model, 
        dataset, 
        log_dir, 
        checkpoint_dir, 
        print_progress =10, 
        print_accuracy = 50,
        batch_size=24, 
        learning_rate = 0.005, 
        epochs = 5,
        device = 'cpu'
        ):
        self.model = model  
        self.dataset = dataset  
        self.device = device
  
        self.batch_size = batch_size  
        self.epochs = epochs  
        self.current_epoch = 0 

        self.loader = DataLoader(self.dataset, batch_size=self.batch_size,shuffle=True) 
        self.writer = SummaryWriter(str(log_dir))  
        self.checkpoint_dir = checkpoint_dir  

        # NSP 용        
        # This loss combines a Sigmoid layer and the BCELoss in one single class.
        # BCE = Binary Cross Entropy    
        self.criterion = nn.BCEWithLogitsLoss().to(self.device)  

        # MLM 용
        # The negative log likelihood loss. It is useful to train a classification problem with C classes.
        self.ml_criterion = nn.NLLLoss(ignore_index=0).to(self.device)  
        self.optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.015)
    
    def train(self, epoch) :
        print(f"epoch 시작 {epoch}")

        prev = time.time()

        # gradient set
        average_nsp_loss = 0
        average_mlm_loss = 0

        # Gradient Descent 시작 
        for i, value in enumerate(self.loader) :
            index = i + 1
            inp, mask, inverse_token_mask, token_target, nsp_target = value
            self.optimizer.zero_grad()

            token,nsp = self.model(inp,mask)

            tm = inverse_token_mask.unsqueeze(-1).expand_as(token)
            token = token.masked_fill(tm,0)

            
            loss_token = self.ml_criterion(token.transpose(1,2), token_target)
            loss_nsp = self.criterion(nsp, nsp_target)

            loss = loss_token + loss_nsp
            average_mlm_loss += loss_nsp
            average_nsp_loss += loss_token


            loss.backword()
            self.optimizer.step()

            if index % self._print_every == 0:  
                elapsed = time.gmtime(time.time() - prev)  
                s = self.training_summary(elapsed, index, average_nsp_loss, average_mlm_loss)  
  
            if index % self._accuracy_every == 0:  
                s += self.accuracy_summary(index, token, nsp, token_target, nsp_target)  
  
            print(s)  

            # Gradient를 Reset 하는 이유 
            # we typically want to explicitly set the gradients to zero 
            # before starting to do backpropragation 
            # (i.e., updating the Weights and biases) 
            # because PyTorch accumulates the gradients on subsequent backward passes. 

            average_nsp_loss = 0
            average_mlm_loss = 0

        return loss

    def nsp_accuracy(result: torch.Tensor, target: torch.Tensor):
        # argmax(1) = 2차원에서 가장 큰 값 
        s = (result.argmax(1) == target.argmax(1)).sum()  
        return round(float(s / result.size(0)), 2)

    def token_accuracy(result: torch.Tensor, target: torch.Tensor, inverse_token_mask: torch.Tensor):
        r = result.argmax(-1).masked_select(~inverse_token_mask)  
        t = target.masked_select(~inverse_token_mask)  
        s = (r == t).sum()  
        return round(float(s / (result.size(0) * result.size(1))), 2)




### Torch.argmax(1)의미
* 
* argmax 중 2차원을 기준으로 가장 큰 값 찾기

In [36]:
import torch

a = torch.randint(0,100,(3,4))

print(a)
print(a.argmax(1))



tensor([[ 6, 71, 84, 83],
        [66, 33, 85, 77],
        [14, 79,  4, 75]])
tensor([2, 2, 1])


In [37]:
a = dataset.IMDBBertData("./data/IMDB Dataset.csv", should_include_text=True)

a.vocab(["here", "is", "the", "example"])


NameError: name 'pd' is not defined

In [None]:

BertTrainer(Bert,dataset,'./data/','./data/')

TypeError: object of type 'module' has no len()