논문 요약 
* Bert로 Sentence Embedding을 만드는 방법을 소개

* 학습 데이터 유형(Classification, Regression)별 Sentence Embedding을 생성하는 구조 소개

* 다른 Sentence Embedding 모델(GloVe, Universal Sentence Encoder)보다 성능이 우수함을 입증

### 학습 데이터 유형에 맞는 Sentence Embedding 구조 소개

#### Regression 데이터에 맞는 Sentence Embedding 구조
> 모델을 STS로 Finetuning 하는 방법이 아님, STS 데이터를 어떻게 Sentence Embedding에 넣을 것인지에 대한 내용임

* STS Task는 두 문장의 유사도를 0~5 범위의 점수로 산출함. 

* 이러한 데이터를 기반으로 학습하기 위해선 아래의 구조가 필요

<img src='img/SBERT_Siamese_Network.png' alt='siamese' width='300px'>

##### Transformers로 모델 불러오기

In [None]:
from transformers import ElectraModel,ElectraTokenizer
# Checkpoint identifier shared by the encoder and its tokenizer.
checkpoint_name = "monologg/koelectra-base-v3-discriminator"

# Load the tokenizer and the pretrained encoder from the same checkpoint.
tokenizer = ElectraTokenizer.from_pretrained(checkpoint_name)
model = ElectraModel.from_pretrained(checkpoint_name)

##### KorSTS 데이터 불러오기

In [349]:
import pandas as pd

# Read the raw KorSTS training split. The encoding is given explicitly because
# the file holds Korean text and the platform default is not always UTF-8.
with open('data/KorSTS/sts-train.tsv', encoding='utf-8') as f:
    v = f.readlines()

# Split every line into its tab-separated fields; the first row is the header.
lst = [i.rstrip('\n').split('\t') for i in v]

# Use the header row itself (a flat list of names) as the column labels.
data = pd.DataFrame(lst[1:], columns=lst[0])
# Keep only the sentence pair and its similarity score, under short names.
data = data[['sentence1', 'sentence2', 'score']]
data.columns = ['sen1', 'sen2', 'score']
data.head(3)

Unnamed: 0,sen1,sen2,score
0,비행기가 이륙하고 있다.,비행기가 이륙하고 있다.,5.0
1,한 남자가 큰 플루트를 연주하고 있다.,남자가 플루트를 연주하고 있다.,3.8
2,한 남자가 피자에 치즈를 뿌려놓고 있다.,한 남자가 구운 피자에 치즈 조각을 뿌려놓고 있다.,3.8


##### Huggingface Dataset으로 데이터 전환하기

In [None]:
from datasets import Dataset

# Convert the pandas frame into a `datasets.Dataset` for use with the Trainer.
train_data_set = Dataset.from_pandas(data)

# Look at the first converted example.
train_data_set[0]

### Pooling Model 만들기

In [303]:
import torch.nn as nn 
import torch


class modelWithPooling(nn.Module):
    """Wrap an encoder and pool its token embeddings into one vector per sentence.

    Args:
        model: Encoder to wrap. It is called with the keyword arguments given
            to ``forward`` and must return a mapping that contains
            ``'last_hidden_state'``.
        pooling_type: Pooling strategy, one of ``'cls'``, ``'max'`` or
            ``'mean'`` (default).

    Raises:
        ValueError: If ``pooling_type`` is not a supported strategy.
    """

    _SUPPORTED_POOLING = ('cls', 'max', 'mean')

    def __init__(self, model, pooling_type='mean') -> None:
        super().__init__()

        # Fail early instead of hitting an unbound result inside forward().
        if pooling_type not in self._SUPPORTED_POOLING:
            raise ValueError(
                f"Unsupported pooling_type {pooling_type!r}; "
                f"expected one of {self._SUPPORTED_POOLING}"
            )

        self.model = model  # base model
        self.pooling_type = pooling_type  # selected pooling strategy

    def forward(self, **kwargs):
        """Encode the inputs and pool the token embeddings.

        Args:
            **kwargs: Forwarded unchanged to the wrapped encoder. Must include
                ``attention_mask`` of shape [batch_size, src_token].

        Returns:
            dict: ``{'sentence_embedding': tensor}`` where the tensor has
            shape [batch_size, embed_size].

        Raises:
            ValueError: If ``pooling_type`` was changed to an unsupported value
                after construction.
        """
        features = self.model(**kwargs)
        attention_mask = kwargs['attention_mask']  # [batch_size, src_token]
        last_hidden_state = features['last_hidden_state']  # [batch_size, src_token, embed_size]

        if self.pooling_type == 'cls':
            # Take only the first ([CLS]) position of every sequence.
            result = last_hidden_state[:, 0]  # [batch_size, embed_size]
        else:
            # Broadcast the mask over the embedding dimension:
            # 1.0 for real tokens, 0.0 for padding.
            input_mask_expanded = (
                attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            )  # [batch_size, src_token, embed_size]

            if self.pooling_type == 'max':
                # Element-wise maximum over the tokens. Padding positions get a
                # large negative value so they never win; masked_fill returns a
                # new tensor, leaving the encoder output untouched.
                masked_hidden_state = last_hidden_state.masked_fill(
                    input_mask_expanded == 0, -1e9
                )
                result = torch.max(masked_hidden_state, 1)[0]
            elif self.pooling_type == 'mean':
                # Sum the non-padding token embeddings ...
                sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
                # ... and divide by the number of real tokens, clamped to avoid
                # division by zero on a fully masked sequence.
                sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                result = sum_embeddings / sum_mask
            else:
                raise ValueError(f"Unsupported pooling_type {self.pooling_type!r}")

        # [batch_size, src_token, embed_size] => [batch_size, embed_size]
        return {'sentence_embedding': result}

    

In [304]:
# Wrap the encoder; pooling_type is left at its default ('mean').
model_with_pooling = modelWithPooling(model=model)

### collator 구현

In [305]:
from torch.utils.data import DataLoader

def smart_batching_collate(batch):
    """Collate sentence-pair examples into tokenized inputs and float scores.

    :param batch:
        list of example dicts, each holding the keys ``'sen1'``, ``'sen2'``
        and ``'score'``
    :return:
        ``(sentence_features, labels)``: ``sentence_features`` is a two-element
        list with the tokenizer output for all first sentences and for all
        second sentences; ``labels`` is a float tensor with one score per
        example
    :raises KeyError:
        if an example lacks one of the expected keys
    """
    # Direct key access keeps the three lists aligned; a missing key fails
    # loudly instead of silently shifting sentences against their labels.
    first_sentences = [example['sen1'] for example in batch]
    second_sentences = [example['sen2'] for example in batch]
    labels = torch.tensor([float(example['score']) for example in batch])

    # Tokenize each side of the pair separately (module-level `tokenizer`).
    sentence_features = [
        tokenizer(sentences, return_tensors='pt', truncation=True, padding=True)
        for sentences in (first_sentences, second_sentences)
    ]

    return sentence_features, labels

#### Data 구조 파악 및 Cosine Similarity Loss Function 이해가 필요한 경우 아래 코드를 활용할 것

In [None]:
# from pprint import pprint
# featured_data = next(iter(DataLoader(train_data_set,batch_size=4,collate_fn=smart_batching_collate)))

# feature, labels = featured_data

# pprint(feature)


# embeddings = [model_with_pooling(**input_data)['sentence_embedding'] for input_data in feature]

# cos_score_transformation=nn.Identity()

# # cosine similarity
# output = cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1]))

# # Loss function 정의
# mse = nn.MSELoss()

# # normalization
# labels = (labels/5)

# # Loss 계산
# loss = mse(output, labels.view(-1))

# print(torch.cosine_similarity(embeddings[0], embeddings[1]))
# print(cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1])))

### Custom Trainer 생성하기


In [None]:
from transformers import TrainingArguments, TrainerCallback,Trainer,DataCollatorForLanguageModeling

class Classification_model_trainer(Trainer):
    """Trainer that fits sentence embeddings with a cosine-similarity MSE loss.

    NOTE: despite the class name, this implements the regression objective:
    the cosine similarity of the two sentence embeddings is regressed onto the
    normalized similarity score with mean squared error.

    Batches are expected as ``(sentence_features, scores)``, where
    ``sentence_features`` holds the tokenized inputs of the first and second
    sentences and ``scores`` is a float tensor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.MSE = nn.MSELoss()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cosine-similarity loss for one batch.

        Args:
            model: Model being trained; called once per sentence side and
                expected to return a mapping with ``'sentence_embedding'``.
            inputs: ``(sentence_features, scores)`` pair from the collator.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it to ``compute_loss``; unused here.

        Returns:
            The scalar loss, or ``(loss, cosine_similarities)`` when
            ``return_outputs`` is true.
        """
        features, score = inputs

        # Embed sentence 1 and sentence 2 with the model handed in by the
        # Trainer rather than a module-level global.
        embeddings = [model(**input_data)['sentence_embedding'] for input_data in features]

        # Cosine similarity between the two sentence embeddings.
        outputs = torch.cosine_similarity(embeddings[0], embeddings[1])

        # Normalize the label scores: 0 ~ 5 => 0 ~ 1.
        score = score / 5

        loss = self.MSE(outputs, score.view(-1))

        return (loss, outputs) if return_outputs else loss



# Training configuration for the sentence-pair similarity run.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=4,
    logging_steps=10,
    # NOTE(review): no eval_dataset is passed to the trainer below, so
    # eval_steps presumably has no effect here - confirm.
    eval_steps=100,
    num_train_epochs=2,
    # Keep the raw 'sen1'/'sen2'/'score' columns, which the collator reads.
    remove_unused_columns=False
    )

# Despite its name, Classification_model_trainer computes the
# cosine-similarity MSE loss used for the similarity scores.
trainer = Classification_model_trainer(model = model_with_pooling, train_dataset=train_data_set,args=training_args,data_collator=smart_batching_collate)

trainer.train()

### Classification 데이터에 적합한 학습 구조 

<img src='img/SBERT_SoftmaxLoss.png' alt='softmax loss' width='300px'>

#### KorNLI 불러오기

In [386]:
import pandas as pd

# Read the raw KorNLI training split. The encoding is given explicitly because
# the file holds Korean text and the platform default is not always UTF-8.
with open('data/KorNLI/snli_1.0_train.ko.tsv', encoding='utf-8') as f:
    v = f.readlines()

# Split every line into its tab-separated fields; the first row is the header.
lst = [i.rstrip('\n').split('\t') for i in v]

# Use the header row itself (a flat list of names) as the column labels,
# then switch to the short names used by the collator.
data = pd.DataFrame(lst[1:], columns=lst[0])
data.columns = ['sen1', 'sen2', 'gold_label']
data.head(3)

Unnamed: 0,sen1,sen2,gold_label
0,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 경쟁을 위해 말을 훈련시키고 있다.,neutral
1,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 식당에서 오믈렛을 주문하고 있다.,contradiction
2,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,사람은 야외에서 말을 타고 있다.,entailment


#### gold_label 인코딩

In [387]:
# Integer class id for each NLI label string.
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}

# Map each label string to its class id. Series.map performs a plain dictionary
# lookup and does not depend on replace()'s dtype downcasting.
data['gold_label'] = data['gold_label'].map(label2int)

data.head(3)

Unnamed: 0,sen1,sen2,gold_label
0,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 경쟁을 위해 말을 훈련시키고 있다.,2
1,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,한 사람이 식당에서 오믈렛을 주문하고 있다.,0
2,말을 탄 사람이 고장난 비행기 위로 뛰어오른다.,사람은 야외에서 말을 타고 있다.,1


### collator 구현

In [392]:
from torch.utils.data import DataLoader

def smart_batching_collate(batch):
    """Collate sentence-pair examples into tokenized inputs and class labels.

    :param batch:
        list of example dicts, each holding the keys ``'sen1'``, ``'sen2'``
        and ``'gold_label'``
    :return:
        ``(sentence_features, labels)``: ``sentence_features`` is a two-element
        list with the tokenizer output for all first sentences and for all
        second sentences; ``labels`` is an integer tensor with one class id
        per example
    :raises KeyError:
        if an example lacks one of the expected keys
    """
    # Direct key access keeps the three lists aligned; a missing key fails
    # loudly instead of silently shifting sentences against their labels.
    first_sentences = [example['sen1'] for example in batch]
    second_sentences = [example['sen2'] for example in batch]
    labels = torch.tensor([int(example['gold_label']) for example in batch])

    # Tokenize each side of the pair separately (module-level `tokenizer`).
    sentence_features = [
        tokenizer(sentences, return_tensors='pt', truncation=True, padding=True)
        for sentences in (first_sentences, second_sentences)
    ]

    return sentence_features, labels

In [393]:
from datasets import Dataset

# Convert the label-encoded frame into a `datasets.Dataset` for the Trainer.
train_data_set = Dataset.from_pandas(data)

### Custom Trainer 생성하기


In [None]:
class Regression_model_trainer(Trainer):
    """Trainer that fits sentence embeddings with a softmax classification loss.

    NOTE: despite the class name, this implements the classification
    objective: the two sentence embeddings ``u`` and ``v`` are concatenated
    with ``|u - v|``, fed to a linear head with three outputs and trained with
    cross entropy.

    Batches are expected as ``(sentence_features, labels)``, where
    ``sentence_features`` holds the tokenized inputs of the first and second
    sentences and ``labels`` is an integer class-id tensor.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        sentence_embedding_dimension = self.model.model.config.hidden_size
        num_vectors_concatenated = 3  # u, v and |u - v|

        classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, 3)
        # Keep the head on the same device as the encoder weights.
        classifier.to(next(self.model.parameters()).device)
        # Register the head on the model so its weights are part of
        # model.parameters(); a layer held only by the trainer is not among
        # the model's parameters and does not follow it across devices.
        self.model.add_module('classifier', classifier)
        self.classifier = classifier  # same module, kept for existing callers
        self.loss_fct = nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the softmax classification loss for one batch.

        Args:
            model: Model being trained; called once per sentence side and
                expected to return a mapping with ``'sentence_embedding'``.
            inputs: ``(sentence_features, labels)`` pair from the collator.
            return_outputs: When true, return ``(loss, logits)``.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it to ``compute_loss``; unused here.

        Returns:
            The scalar loss, or ``(loss, logits)`` when ``return_outputs`` is
            true. When no labels are given, ``(embeddings, logits)``.
        """
        features, score = inputs

        # Embed sentence 1 and sentence 2 with the model handed in by the
        # Trainer rather than a module-level global.
        embeddings = [model(**input_data)['sentence_embedding'] for input_data in features]
        rep_a, rep_b = embeddings

        # Classifier input: (u, v, |u - v|) concatenated along the feature axis.
        concatenated = torch.cat([rep_a, rep_b, torch.abs(rep_a - rep_b)], 1)
        outputs = self.classifier(concatenated)

        if score is None:
            return embeddings, outputs

        loss = self.loss_fct(outputs, score.view(-1))
        return (loss, outputs) if return_outputs else loss


# Training configuration for the NLI classification run.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=4,
    logging_steps=10,
    # NOTE(review): no eval_dataset is passed to the trainer below, so
    # eval_steps presumably has no effect here - confirm.
    eval_steps=100,
    num_train_epochs=2,
    # Keep the raw 'sen1'/'sen2'/'gold_label' columns, which the collator reads.
    remove_unused_columns=False
    )

# Despite its name, Regression_model_trainer computes the cross-entropy
# (softmax) loss over the three NLI classes.
trainer = Regression_model_trainer(model = model_with_pooling, train_dataset=train_data_set,args=training_args,data_collator=smart_batching_collate)

trainer.train()