<a href="https://colab.research.google.com/github/superspray/KOR_DA_2021/blob/main/KOR_DA_2021_DA_cMLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 조건부 MLM 기반의 데이터 증강 (DA-cMLM)




*   사전학습 언어모형에 대해 MLM 과제를 학습시켜 증강에 활용함
*   이때 학습 데이터를 클래스별로 분리하여 각각 모형을 학습시킴으로써 Finetuning 과정에서 기존 데이터의 클래스 정보를 반영할 수 있도록 함
*   분류 성능을 평가하기 위한 지표로는 Micro-F1 및 Macro-F1 점수를 사용함





In [None]:
# Install HuggingFace transformers and the other packages this notebook uses,
# then download the datasets.
!pip install transformers
!pip install git+https://github.com/ssut/py-hanspell.git
!pip install konlpy
!pip install selenium
!pip install scikit-multilearn --upgrade

# NSMC rating files (test and train splits) from the e9t/nsmc repository.
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
# Additional corpora cloned from GitHub: korean-hate-speech and Toxic_comment_data.
!git clone https://github.com/kocohub/korean-hate-speech
!git clone https://github.com/songys/Toxic_comment_data


import pandas as pd
import torch
import tensorflow as tf
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from transformers import ElectraForMaskedLM, BertForMaskedLM
import random
from tqdm.notebook import tqdm
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '/content/drive/MyDrive/논문/util/')
import train_final as tr_final

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertForPreTraining, BertPreTrainedModel, BertModel, BertConfig, BertForMaskedLM, BertForSequenceClassification
from sklearn.utils import shuffle
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from transformers import ElectraForMaskedLM, BertForMaskedLM
import numpy as np
from tqdm.notebook import tqdm
from tqdm import tqdm

from utils import *
from bert_cls import train_model
from bert_cls import evaluate

# Use the GPU when one is available; otherwise fall back to the CPU so the
# later `.to(device)` calls do not fail on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Options needed to run selenium's Chrome driver inside Colab.
# NOTE(review): `webdriver` is not imported in the visible cells; presumably it
# is re-exported by `from utils import *` — confirm, or add
# `from selenium import webdriver` to the import cell.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

In [None]:
# Tokenizer used to turn the punctuation string below into vocabulary ids.
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
punct = "/-'?!.,#$%\'()*+-/:;<=>@&*[\\]^_`{|}~"
inputs = tokenizer(
    punct,
    return_tensors='pt',
    truncation=True,
    max_length=256,
    # `padding='max_length'` is the current spelling of the deprecated
    # `pad_to_max_length=True`; both pad the sequence up to `max_length`.
    padding='max_length',
    add_special_tokens=True,
)
# print("Tokens (str) : {}".format([tokenizer.convert_ids_to_tokens(s) for s in inputs['input_ids'].tolist()[0]]))

# Keep only the first 37 ids of the padded sequence.
# NOTE(review): 37 presumably covers the leading special token plus the
# punctuation pieces for this tokenizer — confirm against the debug print above.
punct_ids = inputs['input_ids'][0][:37].tolist()
# NOTE(review): id 1 is added explicitly; presumably a special token such as
# [UNK] in this vocabulary — confirm.
punct_ids.append(1)


def sampling_func(data, sample_pct, seed = 123):
    """Return a shuffled random subset of the rows of ``data``.

    Args:
        data: pandas DataFrame to sample from. It is left unmodified.
        sample_pct: sample size. Values ``<= 1`` are read as a fraction of the
            number of rows (``1`` keeps every row); values ``> 1`` are read as
            an absolute row count and truncated to ``int``.
        seed: ``random_state`` forwarded to ``shuffle`` so the draw is
            reproducible.

    Returns:
        The sampled rows in shuffled order, carrying index labels from a fresh
        0-based index of ``data``.
    """
    # Re-index a copy rather than the argument itself: callers pass filtered
    # views of their own frames, and an in-place reset would modify (or warn
    # about modifying) data they still own.
    data = data.reset_index(drop=True)

    if sample_pct <= 1:
        sample_n = int(len(data) * sample_pct)
    else:
        sample_n = int(sample_pct)

    return shuffle(data, random_state=seed, n_samples=sample_n)

class NSMCDataset(Dataset):
  """Dataset holding a per-class random sample of a labelled text table.

  The source table is either the DataFrame passed as ``data_file``
  (``full=True``) or the ``id``, ``train_doc`` and ``label`` columns of the CSV
  at ``csv_file`` with rows containing missing values dropped (``full=False``).
  Rows with a duplicated ``train_doc`` value are removed, then every class
  found in the ``label`` column is sampled separately with ``sampling_func``
  and the samples are stacked into ``self.dataset``.

  Args:
      data_file: DataFrame used when ``full`` is true. It is not modified.
      label: name of the label column.
      sample_seed: seed forwarded to ``sampling_func``.
      train_doc: name of the text column used for de-duplication.
      csv_file: CSV path read when ``full`` is false.
      full: choose between ``data_file`` and ``csv_file``.
      data_sep: separator passed to ``pd.read_csv``.
      data_ratio: sample size handed to ``sampling_func`` for each class; when
          ``toxic`` is true it is instead the column of the module-level
          ``toxic_size`` table holding the per-class sample count.
      mask_ratio, seed, label_text, punct_ids, prepend: stored on the instance;
          not used by the methods defined in this class.
      toxic: look up per-class sample counts in ``toxic_size``.
  """

  def __init__(self, data_file, label, sample_seed = 123, train_doc = 'document', csv_file = None, full = True, data_sep=',',
               data_ratio = 1, mask_ratio=0.2, seed = 100, label_text = ['부정', '긍정'], punct_ids = punct_ids,
               prepend = False, toxic = False):
    self.data_ratio = data_ratio
    if full:
        source = data_file
    else:
        source = pd.read_csv(csv_file, sep=data_sep).loc[:, ["id", train_doc, label]].dropna(axis=0)
    # De-duplicate into a new frame; an in-place drop would silently remove
    # rows from the DataFrame the caller passed in.
    self.data = source.drop_duplicates(subset=[train_doc])
    self.label = label
    self.train_doc = train_doc

    # Sample each class on its own so every class is represented.
    per_class = []
    for lab in self.data[self.label].unique():
        if toxic:
            # `toxic_size` is a module-level table defined outside this class:
            # one row per class (`cls`), one column per data ratio.
            n_sample = int(toxic_size[toxic_size.cls == lab][data_ratio].item())
        else:
            n_sample = data_ratio
        per_class.append(sampling_func(self.data[self.data[self.label] == lab], n_sample, sample_seed))

    # Concatenate once; fall back to an empty frame when there were no classes.
    if per_class:
        self.dataset = pd.concat(per_class, axis=0).reset_index(drop=True)
    else:
        self.dataset = pd.DataFrame()

    self.seed = seed
    self.mask_ratio = mask_ratio
    self.label_text = label_text
    self.punct_ids = punct_ids
    self.prepend = prepend

  def __len__(self):
    """Return the number of sampled rows."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return the ``id`` column value of the sampled row at position ``idx``."""
    row = self.dataset.iloc[idx, :]
    return row['id']

## NSMC Dataset - dataloader & 모형 학습 및 성능 측정

In [None]:
# training

def train_model1(model, path, device, train_dataloader, val_dataloader=None, epochs=4, evaluation=False, patience = 5):
    """Fine-tune ``model`` on ``train_dataloader`` and return it.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``. Labels
    are cast to float before being passed to the model, and the loss used for
    back-propagation is the first element of the model output.

    Args:
        model: model called as ``model(input_ids, attention_mask=..., labels=...)``.
        path: checkpoint path given to ``EarlyStopping`` and reloaded at the end.
        device: device every batch tensor is moved to.
        train_dataloader: iterable of training batches.
        val_dataloader: batches passed to ``evaluate1`` when ``evaluation`` is set.
        epochs: maximum number of epochs.
        evaluation: when true, validate after every epoch, apply early stopping
            on the validation loss and reload the checkpoint before returning.
        patience: patience forwarded to ``EarlyStopping``.

    Returns:
        The trained model. With ``evaluation`` enabled, its weights are those
        loaded from ``path``; otherwise they are the weights after the last epoch.
    """
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # The linear schedule needs the total number of optimizer steps up front.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    print("Start training...\n")

    early_stopping = EarlyStopping(patience = patience, verbose = True, path = path)

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch = time.time()
        total_loss = 0

        model.train()

        for batch in train_dataloader:
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            outputs = model(b_input_ids,
                            attention_mask=b_attn_mask,
                            labels=b_labels.float())
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update the weights from the gradients, then advance the schedule.
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        # ========================================
        #               Validation
        # ========================================
        if evaluation:
            # The "Val Acc" column shows the second value returned by evaluate1.
            val_loss, val_accuracy = evaluate1(model, val_dataloader, device)[:2]

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        else:
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {'-':^10}| {'-':^10}| {time_elapsed:^9.2f}")
            print("-"*70)

        print("\n")

        # Early stopping needs a validation loss, which only exists when
        # evaluation is enabled; without this guard `val_loss` is undefined.
        if evaluation:
            early_stopping(val_loss, model)

            if early_stopping.early_stop:
                print("Early stopping")
                break

    # Reload the checkpoint only when this run could have produced one
    # (presumably written by EarlyStopping at `path` — confirm). Without
    # evaluation, loading `path` would either fail or overwrite the freshly
    # trained weights with a file left over from an earlier run.
    if evaluation:
        model.load_state_dict(torch.load(path))

    print("Training complete!")
    return model


def evaluate1(model, val_dataloader, device, test_mode = False):
    """Evaluate ``model`` on ``val_dataloader`` with thresholded sigmoid outputs.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``. The
    loss is the mean squared error between the flattened raw logits and the
    flattened labels. Predictions are ``sigmoid(logit) >= 0.5``; logits and
    labels are flattened before scoring, so every label position counts as one
    binary decision.

    NOTE(review): the loss is an MSE on raw logits, which need not match the
    loss the model computes during training — confirm this is intended.

    Args:
        model: model called as ``model(input_ids, attention_mask)``; its first
            output is taken as the logits.
        val_dataloader: iterable of evaluation batches.
        device: device every batch tensor is moved to.
        test_mode: when true, also print a classification report and both F1
            scores.

    Returns:
        ``(mean batch loss, micro-F1, macro-F1)``.
    """
    model.eval()

    batch_losses = []
    targets = []
    outputs = []

    loss_fn = nn.MSELoss()

    for batch in val_dataloader:
        # Load batch onto the target device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)[0].cpu()

        flat_labels = b_labels.cpu().reshape(-1)

        loss = loss_fn(logits.reshape(-1), flat_labels)
        batch_losses.append(loss.item())

        # torch.sigmoid is numerically stable; exp(x) / (1 + exp(x)) overflows
        # to inf / inf = NaN for large logits, which then compares as False.
        preds = torch.sigmoid(logits).detach().numpy() >= 0.5

        outputs.append(preds.reshape(-1))
        targets.append(flat_labels.numpy())

    outputs = np.concatenate(outputs)
    targets = np.concatenate(targets)

    val_loss = np.mean(batch_losses)

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
    # the classification report shows precision and recall exchanged and
    # support counted on the predictions.
    micro_f1 = metrics.f1_score(targets, outputs, average="micro")
    macro_f1 = metrics.f1_score(targets, outputs, average="macro")

    if test_mode:
        print(metrics.classification_report(targets, outputs, digits=3))
        print("Micro F1 score: {0:.3f}".format(micro_f1))
        print("Macro F1 score: {0:.3f}".format(macro_f1))

    return val_loss, micro_f1, macro_f1


def final1(train, test, path, model, device, train_label= 'document', test_label='comments', epochs = 2, data_type = "beep_hate"):
    """Build the data loaders, train with ``train_model1`` and score the test set.

    Args:
        train, test: training and test data forwarded to ``data_loader1``.
        path: checkpoint path forwarded to ``train_model1``.
        model: model to fine-tune.
        device: device used for training and evaluation.
        train_label, test_label, data_type: forwarded to ``data_loader1``.
        epochs: maximum number of training epochs.

    Returns:
        The tuple returned by ``evaluate1`` on the test loader:
        ``(loss, micro_f1, macro_f1)``.
    """
    # Only the three loaders are needed; the first two returned values are unused.
    _, _, train_dataloader, validation_dataloader, test_dataloader = data_loader1(
        train, test, train_label=train_label, test_label=test_label, data_type=data_type)

    set_seed(42)
    t0 = time.time()
    model_trained = train_model1(model, path, device, train_dataloader, validation_dataloader,
                                 epochs=epochs, evaluation=True)
    test_result = evaluate1(model_trained, test_dataloader, device, test_mode=True)

    print("Loss: {0:.3f}".format(test_result[0]))
    # The timer starts before training, so this covers training plus testing.
    print("Test took: {:}".format(format_time(time.time() - t0)))

    return test_result  # val_loss, micro_f1, macro_f1





## BEEP Dataset - dataloader & 모형 학습 및 성능 측정


In [None]:
# For BEEP
def train_model2(model, path, device, train_dataloader, val_dataloader=None, epochs=4, evaluation=False, patience = 5):
    """Fine-tune ``model`` on ``train_dataloader`` (BEEP variant) and return it.

    Each batch must unpack into ``(input_ids, attention_mask, labels)``. Labels
    are cast to ``torch.float64`` before being passed to the model, and the
    loss used for back-propagation is the first element of the model output.

    Args:
        model: model called as ``model(input_ids, attention_mask=..., labels=...)``.
        path: checkpoint path given to ``EarlyStopping`` and reloaded at the end.
        device: device every batch tensor is moved to.
        train_dataloader: iterable of training batches.
        val_dataloader: batches passed to ``evaluate2`` when ``evaluation`` is set.
        epochs: maximum number of epochs.
        evaluation: when true, validate after every epoch, apply early stopping
            on the validation loss and reload the checkpoint before returning.
        patience: patience forwarded to ``EarlyStopping``.

    Returns:
        The trained model. With ``evaluation`` enabled, its weights are those
        loaded from ``path``; otherwise they are the weights after the last epoch.
    """
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

    # The linear schedule needs the total number of optimizer steps up front.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    print("Start training...\n")

    early_stopping = EarlyStopping(patience = patience, verbose = True, path = path)

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch = time.time()
        total_loss = 0

        model.train()

        for batch in train_dataloader:
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            outputs = model(b_input_ids,
                            attention_mask=b_attn_mask,
                            labels=b_labels.to(torch.float64))
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()

            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        # ========================================
        #               Validation
        # ========================================
        if evaluation:
            # The "Val Acc" column shows the second value returned by evaluate2.
            val_loss, val_accuracy = evaluate2(model, val_dataloader, device)[:2]

            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        else:
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {'-':^10}| {'-':^10}| {time_elapsed:^9.2f}")
            print("-"*70)

        print("\n")

        # Early stopping needs a validation loss, which only exists when
        # evaluation is enabled; without this guard `val_loss` is undefined.
        if evaluation:
            early_stopping(val_loss, model)

            if early_stopping.early_stop:
                print("Early stopping")
                break

    # Reload the checkpoint only when this run could have produced one
    # (presumably written by EarlyStopping at `path` — confirm). Without
    # evaluation, loading `path` would either fail or overwrite the freshly
    # trained weights with a file left over from an earlier run.
    if evaluation:
        model.load_state_dict(torch.load(path))

    print("Training complete!")
    return model


def evaluate2(model, val_dataloader, device, test_mode = False):
    """Evaluate ``model`` on ``val_dataloader`` as a single-label classifier.

    Each batch must unpack into ``(input_ids, attention_mask, labels)`` where
    ``labels`` has one column per class; the true class is taken as the argmax
    over dimension 1 (assumes one-hot labels — TODO confirm). Predictions are
    the argmax of the logits over dimension 1.

    Args:
        model: model called as ``model(input_ids, attention_mask)``; its first
            output is taken as the logits.
        val_dataloader: iterable of evaluation batches.
        device: device every batch tensor is moved to.
        test_mode: when true, also print a classification report and both F1
            scores.

    Returns:
        ``(mean batch cross-entropy loss, micro-F1, macro-F1)``.
    """
    model.eval()

    batch_losses = []
    targets = []
    outputs = []

    loss_fn = nn.CrossEntropyLoss()

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)[0]

        # Cross-entropy expects class indices: cast the label matrix to
        # integers, then take the position of the largest entry per row.
        loss_target = torch.argmax(b_labels.to(torch.float64).long(), 1).reshape(-1)
        loss = loss_fn(logits, loss_target)
        batch_losses.append(loss.item())

        preds = torch.argmax(logits.cpu().detach(), 1).reshape(-1)
        # For scoring, the true class is the argmax of the labels as given.
        true_idx = torch.argmax(b_labels.cpu(), 1).reshape(-1)

        outputs.append(preds.numpy())
        targets.append(true_idx.numpy())

    outputs = np.concatenate(outputs)
    targets = np.concatenate(targets)

    val_loss = np.mean(batch_losses)

    # scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
    # the classification report shows precision and recall exchanged and
    # support counted on the predictions.
    micro_f1 = metrics.f1_score(targets, outputs, average="micro")
    macro_f1 = metrics.f1_score(targets, outputs, average="macro")

    if test_mode:
        print(metrics.classification_report(targets, outputs, digits=3))
        print("Micro F1 score: {0:.3f}".format(micro_f1))
        print("Macro F1 score: {0:.3f}".format(macro_f1))

    return val_loss, micro_f1, macro_f1



def final2(train, test, path, model, device, train_label= 'document', test_label='comments', epochs = 2, data_type = "beep_hate"):
    """Build the data loaders, train with ``train_model2`` and score the test set.

    Args:
        train, test: training and test data forwarded to ``data_loader2``.
        path: checkpoint path forwarded to ``train_model2``.
        model: model to fine-tune.
        device: device used for training and evaluation.
        train_label, test_label, data_type: forwarded to ``data_loader2``.
        epochs: maximum number of training epochs.

    Returns:
        The tuple returned by ``evaluate2`` on the test loader:
        ``(loss, micro_f1, macro_f1)``.
    """
    # Only the three loaders are needed; the first two returned values are unused.
    _, _, train_dataloader, validation_dataloader, test_dataloader = data_loader2(
        train, test, train_label=train_label, test_label=test_label, data_type=data_type)

    set_seed(42)
    t0 = time.time()
    model_trained = train_model2(model, path, device, train_dataloader, validation_dataloader,
                                 epochs=epochs, evaluation=True)
    test_result = evaluate2(model_trained, test_dataloader, device, test_mode=True)

    print("Loss: {0:.3f}".format(test_result[0]))
    # The timer starts before training, so this covers training plus testing.
    print("Test took: {:}".format(format_time(time.time() - t0)))

    return test_result  # val_loss, micro_f1, macro_f1

