<a href="https://colab.research.google.com/github/teamgaon/KLUE/blob/main/220227_sm_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 라이브러리, 패키지

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data, model and
# prediction paths used throughout this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Hugging Face datasets / transformers (with the sentencepiece extra) and accelerate.
!pip install datasets transformers[sentencepiece]
!pip install accelerate
# TPU support: cloud-tpu-client plus a torch 1.9.0 / torch_xla 1.9 build (cp37 wheel).
!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# NOTE(review): transformers is already requested by the first install command in
# this cell; this repeat looks redundant — confirm before removing.
!pip install transformers

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[?25l[K     |█                               | 10 kB 24.3 MB/s eta 0:00:01[K     |██                              | 20 kB 10.4 MB/s eta 0:00:01[K     |███▏                            | 30 kB 9.3 MB/s eta 0:00:01[K     |████▏                           | 40 kB 8.5 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 4.6 MB/s eta 0:00:01[K     |██████▎                         | 61 kB 5.4 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 5.5 MB/s eta 0:00:01[K     |████████▍                       | 81 kB 4.2 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 4.7 MB/s eta 0:00:01[K     |██████████▌                     | 102 kB 5.2 MB/s eta 0:00:01[K     |███████████▋                    | 112 kB 5.2 MB/s eta 0:00:01[K     |████████████▋                   | 122 kB 5.2 MB/s eta 0:00:01[K     |█████████████▊                  | 133 kB 5.2 MB/s eta 0:00:01

In [None]:
from accelerate import Accelerator

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random
from tqdm import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from transformers import TrainingArguments, Trainer
from transformers import AutoModel,AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
import gc
from transformers import AdamW
from transformers import get_scheduler, get_cosine_with_hard_restarts_schedule_with_warmup
from tqdm.auto import tqdm
from datasets import load_metric
import warnings
warnings.filterwarnings('ignore')
from accelerate import notebook_launcher

# 데이터

In [None]:
# Root folder on Google Drive that holds every input file of this notebook.
PATH = '/content/drive/MyDrive/KLUE'

# Dacon dataset
train = pd.read_csv(os.path.join(PATH, 'train_data.csv'), encoding='utf-8')
test = pd.read_csv(os.path.join(PATH, 'test_data.csv'), encoding='utf-8')

# Kakao dataset: XNLI Korean dev + test, renamed to the premise/hypothesis/label schema
kakao_dev = pd.read_csv(os.path.join(PATH, 'xnli.dev.ko.tsv'), sep='\t', encoding='utf-8')
kakao_test = pd.read_csv(os.path.join(PATH, 'xnli.test.ko.tsv'), sep='\t', encoding='utf-8')
kakao_dev = pd.concat([kakao_dev, kakao_test]).rename(
    columns={'sentence1': 'premise', 'sentence2': 'hypothesis', 'gold_label': 'label'}
)

# KLUE dataset: keep only the three columns needed and rename the label column.
# rename() returns a new frame, so nothing is modified in place on a column subset.
klue_dev = pd.read_json(os.path.join(PATH, 'klue-nli-v1.1_dev.json'))
klue_dev = klue_dev[['premise', 'hypothesis', 'gold_label']].rename(columns={'gold_label': 'label'})

# Merge the Dacon, KLUE and Kakao datasets
train = pd.concat([train, klue_dev, kakao_dev], axis=0)
train = train.reset_index(drop=True)
# Fill 'index' for every row BEFORE dropna(): rows coming from a source without an
# 'index' column would otherwise hold NaN there and be dropped.
# (presumably the KLUE/Kakao frames have no 'index' column — verify)
train['index'] = train.index
train = train.dropna()
# Renumber after dropping incomplete rows so 'index' is contiguous again.
train = train.reset_index(drop=True)
train['index'] = train.index
train

# 토크나이저, 함수

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; later cells reuse it to load the model config and weights.
checkpoint = 'klue/roberta-large'
# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        pair_dataset: mapping from field name to an indexable tensor whose first
            axis enumerates the examples.
        label: indexable sequence of labels, one per example.
    """

    def __init__(self, pair_dataset, label):
        self.label = label
        self.pair_dataset = pair_dataset

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.label)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every field at `idx` plus 'labels'."""
        sample = {}
        for field, values in self.pair_dataset.items():
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.label[idx])
        return sample

In [None]:
def label_to_num(label):
    """Convert an iterable of label strings into a list of integer class ids.

    Known labels are "entailment" (0), "contradiction" (1), "neutral" (2) and
    "answer" (3). Any other value raises KeyError.
    """
    label_dict = {"entailment": 0, "contradiction": 1, "neutral": 2, "answer": 3}
    return [label_dict[name] for name in label]

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of `X` whose arg-max along dim 1 equals `Y`.

    Args:
        X: 2-D tensor of scores, one row per example.
        Y: tensor of target class ids, one per row of `X`.

    Returns:
        The accuracy as a NumPy scalar.
    """
    predicted = torch.max(X, 1)[1]
    n_rows = predicted.size()[0]
    n_hits = (predicted == Y).sum().data.cpu().numpy()
    return n_hits / n_rows

# 학습

In [None]:
def training_function(num_epochs=5, batch_size=16, learning_rate=1e-5, num_warmup_steps=1):
    """Fine-tune `checkpoint` on the current cross-validation fold and save the model.

    Designed to be handed to `notebook_launcher` without arguments: every
    parameter defaults to the value that used to be hard-coded. In addition to
    its parameters the function reads these notebook globals: `train_dataset`,
    `train_subsampler`, `test_subsampler`, `checkpoint` and `fold`.

    Args:
        num_epochs: number of passes over the training sampler.
        batch_size: batch size for both the training and the evaluation loader.
        learning_rate: AdamW learning rate.
        num_warmup_steps: warm-up steps of the cosine-with-hard-restarts schedule.

    Returns:
        None. Prints the mean per-batch accuracy after each epoch and writes the
        fine-tuned model to '/content/drive/MyDrive/220226/model<fold>'.
    """
    accelerator = Accelerator()

    # Both loaders read the same dataset; the samplers select the fold's rows.
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_subsampler)
    eval_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=test_subsampler)

    config = AutoConfig.from_pretrained(checkpoint)
    config.num_labels = 3
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    num_training_steps = num_epochs * len(train_dataloader)
    # Draw a single progress bar instead of one per launched process.
    progress_bar = tqdm(range(num_training_steps),
                        disable=not accelerator.is_local_main_process)
    lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    for epoch in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        train_batches = 0
        eval_batches = 0

        model.train()
        for batch in train_dataloader:
            # Keyword arguments keep the call independent of the model's
            # positional parameter order.
            # NOTE(review): token_type_ids are passed here but not in the
            # inference cell of this notebook — confirm which one is intended.
            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                token_type_ids=batch['token_type_ids'],
            )
            loss = F.cross_entropy(outputs.logits, batch['labels'])
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            train_acc += calc_accuracy(outputs.logits, batch['labels'])
            train_batches += 1
        # Mean of per-batch accuracies (a smaller last batch weighs the same as a full one).
        # max(..., 1) keeps an empty loader from raising.
        print("epoch {} train acc {}".format(epoch+1, train_acc / max(train_batches, 1)))

        model.eval()
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    token_type_ids=batch['token_type_ids'],
                )

            test_acc += calc_accuracy(outputs.logits, batch['labels'])
            eval_batches += 1
        print("epoch {} test acc {}".format(epoch+1, test_acc / max(eval_batches, 1)))
        gc.collect()

    # Let every process finish before saving the unwrapped model.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained('/content/drive/MyDrive/220226/model' + str(fold), save_function=accelerator.save)

In [None]:
# Fixed seed so the shuffled fold assignment is the same on every run.
SEED = 42
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

print('--------------------------------')

# Tokenize every premise/hypothesis pair once; all folds index into this batch.
tokenized_train = tokenizer(
  list(train['premise']),
  list(train['hypothesis']),
  return_tensors="pt",
  # NOTE(review): the original comment here read "Max_Length = 190" — presumably the
  # longest observed pair; inputs are truncated to 128 tokens regardless.
  max_length=128,
  padding=True,
  truncation=True,
  add_special_tokens=True
  )

# Labels and dataset do not depend on the fold, so build them once instead of per fold.
train_label = label_to_num(train['label'].values)
train_dataset = BERTDataset(tokenized_train, train_label)

for fold, (train_ids, test_ids) in enumerate(kfold.split(train, train['label'])):
  print(f'FOLD {fold}')

  # training_function reads these samplers, `train_dataset` and `fold` as globals.
  train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
  test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)

  notebook_launcher(training_function)

# 추론

In [None]:
def num_to_label(label):
    """Turn a sequence of class ids into [[position, label_name], ...] rows.

    Ids map as 0 -> "entailment", 1 -> "contradiction", 2 -> "neutral";
    any other id raises KeyError.
    """
    label_dict = {0: "entailment", 1: "contradiction", 2: "neutral"}
    return [[i, label_dict[v]] for i, v in enumerate(label)]


tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The test labels are only converted so that BERTDataset can be built; they are not
# used by the prediction loop below.
test_label = label_to_num(test['label'].values)

tokenized_test = tokenizer(
    list(test['premise']),
    list(test['hypothesis']),
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

test_dataset = BERTDataset(tokenized_test, test_label)

# shuffle=False keeps predictions in the row order of `test`.
dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

for fold in range(5):

  model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/220226/model' + str(fold), num_labels=3)
  model.resize_token_embeddings(tokenizer.vocab_size)
  # A fresh Accelerator per fold, as before, so the previous fold's model is not
  # kept referenced by a long-lived accelerator object.
  accelerator = Accelerator()

  output_pred = []
  output_prob = []

  # Bind the prepared loader to its own name: rebinding `dataloader` would wrap the
  # already-prepared loader again on every subsequent fold.
  model, fold_dataloader = accelerator.prepare(model, dataloader)

  model.eval()

  for data in tqdm(fold_dataloader):
      with torch.no_grad():
          # NOTE(review): token_type_ids are not passed here although the training
          # function passes them — confirm which one is intended.
          outputs = model(
              input_ids=data['input_ids'],
              attention_mask=data['attention_mask']
          )
      logits = outputs[0]
      prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
      logits = logits.detach().cpu().numpy()
      result = np.argmax(logits, axis=-1)
      output_pred.append(result)
      output_prob.append(prob)

  pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

  answer = num_to_label(pred_answer)

  df_label = pd.DataFrame(answer, columns=['index', 'label'])
  df_prob = pd.DataFrame(output_prob)

  # One label file and one probability file per fold; the voting cells read them back.
  df_label.to_csv('/content/drive/MyDrive/220226/pred_label'+str(fold)+'.csv', index=False)
  df_prob.to_csv('/content/drive/MyDrive/220226/pred_prob'+str(fold)+'.csv', index=False)

## softvoting

In [None]:
# Read the per-fold class-probability tables (pred_prob0.csv ... pred_prob4.csv).
pred0, pred1, pred2, pred3, pred4 = [
    pd.read_csv('/content/drive/MyDrive/220226/pred_prob' + str(fold_no) + '.csv')
    for fold_no in range(5)
]

In [None]:
# Soft voting: element-wise mean of the five folds' probability tables.
prob_total = np.array(pred0)
for fold_prob in (pred1, pred2, pred3, pred4):
    prob_total = prob_total + np.array(fold_prob)
pred = pd.DataFrame(prob_total / 5)

In [None]:
# Re-read the test set from disk, replacing whatever `test` currently holds.
test = pd.read_csv(os.path.join(PATH, 'test_data.csv'), encoding='utf-8')

In [None]:
# Put the averaged probability columns next to the test rows (joined on the row index).
# Later cells read them as columns 0, 1 and 2.
test = pd.concat([test, pred], axis=1)

In [None]:
# 0: "entailment", 1: "contradiction", 2: "neutral"
# Label every row whose class probability exceeds 0.1. The order matters: a later
# assignment overwrites an earlier one, so contradiction wins over entailment,
# which wins over neutral.
for prob_col, class_name in ((2, 'neutral'), (0, 'entailment'), (1, 'contradiction')):
    test.loc[test[prob_col] > 0.1, 'label'] = class_name

In [None]:
# Rows where all three class probabilities exceed 0.1: choose between entailment
# and contradiction by whichever of the two has the larger probability
# (contradiction when they are equal).
all_three = (test[0] > 0.1) & (test[1] > 0.1) & (test[2] > 0.1)
contested_rows = test[all_three].index
my_label = np.where(
    test.loc[contested_rows, 0] > test.loc[contested_rows, 1],
    'entailment',
    'contradiction',
).tolist()

test.loc[contested_rows, 'label'] = my_label

In [None]:
# Rows where both the entailment and the contradiction probability exceed 0.1:
# keep whichever of the two is larger (contradiction when they are equal).
ent_and_con = (test[0] > 0.1) & (test[1] > 0.1)
contested_rows = test[ent_and_con].index
my_label = np.where(
    test.loc[contested_rows, 0] > test.loc[contested_rows, 1],
    'entailment',
    'contradiction',
).tolist()

test.loc[contested_rows, 'label'] = my_label

In [None]:
# Rows labelled entailment whose neutral probability exceeds 0.7 become neutral.
strong_neutral = (test['label'] == 'entailment') & (test[2] > 0.7)
test.loc[test[strong_neutral].index, 'label'] = 'neutral'

In [None]:
test['label'].value_counts()

In [None]:
# Load the submission template; its 'label' column is filled in next.
submission = pd.read_csv('/content/drive/MyDrive/KLUE/sample_submission.csv')

In [None]:
# Copy the soft-voting labels into the template (pandas aligns the two on the row index).
submission['label'] = test['label']

In [None]:
submission

In [None]:
# Write the soft-voting submission; the combined soft + hard section reads this file back.
submission.to_csv('/content/drive/MyDrive/220226/submission_soft.csv', index=False)

## hardvoting

In [None]:
# Read the per-fold predicted-label tables (pred_label0.csv ... pred_label4.csv).
pred0, pred1, pred2, pred3, pred4 = [
    pd.read_csv('/content/drive/MyDrive/220226/pred_label' + str(fold_no) + '.csv')
    for fold_no in range(5)
]

In [None]:
# One column per fold ('label0' ... 'label4') holding that fold's predicted label.
for fold_no, fold_pred in enumerate((pred0, pred1, pred2, pred3, pred4)):
    test['label' + str(fold_no)] = fold_pred['label']

In [None]:
# `df` is a second name for the same DataFrame object as `test` (no copy is made),
# so columns assigned through `df` in the following cells also appear on `test`.
df = test
df

In [None]:
def label_count(df):
  """Count the votes of the five fold columns for one row.

  Args:
    df: a row-like mapping with keys 'label0' ... 'label4'.

  Returns:
    [num_neutral, num_contradiction, num_entailment]
  """
  votes = [df[col] for col in ('label0', 'label1', 'label2', 'label3', 'label4')]
  return [
      sum(1 for vote in votes if vote == class_name)
      for class_name in ('neutral', 'contradiction', 'entailment')
  ]

# Per-row vote counts as a list [neutral, contradiction, entailment].
temp = df.apply(label_count, axis=1)
df['temp'] = temp


def list_to_num(list: list, position: int = 2):
  """Return the element of `list` at `position`.

  Replaces three same-named definitions that each hard-coded one position.
  The default of 2 matches the definition that used to be the last one left
  in scope.
  """
  return list[position]


# Split the count list into one integer column per class, in the same order as the list.
for position, column in enumerate(['neutral', 'contradiction', 'entailment']):
  df[column] = df['temp'].map(lambda counts, position=position: list_to_num(counts, position))

def voting(df):
  """Return the first class (checked as neutral, contradiction, entailment) whose
  count in `df` is greater than 2; fall back to 'neutral' when none is."""
  candidates = ('neutral', 'contradiction', 'entailment')
  return next((name for name in candidates if df[name] > 2), 'neutral')

# Majority label per row (overwrites the existing 'label' column).
df['label'] = df.apply(voting, axis=1)

In [None]:
def make_label(list:list):
  """Map specific count patterns to an override label, else return 'answer'.

  The patterns are compared with `==` against the whole list. They look like
  [neutral, contradiction, entailment] vote counts — verify against the caller.
  """
  overrides = (
      ([3, 2, 0], 'contradiction'),
      ([3, 0, 2], 'entailment'),
      ([4, 1, 0], 'contradiction'),
      ([4, 0, 1], 'entailment'),
  )
  for pattern, verdict in overrides:
    if list == pattern:
      return verdict
  return 'answer'

# Override label per row; 'answer' marks rows for which make_label has no override.
my_label = df['temp'].map(make_label)

In [None]:
# Keep the override labels as a column so rows can be filtered on them.
df['my_label'] = my_label

In [None]:
# Rows that received an actual override (anything other than 'answer').
my_answer = df[df['my_label'] != 'answer']

In [None]:
# Replace the 'label' value with the override on exactly those rows.
df.loc[my_answer.index, 'label'] = my_answer['my_label']

In [None]:
df['label'].value_counts()

In [None]:
# Load a fresh copy of the submission template for the hard-voting result.
submission = pd.read_csv('/content/drive/MyDrive/KLUE/sample_submission.csv')

In [None]:
# Copy the hard-voting labels into the template (pandas aligns the two on the row index).
submission['label'] = df['label']

In [None]:
submission

In [None]:
# Write the hard-voting submission; the combined soft + hard section reads this file back.
submission.to_csv('/content/drive/MyDrive/220226/submission_hard.csv', index=False)

## soft + hard voting

In [None]:
# Read back the two submissions saved by the soft-voting and hard-voting sections.
soft = pd.read_csv('/content/drive/MyDrive/220226/submission_soft.csv')
hard = pd.read_csv('/content/drive/MyDrive/220226/submission_hard.csv')

In [None]:
# Keep only the columns needed to reconcile the two votes. Take an explicit copy so
# the column assignments in the next cells write to an independent frame instead of
# a slice of `test` (avoids pandas' SettingWithCopyWarning).
df = test[['premise', 'hypothesis', 'label', 0, 1, 2, 'label0', 'label1', 'label2', 'label3', 'label4']].copy()

In [None]:
# Attach the soft-voting and hard-voting labels as columns (aligned on the row index).
df['soft'] = soft['label']
df['hard'] = hard['label']

In [None]:
df['soft'].value_counts()

neutral          573
entailment       561
contradiction    532
Name: soft, dtype: int64

In [None]:
df['hard'].value_counts()

entailment       568
neutral          555
contradiction    543
Name: hard, dtype: int64

In [None]:
# Where the two votes disagree and soft voting says 'neutral', take the soft label.
disagree_soft_neutral = (df['hard'] != df['soft']) & (df['soft'] == 'neutral')
soft_rows = df[disagree_soft_neutral]
df.loc[soft_rows.index, 'label'] = soft_rows['soft']

In [None]:
# Where the two votes disagree and soft voting is not 'neutral', take the hard label.
disagree_soft_other = (df['hard'] != df['soft']) & (df['soft'] != 'neutral')
hard_rows = df[disagree_soft_other]
df.loc[hard_rows.index, 'label'] = hard_rows['hard']

In [None]:
df['label'].value_counts()

neutral          576
entailment       563
contradiction    527
Name: label, dtype: int64

In [None]:
# Load a fresh copy of the submission template for the combined soft + hard result.
submission = pd.read_csv('/content/drive/MyDrive/KLUE/sample_submission.csv')

In [None]:
# Copy the reconciled labels into the template (pandas aligns the two on the row index).
submission['label'] = df['label']

In [None]:
submission

Unnamed: 0,index,label
0,0,contradiction
1,1,neutral
2,2,entailment
3,3,contradiction
4,4,contradiction
...,...,...
1661,1661,neutral
1662,1662,entailment
1663,1663,neutral
1664,1664,neutral


In [None]:
# Write the final soft + hard voting submission.
submission.to_csv('/content/drive/MyDrive/220226/submission_soft_hard.csv', index=False)