<a href="https://colab.research.google.com/github/yongjulee0213/SentenceClassifier/blob/main/klue_robert_base_10_acc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 62.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 63.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [92]:
import pandas as pd
import numpy as np
import torch
import os
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from tqdm import tqdm

# for graphing
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
train_original = pd.read_csv('/train.csv')
train_original.drop(columns=['ID'], inplace=True)
test = pd.read_csv('/test.csv')
test.drop(columns=['ID'], inplace=True)
submission = pd.read_csv('/sample_submission.csv')

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

CFG = {
    'EPOCHS':20,
    'LEARNING_RATE':1e-5,
    'BATCH_SIZE':10,
    'SEED':41
}

seed_everything(CFG['SEED']) # Seed 고정
device = torch.device('cuda')

In [5]:
train, val, _, _ = train_test_split(train_original, train_original['label'], test_size=0.2, random_state=CFG['SEED'])
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)


In [218]:
model_nm = 'klue/roberta-base'
base_model = AutoModel.from_pretrained(model_nm)
tokenizer = AutoTokenizer.from_pretrained(model_nm)

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [219]:
class SentenceTypeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, labels=None):
        texts = dataframe['문장'].values.tolist()

        self.texts = [tokenizer(text, padding='max_length', max_length=90, truncation=True, return_tensors='pt') for text in texts]
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]

        if self.labels is not None:
            type_tmp = self.labels['type'][idx]
            polarity_tmp = self.labels['polarity'][idx]
            tense_tmp = self.labels['tense'][idx]
            certainty_tmp = self.labels['certainty'][idx]
            return text, torch.Tensor(type_tmp), torch.Tensor(polarity_tmp), torch.Tensor(tense_tmp), torch.Tensor(certainty_tmp)
        else:
            return text, torch.Tensor([-1,-1,-1,-1]), torch.Tensor([-1,-1,-1]), torch.Tensor([-1,-1,-1]), torch.Tensor([-1,-1])

In [220]:
class SentenceClassifier(nn.Module):#task 연결 역할
    def __init__(self, base_model):
        super().__init__()
        self.klue = base_model # from transformers package

        self.fc1 = nn.Linear(768, 10)#large input1024, output1024
        self.relu = nn.ReLU()

        self.type_clf = nn.Linear(10,4)
        self.polarity_clf = nn.Linear(10,3)
        self.tense_clf = nn.Linear(10,3)
        self.certainty_clf = nn.Linear(10,2)

        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        # input_ids : token's id / attention_mask : make a model to focus on which token
        klue_out = self.klue(input_ids= input_ids, attention_mask = attention_mask)[0][:,0]

        x = self.fc1(klue_out)#일자로 펴서
        x = self.relu(x)#relu 0보다 작으면 0, 크면 1

        type_output = self.type_clf(x)
        type_output = self.softmax(type_output)#확률값

        polarity_output = self.polarity_clf(x)
        polarity_output = self.softmax(polarity_output)

        tense_output = self.tense_clf(x)
        tense_output = self.softmax(tense_output)

        certainty_output = self.certainty_clf(x)
        certainty_output = self.softmax(certainty_output)

        return type_output, polarity_output, tense_output, certainty_output

In [221]:
def acc(pred,target):# 정확도=맞은 것의 개수/전체개수
  count=0
  for i in range(len(pred)):
    if pred[i].index(max(pred[i]))==target[i].index(max(target[i])):#pred의 제일 높은 값의 인덱스와 target의 인덱스가 같다면
      count+=1#개수세기
  return count#배치 하나에서 몇개를 맞췄는지 리턴함

In [222]:
def sentence_train(model, train_dataloader, val_dataloader, model_nm):
    best_val_loss = 99999999999999 # setting max (act as infinity)
    early_stopping_threshold_count = 0

    type_preds, polarity_preds, tense_preds, certainty_preds = [], [], [], []
    type_labels, polarity_labels, tense_labels, certainty_labels = [], [], [], []
    
    criterion_loss = {
        'type' : nn.CrossEntropyLoss().to(device),
        'polarity' : nn.CrossEntropyLoss().to(device),
        'tense' : nn.CrossEntropyLoss().to(device),
        'certainty' : nn.CrossEntropyLoss().to(device)
    }

    optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])
    model = model.to(device)

    for epoch in range(CFG['EPOCHS']):
        total_acc_train = 0
        total_loss_train = 0
        
        model.train() # sets into the training mode
        
        for train_input, type_label, polarity_label, tense_label, certainty_label in tqdm(train_dataloader):
            attention_mask = train_input['attention_mask'].to(device)
            input_ids = train_input['input_ids'].squeeze(1).to(device)
            type_label = type_label.to(device)
            polarity_label = polarity_label.to(device)
            tense_label = tense_label.to(device)
            certainty_label = certainty_label.to(device)

            optimizer.zero_grad()
            
            type_output, polarity_output, tense_output, certainty_output = model(input_ids, attention_mask) # from the forward function
            
            train_loss = 0.25*criterion_loss['type'](type_output, type_label.float()) + \
                   0.25*criterion_loss['polarity'](polarity_output, polarity_label.float()) + \
                   0.25*criterion_loss['tense'](tense_output, tense_label.float()) + \
                   0.25*criterion_loss['certainty'](certainty_output, certainty_label.float())
            total_loss_train +=train_loss.item()

            #정확도 계산 예측 데이터가 동일한 데이터 건수/ 전체 예측 데이터 건수
            t=acc(type_output.cpu().detach().numpy().tolist(), type_label.cpu().cpu().numpy().tolist())
            p=acc(polarity_output.cpu().detach().numpy().tolist(), polarity_label.cpu().cpu().numpy().tolist())
            te=acc(polarity_output.cpu().detach().numpy().tolist(), polarity_label.cpu().cpu().numpy().tolist())
            c=acc(certainty_output.cpu().detach().numpy().tolist(), certainty_label.cpu().cpu().numpy().tolist())
            train_acc=0.25*(t+p+te+c)#전체 가중치 0.25씩 주고 맞은 개수 전부더해줌

            total_acc_train +=train_acc

            train_loss.backward()
            optimizer.step()


        with torch.no_grad(): # since we should not change gradient for validation 
            total_acc_val = 0
            total_loss_val = 0
            
            model.eval() # deactivate training
            
            # same process as the above
            for val_input, vtype_label, vpolarity_label, vtense_label, vcertainty_label in tqdm(val_dataloader):
                attention_mask = val_input['attention_mask'].to(device)
                input_ids = val_input['input_ids'].squeeze(1).to(device)

                vtype_label = vtype_label.to(device)
                vpolarity_label = vpolarity_label.to(device)
                vtense_label = vtense_label.to(device)
                vcertainty_label = vcertainty_label.to(device)
                
                vtype_output, vpolarity_output, vtense_output, vcertainty_output = model(input_ids, attention_mask) # from the forward function

                val_loss = 0.25*criterion_loss['type'](vtype_output, vtype_label.float()) + \
                        0.25*criterion_loss['polarity'](vpolarity_output, vpolarity_label.float()) + \
                        0.25*criterion_loss['tense'](vtense_output, vtense_label.float()) + \
                        0.25*criterion_loss['certainty'](vcertainty_output, vcertainty_label.float())

                total_loss_val += val_loss.item()

                #정확도
                vt=acc(vtype_output.cpu().detach().numpy().tolist(), vtype_label.cpu().cpu().numpy().tolist())
                vp=acc(vpolarity_output.cpu().detach().numpy().tolist(), vpolarity_label.cpu().cpu().numpy().tolist())
                vte=acc(vpolarity_output.cpu().detach().numpy().tolist(), vpolarity_label.cpu().cpu().numpy().tolist())
                vc=acc(vcertainty_output.cpu().detach().numpy().tolist(), vcertainty_label.cpu().cpu().numpy().tolist())
                val_acc=0.25*(vt+vp+vte+vc)#유형별로 가져온 확률값0.25의 가중치를 줌(배치단위로 더해주고)

                total_acc_val +=val_acc#1배치 끝나면 0이 됨

            print(f'Epochs: {epoch + 1} '
                  f'| Train Loss: {total_loss_train / len(train_dataloader): .3f} '
                  f'| Train Accuracy: {total_acc_train/len(train_dataloader.dataset): .3f} '
                  f'| Val Loss: {total_loss_val / len(val_dataloader): .3f} '
                  f'| Val Accuracy: {total_acc_val / len(val_dataloader.dataset): .3f}')#배치단위로 더해준 개수를 전체 데이터셋의 개수로 나누줌
            
            if best_val_loss > total_loss_val:
                best_val_loss = total_loss_val # saving only the best one
                torch.save(model, f"/{model_nm}.pt")
                print("Saved model")
                early_stopping_threshold_count = 0
            else:
                early_stopping_threshold_count += 1 #val loss가 감소하지 않는다면 patience
                
            if early_stopping_threshold_count >= 3: # ==> patience=1
                print("Early stopping")
                break

In [223]:

train_tmp = train[['문장', '유형', '극성', '시제', '확실성']]
train_tmp = pd.get_dummies(train_tmp, columns=['유형', '극성', '시제', '확실성'])#벡터화


train_type = train_tmp.iloc[:,1:5].values.tolist()
train_polarity = train_tmp.iloc[:,5:8].values.tolist()
train_tense = train_tmp.iloc[:,8:11].values.tolist()
train_certainty = train_tmp.iloc[:,11:13].values.tolist()
train_labels = {
    'type': train_type,
    'polarity': train_polarity,
    'tense': train_tense,
    'certainty': train_certainty
}

val_tmp = val[['문장', '유형', '극성', '시제', '확실성']]
val_tmp = pd.get_dummies(val_tmp, columns=['유형', '극성', '시제', '확실성'])

val_type = val_tmp.iloc[:,1:5].values.tolist()
val_polarity = val_tmp.iloc[:,5:8].values.tolist()
val_tense = val_tmp.iloc[:,8:11].values.tolist()
val_certainty = val_tmp.iloc[:,11:13].values.tolist()
val_labels = {
    'type': val_type,
    'polarity': val_polarity,
    'tense': val_tense,
    'certainty': val_certainty
}

In [224]:
train_dataloader = DataLoader(SentenceTypeDataset(train_tmp, tokenizer, train_labels), batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=0) # num_workers: how many subprocesses to use for data loading  
val_dataloader = DataLoader(SentenceTypeDataset(val_tmp, tokenizer, val_labels), batch_size=CFG['BATCH_SIZE'], num_workers=0)

In [225]:
model = SentenceClassifier(base_model)

In [226]:
sentence_train(model, train_dataloader, val_dataloader, 'klue')

100%|██████████| 1324/1324 [03:59<00:00,  5.52it/s]
100%|██████████| 331/331 [00:17<00:00, 18.74it/s]


Epochs: 1 | Train Loss:  0.799 | Train Accuracy:  0.914 | Val Loss:  0.721 | Val Accuracy:  0.914
Saved model


100%|██████████| 1324/1324 [04:01<00:00,  5.49it/s]
100%|██████████| 331/331 [00:17<00:00, 18.51it/s]


Epochs: 2 | Train Loss:  0.687 | Train Accuracy:  0.918 | Val Loss:  0.664 | Val Accuracy:  0.917
Saved model


100%|██████████| 1324/1324 [04:00<00:00,  5.51it/s]
100%|██████████| 331/331 [00:17<00:00, 18.78it/s]


Epochs: 3 | Train Loss:  0.648 | Train Accuracy:  0.924 | Val Loss:  0.643 | Val Accuracy:  0.921
Saved model


100%|██████████| 1324/1324 [03:59<00:00,  5.52it/s]
100%|██████████| 331/331 [00:17<00:00, 18.79it/s]


Epochs: 4 | Train Loss:  0.631 | Train Accuracy:  0.928 | Val Loss:  0.634 | Val Accuracy:  0.925
Saved model


100%|██████████| 1324/1324 [03:59<00:00,  5.52it/s]
100%|██████████| 331/331 [00:17<00:00, 18.89it/s]


Epochs: 5 | Train Loss:  0.622 | Train Accuracy:  0.932 | Val Loss:  0.633 | Val Accuracy:  0.922
Saved model


100%|██████████| 1324/1324 [04:01<00:00,  5.48it/s]
100%|██████████| 331/331 [00:17<00:00, 18.84it/s]


Epochs: 6 | Train Loss:  0.616 | Train Accuracy:  0.934 | Val Loss:  0.629 | Val Accuracy:  0.925
Saved model


100%|██████████| 1324/1324 [04:00<00:00,  5.50it/s]
100%|██████████| 331/331 [00:17<00:00, 18.73it/s]


Epochs: 7 | Train Loss:  0.611 | Train Accuracy:  0.938 | Val Loss:  0.631 | Val Accuracy:  0.924


100%|██████████| 1324/1324 [03:59<00:00,  5.52it/s]
100%|██████████| 331/331 [00:17<00:00, 18.71it/s]


Epochs: 8 | Train Loss:  0.608 | Train Accuracy:  0.940 | Val Loss:  0.629 | Val Accuracy:  0.926
Saved model


100%|██████████| 1324/1324 [03:59<00:00,  5.53it/s]
100%|██████████| 331/331 [00:17<00:00, 18.78it/s]


Epochs: 9 | Train Loss:  0.605 | Train Accuracy:  0.941 | Val Loss:  0.629 | Val Accuracy:  0.925


100%|██████████| 1324/1324 [03:59<00:00,  5.54it/s]
100%|██████████| 331/331 [00:17<00:00, 18.81it/s]


Epochs: 10 | Train Loss:  0.604 | Train Accuracy:  0.941 | Val Loss:  0.627 | Val Accuracy:  0.926
Saved model


100%|██████████| 1324/1324 [03:59<00:00,  5.53it/s]
100%|██████████| 331/331 [00:17<00:00, 18.89it/s]


Epochs: 11 | Train Loss:  0.602 | Train Accuracy:  0.943 | Val Loss:  0.628 | Val Accuracy:  0.926


100%|██████████| 1324/1324 [03:59<00:00,  5.53it/s]
100%|██████████| 331/331 [00:17<00:00, 18.75it/s]


Epochs: 12 | Train Loss:  0.602 | Train Accuracy:  0.943 | Val Loss:  0.627 | Val Accuracy:  0.928
Saved model


100%|██████████| 1324/1324 [03:59<00:00,  5.53it/s]
100%|██████████| 331/331 [00:17<00:00, 18.78it/s]


Epochs: 13 | Train Loss:  0.600 | Train Accuracy:  0.945 | Val Loss:  0.628 | Val Accuracy:  0.926


100%|██████████| 1324/1324 [03:59<00:00,  5.53it/s]
100%|██████████| 331/331 [00:17<00:00, 18.70it/s]


Epochs: 14 | Train Loss:  0.600 | Train Accuracy:  0.945 | Val Loss:  0.631 | Val Accuracy:  0.924


100%|██████████| 1324/1324 [03:59<00:00,  5.53it/s]
100%|██████████| 331/331 [00:17<00:00, 18.69it/s]

Epochs: 15 | Train Loss:  0.599 | Train Accuracy:  0.945 | Val Loss:  0.628 | Val Accuracy:  0.926
Early stopping





In [227]:
def get_type_predictions(model, loader):

    device = torch.device("cuda")
    model = model.to(device)
    
    type_probs, polarity_probs, tense_probs, clarity_probs = [], [], [], []
    with torch.no_grad():
        model.eval()
        for data_input, _, _, _, _ in tqdm(loader):
            attention_mask = data_input['attention_mask'].to(device)
            input_ids = data_input['input_ids'].squeeze(1).to(device)


            type_output, polarity_output, tense_output, clarity_output = model(input_ids, attention_mask)
            type_probs.append(type_output)
            polarity_probs.append(polarity_output)
            tense_probs.append(tense_output)
            clarity_probs.append(clarity_output)
    
    return torch.cat(type_probs).cpu().detach().numpy(), \
            torch.cat(polarity_probs).cpu().detach().numpy(), \
            torch.cat(tense_probs).cpu().detach().numpy(), \
            torch.cat(clarity_probs).cpu().detach().numpy()

In [228]:
model = torch.load("/klue.pt")
test_dataloader = DataLoader(SentenceTypeDataset(test, tokenizer), batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [229]:
test_pred_type, test_pred_polarity, test_pred_tense, test_pred_certainty = get_type_predictions(model, test_dataloader)

100%|██████████| 709/709 [00:37<00:00, 18.76it/s]


In [230]:
test_type = ['대화형' if i==0 else '사실형' if i==1 else '예측형' if i==2 else '추론형' for i in [np.argmax(p) for p in test_pred_type]]
test_polarity = ['긍정' if i==0 else '미정' if i==1 else '부정' for i in [np.argmax(p) for p in test_pred_polarity]]
test_tense = ['과거' if i==0 else '미래' if i==1 else '현재' for i in [np.argmax(p) for p in test_pred_tense]]
test_certainty = ['불확실' if i==0 else '확실' for i in [np.argmax(p) for p in test_pred_certainty]]

In [233]:
label_sum = []
for i in range(len(test_type)):
    label_sum.append(f'{test_type[i]}-{test_polarity[i]}-{test_tense[i]}-{test_certainty[i]}')

submission['label'] = label_sum
submission.to_csv('/klue_robert_base_10_acc.csv', index=False)

In [232]:
submission

Unnamed: 0,ID,label
0,TEST_0000,사실형-긍정-현재-확실
1,TEST_0001,사실형-긍정-현재-확실
2,TEST_0002,사실형-긍정-과거-확실
3,TEST_0003,사실형-긍정-현재-확실
4,TEST_0004,사실형-긍정-과거-확실
...,...,...
7085,TEST_7085,사실형-긍정-현재-확실
7086,TEST_7086,추론형-긍정-현재-확실
7087,TEST_7087,사실형-긍정-미래-확실
7088,TEST_7088,추론형-긍정-미래-확실
