In [1]:
import pandas as pd
import numpy as np
import torch
import os

from transformers import AutoTokenizer, AutoConfig, TrainingArguments, EarlyStoppingCallback

from trainer import Trainer
from custom_dataset import LabelDataset, TestDataset

from custom_model import RobertaSpecialTokenForSequenceClassification

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

from utils import set_allseed

import warnings
import pickle

from augmentation import Augmentation

In [2]:
idx2label = {0: '사실형-긍정-현재-확실',
 1: '사실형-긍정-과거-확실',
 2: '사실형-긍정-미래-확실',
 3: '추론형-부정-현재-확실',
 4: '예측형-긍정-미래-불확실',
 5: '추론형-긍정-현재-확실',
 6: '추론형-긍정-과거-확실',
 7: '추론형-긍정-현재-불확실',
 8: '대화형-긍정-미래-확실',
 9: '사실형-미정-현재-확실',
 10: '사실형-부정-과거-확실',
 11: '예측형-부정-과거-확실',
 12: '추론형-긍정-미래-확실',
 13: '사실형-긍정-미래-불확실',
 14: '대화형-긍정-현재-확실',
 15: '사실형-부정-현재-확실',
 16: '대화형-긍정-과거-확실',
 17: '사실형-긍정-과거-불확실',
 18: '사실형-긍정-현재-불확실',
 19: '대화형-긍정-현재-불확실',
 20: '예측형-미정-미래-불확실',
 21: '예측형-긍정-미래-확실',
 22: '추론형-부정-미래-확실',
 23: '사실형-미정-미래-확실',
 24: '추론형-긍정-미래-불확실',
 25: '대화형-부정-과거-확실',
 26: '대화형-긍정-미래-불확실',
 27: '대화형-미정-미래-불확실',
 28: '추론형-미정-미래-불확실',
 29: '추론형-부정-미래-불확실',
 30: '추론형-부정-과거-확실',
 31: '사실형-미정-미래-불확실',
 32: '추론형-긍정-과거-불확실',
 33: '예측형-긍정-현재-확실',
 34: '사실형-부정-과거-불확실',
 35: '예측형-긍정-과거-확실',
 36: '예측형-긍정-과거-불확실',
 37: '대화형-긍정-과거-불확실',
 38: '대화형-미정-과거-불확실',
 39: '사실형-부정-미래-확실',
 40: '추론형-부정-현재-불확실',
 41: '사실형-미정-현재-불확실',
 42: '대화형-미정-현재-불확실',
 43: '예측형-부정-현재-불확실',
 44: '대화형-부정-현재-불확실',
 45: '예측형-긍정-현재-불확실',
 46: '추론형-미정-미래-확실',
 47: '사실형-부정-미래-불확실',
 48: '추론형-미정-현재-불확실',
 49: '대화형-부정-현재-확실',
 50: '사실형-미정-과거-확실',
 51: '추론형-부정-과거-불확실',
 52: '사실형-부정-현재-불확실',
 53: '대화형-부정-미래-확실',
 54: '예측형-미정-현재-확실',
 55: '예측형-미정-현재-불확실',
 56: '예측형-부정-미래-불확실',
 57: '대화형-미정-미래-확실',
 58: '대화형-미정-과거-확실',
 59: '추론형-미정-현재-확실',
 60: '대화형-부정-과거-불확실',
 61: '추론형-미정-과거-불확실',
 62: '예측형-미정-미래-확실',
 63: '예측형-미정-과거-확실'}

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df = train_df[["문장","label"]]

In [4]:
seed = 2
batch_size = 8
save_steps = 773
set_allseed(seed)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "0"
gpu = torch.device("cuda")
warnings.filterwarnings(action='ignore')
model_init = "klue/roberta-large"
name = "rdrop_logit_seed_{}.pickle".format(seed)

In [5]:
def compute_metrics(pred):
    predict = pred.predictions.argmax(axis=1)
    ref = pred.label_ids
    pred_li, ref_li = [], []
    for i, j in zip (predict, ref):
        prediction, reference = [0] * len(idx2label), [0] * len(idx2label)
        prediction[i] = 1
        reference[j] = 1
        pred_li.append(prediction)
        ref_li.append(reference)
    f1 = f1_score(pred_li, ref_li, average="weighted")
    return {'f1' : f1 }

In [6]:
kfold_function = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
tokenizer = AutoTokenizer.from_pretrained(model_init)
config = AutoConfig.from_pretrained(model_init)
config.num_labels = len(train_df.label.value_counts())
config.cls_token_id = tokenizer.cls_token_id
config.eos_token_id = tokenizer.eos_token_id

In [7]:
aug = Augmentation(tokenizer = tokenizer)
aug_data = aug(train_df).reset_index()

Category : 사실형-긍정-현재-확실 	 Previous size : 4743, Current size : 3794
Category : 사실형-긍정-과거-확실 	 Previous size : 7113, Current size : 5690
Category : 사실형-긍정-미래-확실 	 Previous size : 649, Current size : 649
Category : 추론형-부정-현재-확실 	 Previous size : 93, Current size : 93
Category : 예측형-긍정-미래-불확실 	 Previous size : 141, Current size : 141
Category : 추론형-긍정-현재-확실 	 Previous size : 1101, Current size : 1101
Category : 추론형-긍정-과거-확실 	 Previous size : 335, Current size : 335
Category : 추론형-긍정-현재-불확실 	 Previous size : 130, Current size : 130
Category : 대화형-긍정-미래-확실 	 Previous size : 26, Current size : 26
Category : 사실형-미정-현재-확실 	 Previous size : 18, Current size : 20
Category : 사실형-부정-과거-확실 	 Previous size : 183, Current size : 183
Category : 예측형-부정-과거-확실 	 Previous size : 1, Current size : 20
Category : 추론형-긍정-미래-확실 	 Previous size : 204, Current size : 204
Category : 사실형-긍정-미래-불확실 	 Previous size : 236, Current size : 236
Category : 대화형-긍정-현재-확실 	 Previous size : 257, Current size : 257
Category :

100%|██████████| 14675/14675 [00:01<00:00, 12856.91it/s]


In [8]:
training_args = TrainingArguments(
    output_dir="./{}".format(seed),
    seed=seed,
    save_total_limit=2,
    save_steps = save_steps,
    num_train_epochs = 5,
    learning_rate= 3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=2,
    weight_decay=0.001,
    warmup_ratio=0.05,
    warmup_steps=save_steps,
    logging_dir="./logs",
    logging_steps = save_steps,
    evaluation_strategy = "steps",
    eval_steps = save_steps,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
)
test_dataset = TestDataset(data=test_df, tokenizer=tokenizer)

In [None]:
logitlist = []
for i, (train_index, test_index) in enumerate(kfold_function.split(aug_data["문장"],aug_data["label"])):
    model = RobertaSpecialTokenForSequenceClassification.from_pretrained(model_init, config=config)
    train_corpus, valid_corpus = aug_data["문장"][train_index], aug_data["문장"][test_index]
    train_label, valid_label = aug_data["label"][train_index], aug_data["label"][test_index]
    fold_train = pd.concat([train_corpus, train_label], axis = 1)
    fold_valid = pd.concat([valid_corpus, valid_label], axis = 1)
    train_dataset = LabelDataset(data=fold_train, tokenizer=tokenizer)
    valid_dataset = LabelDataset(data=fold_valid, tokenizer=tokenizer)

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    )

    trainer.train()
    break

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaSpecialTokenForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaSpecialTokenForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaSpecialTokenForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaSpecialTokenForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.d

Step,Training Loss,Validation Loss,F1
773,1.6392,1.227466,0.701953


***** Running Evaluation *****
  Num examples = 2935
  Batch size = 8
Saving model checkpoint to ./2/checkpoint-773
Configuration saved in ./2/checkpoint-773/config.json
Model weights saved in ./2/checkpoint-773/pytorch_model.bin


In [None]:
with open(name,"wb") as f:
    pickle.dump(logit, f)

In [None]:
"""
with open(name,"rb") as f:
    logit = pickle.load(f)
    
result = pd.DataFrame(logit.argmax(axis=1).tolist(), columns=["type"])
test_df_result = pd.concat([test_df,result],axis=1, ignore_index=True)
test_df_result.to_csv("result_rdrop.csv")"""