In [None]:
import pandas as pd
import numpy as np
import torch
import os

from transformers import AutoTokenizer, AutoConfig, TrainingArguments
from trainer import Trainer
from custom_dataset import LabelDataset, TestDataset
from datasets import load_metric, load_dataset
from classifier import RobertaSpecialTokenForSequenceClassification, RobertaForSequenceClassification
from sklearn.model_selection import StratifiedKFold
from utils import set_allseed
import warnings
import pickle
from augmentation import Augmentation

In [None]:
# --- Run configuration ---------------------------------------------------
seed = 60
batch_size = 16
save_steps = 103  # reused below as the checkpoint, logging and evaluation interval

# Seed everything first, then restrict this process to a single GPU
# (PCI bus ordering, device "1").
set_allseed(seed)
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})
gpu = torch.device("cuda")

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Pretrained checkpoint to fine-tune, and the pickle file that will hold
# the averaged test logits for this seed.
model_init = "klue/roberta-base"
name = "rdrop_logit_seed_{}.pickle".format(seed)

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass with micro-averaged F1.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (an array whose argmax along axis 1 is taken
            as the predicted class).

    Returns:
        Whatever the "f1" metric's ``compute`` returns for
        ``average="micro"``.
    """
    scorer = load_metric("f1")
    return scorer.compute(
        references=pred.label_ids,
        predictions=pred.predictions.argmax(axis=1),
        average="micro",
    )

In [None]:
# Read the train and test CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Training only needs the sentence text ("문장") and its label.
train_df = train_df.loc[:, ["문장", "label"]]

In [None]:
# Five shuffled, label-stratified folds; the seed makes the split repeatable.
kfold_function = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# Tokenizer and model configuration both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_init)
config = AutoConfig.from_pretrained(model_init)

# One output class per distinct (non-null) training label.
config.num_labels = train_df["label"].nunique()

# Copy the tokenizer's special-token ids onto the model config.
config.cls_token_id, config.eos_token_id = (
    tokenizer.cls_token_id,
    tokenizer.eos_token_id,
)

In [None]:
# Build the augmenter around the shared tokenizer and run it on the
# training frame.
aug = Augmentation(tokenizer=tokenizer)
augmented_raw = aug(train_df)

# reset_index() gives the result a fresh 0..n-1 index; the previous index
# is kept as a column rather than dropped.
aug_data = augmented_raw.reset_index()

In [None]:
# Trainer configuration shared by every cross-validation fold.
training_args = TrainingArguments(
    # Fixed: the original literal "./seed_{seed}}" was not an f-string (and
    # had a stray brace), so checkpoints went to a directory literally named
    # "seed_{seed}}" instead of one named after the seed.
    output_dir=f"./seed_{seed}",
    seed=seed,
    save_total_limit=2,
    # Checkpointing, logging and evaluation all run on the same step interval.
    save_steps=save_steps,
    num_train_epochs=5,
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # With accumulation, one optimizer step covers batch_size * 8 samples per device.
    gradient_accumulation_steps=8,
    weight_decay=1e-4,
    logging_dir="./logs",
    logging_steps=save_steps,
    evaluation_strategy="steps",
    eval_steps=save_steps,
    load_best_model_at_end=True,
)

# Test set wrapper, built once and reused for prediction after each fold.
test_dataset = TestDataset(data=test_df, tokenizer=tokenizer)

In [None]:
# Train one model per stratified fold and average the test-set predictions.

# Take the fold count from the splitter itself so the average stays correct
# if n_splits is ever changed (it used to be a hard-coded 5 here).
n_folds = kfold_function.get_n_splits()
fold_columns = ["문장", "label"]

logit = 0  # running mean of the per-fold test predictions
for train_index, test_index in kfold_function.split(aug_data["문장"], aug_data["label"]):
    # Start every fold from the same pretrained checkpoint.
    model = RobertaSpecialTokenForSequenceClassification.from_pretrained(model_init, config=config)

    # split() yields positional row indices, so select with .iloc; the
    # previous label-based lookup only worked while the index was 0..n-1.
    fold_train = aug_data[fold_columns].iloc[train_index]
    fold_valid = aug_data[fold_columns].iloc[test_index]
    train_dataset = LabelDataset(data=fold_train, tokenizer=tokenizer)
    valid_dataset = LabelDataset(data=fold_valid, tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Each fold contributes an equal share to the averaged test predictions.
    logit += trainer.predict(test_dataset).predictions / n_folds

In [None]:
# Persist the fold-averaged test predictions under the per-seed file name.
with open(name, mode="wb") as logit_file:
    pickle.dump(logit, logit_file)

In [None]:
"""
with open(name,"rb") as f:
    logit = pickle.load(f)
    
result = pd.DataFrame(logit.argmax(axis=1).tolist(), columns=["type"])
test_df_result = pd.concat([test_df,result],axis=1, ignore_index=True)
test_df_result.to_csv("result_rdrop.csv")"""