In [5]:
import pandas as pd
import numpy as np
import torch
import os

from transformers import AutoTokenizer, AutoConfig, TrainingArguments, Trainer
from mkdataset import TypeDataset, EmotionDataset, TimeDataset, ConfidenceDataset, TestDataset
from datasets import load_metric, load_dataset
from classifier import RobertaForSequenceClassification
from sklearn.model_selection import train_test_split

In [6]:
# Reproducibility setup: seed every RNG this notebook touches and force cuDNN into
# deterministic mode so repeated runs pick the same kernels.
seed = 777
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
# affects subprocesses spawned later, not the hash seed of the running kernel.
os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Seed every visible CUDA device, not just the current one.
torch.cuda.manual_seed_all(seed)  # type: ignore
torch.backends.cudnn.deterministic = True  # type: ignore
# Benchmark mode autotunes convolution algorithms per input shape and may select
# different kernels from run to run; it must be off for deterministic results.
torch.backends.cudnn.benchmark = False  # type: ignore

In [7]:
# Metric objects keyed by name, so each one is loaded at most once per kernel session.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the metric called ``name``, loading it only the first time it is requested."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = load_metric(name)
    return _LOADED_METRICS[name]


def compute_metrics(pred):
    """Score one evaluation pass with micro-averaged F1.

    Args:
        pred: Prediction object exposing ``label_ids`` (gold labels) and
            ``predictions`` (per-class scores; the arg-max over axis 1 is
            taken as the predicted class).

    Returns:
        Whatever the ``f1`` metric's ``compute`` returns for these predictions
        (passed through unchanged).
    """
    references = pred.label_ids
    predictions = pred.predictions.argmax(axis=1)
    # The metric is cached: this function runs on every evaluation step, and
    # re-loading the metric script each time is pure overhead.
    f1 = _get_metric("f1")
    return f1.compute(predictions=predictions, references=references, average="micro")

In [8]:
# Read the training table and the test table from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [9]:
def _split_stratified_by(label_column):
    """Split ``train_df`` 70/30 into (train, valid), stratified on ``label_column``."""
    return train_test_split(
        train_df,
        test_size=0.3,
        random_state=seed,
        stratify=train_df[label_column],
    )


# One independent split per label column, so each task's validation set keeps
# that task's own class proportions.
train_type_df, valid_type_df = _split_stratified_by("유형")
train_emotion_df, valid_emotion_df = _split_stratified_by("극성")
train_time_df, valid_time_df = _split_stratified_by("시제")
train_confidence_df, valid_confidence_df = _split_stratified_by("확실성")

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return floored.astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/d

In [10]:
# Tokenizer of the pretrained checkpoint; it is handed to every dataset wrapper below.
tokenizer_checkpoint = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [11]:
# Wrap each task's train/validation frame in its dataset class.
# Every training dataset must come from that task's own *train* split: building it
# from the full `train_df` would put the task's validation rows into its training set.
train_type_dataset = TypeDataset(data=train_type_df, tokenizer=tokenizer)
train_emotion_dataset = EmotionDataset(data=train_emotion_df, tokenizer=tokenizer)
# Fixed: previously built from `train_df`, leaking `valid_time_df` rows into training.
train_time_dataset = TimeDataset(data=train_time_df, tokenizer=tokenizer)
train_confidence_dataset = ConfidenceDataset(data=train_confidence_df, tokenizer=tokenizer)

valid_type_dataset = TypeDataset(data=valid_type_df, tokenizer=tokenizer)
valid_emotion_dataset = EmotionDataset(data=valid_emotion_df, tokenizer=tokenizer)
valid_time_dataset = TimeDataset(data=valid_time_df, tokenizer=tokenizer)
valid_confidence_dataset = ConfidenceDataset(data=valid_confidence_df, tokenizer=tokenizer)

# The test set is shared by all four tasks.
test_dataset = TestDataset(data=test_df, tokenizer=tokenizer)

In [12]:
# Device handles: the first CUDA device and the host CPU.
gpu, cpu = torch.device("cuda", 0), torch.device("cpu")

In [13]:
def _config_for(label_column):
    """Load the base checkpoint config and size it to the distinct labels in ``label_column``."""
    task_config = AutoConfig.from_pretrained("klue/roberta-base")
    task_config.num_labels = len(train_df[label_column].value_counts())
    return task_config


# One config per task; they differ only in the number of output labels.
type_config = _config_for("유형")
emotion_config = _config_for("극성")
time_config = _config_for("시제")
confidence_config = _config_for("확실성")

In [14]:
# Instantiate one classifier per task from the same pretrained checkpoint,
# each with its own task-specific config.
type_model, emotion_model, time_model, confidence_model = (
    RobertaForSequenceClassification.from_pretrained("klue/roberta-base", config=task_config)
    for task_config in (type_config, emotion_config, time_config, confidence_config)
)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'ls

In [15]:
# Shared loop constants: per-device batch size, and the step interval reused for
# saving, logging and evaluation.
batch_size, save_steps = 64, 45

# Only the type model is moved to the GPU explicitly here.
type_model.to(gpu)

In [16]:
# Evaluation, checkpointing and logging all fire on the same step interval.
_step_schedule = dict(
    evaluation_strategy="steps",
    eval_steps=save_steps,
    save_steps=save_steps,
    logging_steps=save_steps,
)

# Optimisation hyper-parameters: 3 epochs, gradients accumulated over 4 steps.
_optimisation = dict(
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=1e-4,
    gradient_accumulation_steps=4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

# Keep at most two checkpoints and reload the one with the best `eval_f1`
# when training finishes.
training_args = TrainingArguments(
    output_dir="./output_type",
    logging_dir="./logs",
    seed=seed,
    save_total_limit=2,
    metric_for_best_model="eval_f1",
    load_best_model_at_end=True,
    **_optimisation,
    **_step_schedule,
)

In [None]:
# --- Sentence type: fine-tune, save the model, predict on the test set ---
type_trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=valid_type_dataset,
    train_dataset=train_type_dataset,
    args=training_args,
    model=type_model,
)
type_trainer.train()
type_model.save_pretrained("./model_type")

# Predicted class per test row = arg-max over the class axis of the returned scores.
pred_tensor = type_trainer.predict(test_dataset)
type_scores = pred_tensor.predictions
pred_type = type_scores.argmax(axis=1).tolist()

# --- Polarity (emotion): fine-tune, save the model, predict on the test set ---
emotion_trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=valid_emotion_dataset,
    train_dataset=train_emotion_dataset,
    args=training_args,
    model=emotion_model,
)
emotion_trainer.train()
emotion_model.save_pretrained("./model_emotion")

# Predicted class per test row = arg-max over the class axis of the returned scores.
pred_tensor = emotion_trainer.predict(test_dataset)
emotion_scores = pred_tensor.predictions
pred_emotion = emotion_scores.argmax(axis=1).tolist()

# --- Tense (time): fine-tune, save the model, predict on the test set ---
# Fixed: this trainer used to be fed the *type* datasets (copy-paste slip), so the
# tense classifier was trained and evaluated against sentence-type labels.
# NOTE(review): `training_args` is shared by every trainer, so all tasks write their
# checkpoints into the same output directory; consider a per-task output_dir.
time_trainer = Trainer(
    model=time_model,
    args=training_args,
    train_dataset=train_time_dataset,
    eval_dataset=valid_time_dataset,
    compute_metrics=compute_metrics,
)

time_trainer.train()

time_model.save_pretrained("./model_time")
# Predicted class per test row = arg-max over the class axis of the returned scores.
pred_tensor = time_trainer.predict(test_dataset)
pred_time = pred_tensor.predictions.argmax(axis=1).tolist()

# --- Certainty (confidence): fine-tune, save the model, predict on the test set ---
# Fixed: this trainer used to be fed the *type* datasets (copy-paste slip), so the
# certainty classifier was trained and evaluated against sentence-type labels.
# NOTE(review): `training_args` is shared by every trainer, so all tasks write their
# checkpoints into the same output directory; consider a per-task output_dir.
confidence_trainer = Trainer(
    model=confidence_model,
    args=training_args,
    train_dataset=train_confidence_dataset,
    eval_dataset=valid_confidence_dataset,
    compute_metrics=compute_metrics,
)

confidence_trainer.train()

confidence_model.save_pretrained("./model_confidence")
# Predicted class per test row = arg-max over the class axis of the returned scores.
pred_tensor = confidence_trainer.predict(test_dataset)
pred_confidence = pred_tensor.predictions.argmax(axis=1).tolist()