In [1]:
import pandas as pd
import numpy as np
import torch
import os

from transformers import AutoTokenizer, AutoConfig, TrainingArguments, Trainer
from mkdataset import TypeDataset, TestDataset
from datasets import load_metric, load_dataset
from classifier import RobertaForSequenceClassification
from sklearn.model_selection import train_test_split

In [2]:
# Pin GPU enumeration to PCI bus order and expose only device 0 to this process.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

# Hugging Face checkpoint id used for the tokenizer, the config and the model.
backbone = "klue/roberta-base"

In [3]:
# Single seed shared by every source of randomness in this notebook
# (also passed to train_test_split and TrainingArguments further down).
seed = 777
os.environ["PYTHONHASHSEED"] = str(seed)

# Seed NumPy and PyTorch (CPU + every CUDA device, not only the current one).
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)  # type: ignore
torch.cuda.manual_seed_all(seed)  # type: ignore

# Ask cuDNN for deterministic kernels. The autotuner (`benchmark`) must be OFF:
# with it enabled cuDNN may select a different algorithm from run to run, which
# defeats the determinism requested on the line above.
torch.backends.cudnn.deterministic = True  # type: ignore
torch.backends.cudnn.benchmark = False  # type: ignore

In [4]:
def compute_metrics(pred):
    """Score one evaluation pass with micro-averaged F1.

    Args:
        pred: Prediction object exposing ``predictions`` (reduced with
            ``argmax`` over axis 1 to obtain a class id per row) and
            ``label_ids`` (the reference classes).

    Returns:
        The value returned by the ``f1`` metric's ``compute`` call
        (presumably a dict keyed by ``"f1"`` -- confirm against the
        ``datasets`` version in use).
    """
    scorer = load_metric("f1")
    return scorer.compute(
        predictions=pred.predictions.argmax(axis=1),
        references=pred.label_ids,
        average="micro",
    )

In [5]:
# Read the labelled training set and the test set from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [6]:
# Hold out 30% of the labelled rows for validation; stratifying on the "유형"
# label column keeps the class proportions equal in both parts.
train_type_df, valid_type_df = train_test_split(
    train_df,
    test_size=0.3,
    random_state=seed,
    stratify=train_df["유형"],
)

In [7]:
# Tokenizer matching the backbone checkpoint, shared by all datasets below.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=backbone)

In [8]:
# Labelled datasets: the full training frame (used for class weights), then the
# 70% / 30% train and validation parts produced by the stratified split.
train_dataset, train_type_dataset, valid_type_dataset = (
    TypeDataset(data=frame, tokenizer=tokenizer)
    for frame in (train_df, train_type_df, valid_type_df)
)

# The test set is wrapped by its own dataset class.
test_dataset = TestDataset(data=test_df, tokenizer=tokenizer)

In [None]:
def crossentropy_weight(dataset, device=None):
    """Compute inverse-frequency class weights for a weighted cross-entropy loss.

    Each class receives ``1 / count``, rescaled so that the most frequent class
    has weight 1.0 and rarer classes have proportionally larger weights. The
    weights are ordered by ascending class label.

    Args:
        dataset: Object exposing a ``labels`` attribute that ``pd.DataFrame``
            can consume (one label per sample).
        device: Device on which to place the resulting tensor. When omitted,
            CUDA is used if available and the CPU otherwise, so the function
            no longer depends on a notebook-level ``gpu`` variable having been
            defined by another cell.

    Returns:
        A 1-D ``torch.float32`` tensor with one weight per class that occurs in
        ``dataset.labels``. A class that never occurs gets no entry, so the
        tensor can be shorter than the model's label count.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Count samples per class and order the counts by class label so that
    # position i in the output corresponds to the i-th smallest label.
    class_counts = pd.DataFrame(dataset.labels).value_counts().sort_index()

    inverse_frequency = 1.0 / class_counts
    relative_weights = inverse_frequency / inverse_frequency.min()

    return torch.tensor(
        relative_weights.to_numpy(), dtype=torch.float32, device=device
    )

In [9]:
# Device handles for moving tensors between the GPU and the host.
gpu, cpu = torch.device("cuda"), torch.device("cpu")

In [10]:
# Start from the backbone's configuration and size the classification head to
# the number of distinct values in the "유형" label column.
type_config = AutoConfig.from_pretrained(backbone)
type_config.num_labels = train_df["유형"].nunique()

In [11]:
# Load the backbone weights into the project's classifier and hand it the
# class weights computed from the full training set.
type_model = RobertaForSequenceClassification.from_pretrained(
    backbone,
    config=type_config,
    weight=crossentropy_weight(train_dataset),
)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['lstm.bias_ih_l0', 'lstm.weight_hh_l0', 'lstm.bias_hh_l1_revers

In [12]:
# Per-device batch size, and the step interval reused below for checkpointing,
# logging and evaluation.
batch_size, save_steps = 64, 45

In [13]:
training_args = TrainingArguments(
    # Reproducibility and output locations.
    seed=seed,
    output_dir="./output_type",
    logging_dir="./logs",
    # Optimisation schedule; 4 accumulation steps per update means each
    # optimizer step covers 4 * batch_size samples per device.
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,
    # Log, evaluate and checkpoint on the same step interval so that every
    # saved checkpoint has a matching evaluation score.
    logging_steps=save_steps,
    evaluation_strategy="steps",
    eval_steps=save_steps,
    save_steps=save_steps,
    save_total_limit=2,
    # Restore the checkpoint with the best validation F1 once training ends.
    metric_for_best_model="eval_f1",
    load_best_model_at_end=True,
)

In [14]:
# Fine-tune on the training part of the split, scoring on the validation part.
type_trainer = Trainer(
    model=type_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_type_dataset,
    eval_dataset=valid_type_dataset,
)
type_trainer.train()

# Predict on the test set and keep the highest-scoring class id per row.
pred_tensor = type_trainer.predict(test_dataset)
pred_type = np.argmax(pred_tensor.predictions, axis=1).tolist()

***** Running training *****
  Num examples = 11578
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 4
  Total optimization steps = 135


Step,Training Loss,Validation Loss,F1
45,0.5015,0.304188,0.885956
90,0.2653,0.255772,0.898449
135,0.1911,0.263237,0.89724


***** Running Evaluation *****
  Num examples = 4963
  Batch size = 64
Saving model checkpoint to ./output_type/checkpoint-45
Configuration saved in ./output_type/checkpoint-45/config.json
Model weights saved in ./output_type/checkpoint-45/pytorch_model.bin
Deleting older checkpoint [output_type/checkpoint-362] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 4963
  Batch size = 64
Saving model checkpoint to ./output_type/checkpoint-90
Configuration saved in ./output_type/checkpoint-90/config.json
Model weights saved in ./output_type/checkpoint-90/pytorch_model.bin
Deleting older checkpoint [output_type/checkpoint-543] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 4963
  Batch size = 64
Saving model checkpoint to ./output_type/checkpoint-135
Configuration saved in ./output_type/checkpoint-135/config.json
Model weights saved in ./output_type/checkpoint-135/pytorch_model.bin
Deleting older checkpoint [output_type/checkpoint-45] d

In [15]:
# Put the predicted class ids in a "type" column that shares the test frame's
# index, so the column-wise concat below lines rows up one-to-one (and raises
# immediately if the number of predictions differs from the number of rows).
result = pd.DataFrame(pred_type, columns=["type"], index=test_df.index)

# Do NOT pass ignore_index=True here: with axis=1 it relabels every column as
# 0..n-1, which would drop the original headers and the "type" name from the CSV.
test_df_type = pd.concat([test_df, result], axis=1)
test_df_type.to_csv("result_type.csv")