In [1]:
import os
import json
import torch
import wandb
import pickle
import random
import evaluate
import numpy as np
import pandas as pd
from datasets import (
    Dataset,
    DatasetDict,
    load_from_disk,
)
from easydict import EasyDict
from transformers import (
    Trainer,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
)

# Experiment identifier. It selects the config file ./args/<EXPERIMENT_NAME>.json,
# must equal that file's `wandb_name` entry (checked in get_args), and names the
# submission CSV written by main().
EXPERIMENT_NAME = "koelectra-dialect-v1-1e-05-32-5"

In [2]:
def set_seeds(seed=random.randrange(1, 10000)):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and CUDA), exports
    ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode with
    benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators. The default is drawn
            with ``random.randrange(1, 10000)`` once, when this ``def`` is
            executed, so every call that omits ``seed`` reuses that same
            value until the function is defined again.
    """
    # Exported as a string because environment variables must be text.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # All of these take the seed as their single positional argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels, no auto-tuner picking algorithms per run.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [3]:
# # Run this cell only once to convert the CSV files into a saved dataset; do not run it again afterwards.
# train_df = pd.read_csv("./train.csv", index_col="idx")
# eval_df = pd.read_csv("./valid.csv", index_col="idx")
# test_df = pd.read_csv("./test.csv", index_col="idx")

# dataset = DatasetDict({
#     "train": Dataset.from_pandas(train_df),
#     "evaluation": Dataset.from_pandas(eval_df),
#     "test": Dataset.from_pandas(test_df),
# })
# dataset.save_to_disk("./data/dialect-dataset")

In [4]:
def get_args():
    """Load the configuration of the current experiment.

    Reads ``./args/{EXPERIMENT_NAME}.json`` and wraps the parsed JSON in an
    ``EasyDict`` so that entries can be read as attributes.

    Returns:
        EasyDict: The parsed configuration.

    Raises:
        FileNotFoundError: If the config file does not exist.
        AssertionError: If the file's ``wandb_name`` entry differs from
            ``EXPERIMENT_NAME``.
    """
    # JSON is UTF-8 by specification; pass the encoding explicitly rather than
    # relying on the platform's locale default when decoding the file.
    with open(f"./args/{EXPERIMENT_NAME}.json", encoding="utf-8") as f:
        args = EasyDict(json.load(f))
    # Guards against running with a config that belongs to another experiment.
    assert EXPERIMENT_NAME == args.wandb_name, "Please check whether the config file matches."
    return args

In [5]:
# Holds the loaded metric so that it is created once per kernel session
# instead of once per evaluation pass.
_METRIC_CACHE = {}


def _get_accuracy_metric():
    """Return the accuracy metric, loading it only on the first call."""
    if "accuracy" not in _METRIC_CACHE:
        _METRIC_CACHE["accuracy"] = evaluate.load("accuracy")
    return _METRIC_CACHE["accuracy"]


def compute_metrics(eval_preds):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_preds: A pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` along the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` call for the
        predicted classes against ``labels``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _get_accuracy_metric().compute(predictions=predictions, references=labels)

In [14]:
def main():
    """Fine-tune the classifier, evaluate it, and write test-set predictions.

    Steps, in order:
      1. Seed all random number generators with 42 and load the experiment
         config via ``get_args``.
      2. Start a Weights & Biases run and record the main hyperparameters.
      3. Load the pretrained model (3 output labels) and tokenizer, and
         tokenize the dataset stored at ``./data/dialect-dataset/``.
      4. Train with the Hugging Face ``Trainer``, evaluate on the
         ``evaluation`` split, and save the model to ``args.output_dir``.
      5. Predict on the ``test`` split and write ``idx``/``label`` columns to
         ``./submission/{EXPERIMENT_NAME}.csv``.
    """
    set_seeds(42)

    args = get_args()

    # Make sure the submission directory exists before any expensive work, so
    # a missing folder cannot make the final to_csv fail after training.
    submission_path = f"./submission/{EXPERIMENT_NAME}.csv"
    os.makedirs(os.path.dirname(submission_path), exist_ok=True)

    wandb.init(
        project=args.wandb_project,
        entity=args.wandb_entity,
        name=args.wandb_name,
        group=args.wandb_group,
    )
    wandb.config.update({
        "initial_learning_rate": args.learning_rate,
        "num_epochs": args.num_epochs,
        "batch_size": args.batch_size,
    })

    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=args.model_name_or_path,
        num_labels=3,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=args.tokenizer_name_or_path,
    )
    raw_dataset = load_from_disk("./data/dialect-dataset/")
    # NOTE(review): padding="max_length" already pads every example to
    # args.max_token_length, so the DataCollatorWithPadding below presumably
    # has nothing left to pad -- confirm whether dynamic padding was intended.
    dataset = raw_dataset.map(
        lambda data: tokenizer(
            data["text"],
            truncation=True,
            padding="max_length",
            max_length=args.max_token_length,
        ),
        batched=True,
        remove_columns=["text", "idx"],
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        # Hyperparameters
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=1,
        num_train_epochs=args.num_epochs,
        learning_rate=args.learning_rate,
        lr_scheduler_type="linear",
        warmup_ratio=0.5,
        fp16=False,
        # Logging & Evaluation
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        # Saving
        save_strategy="epoch",
        save_total_limit=args.save_total_limit,
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["evaluation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    trainer.evaluate()
    trainer.save_model(output_dir=args.output_dir)

    # The label column is dropped before prediction; the predicted class is
    # the argmax over the last axis of the returned predictions.
    test_output = trainer.predict(dataset["test"].remove_columns(["label"]))
    result = pd.DataFrame()
    # "idx" was removed from the tokenized dataset, so take it from the raw one.
    result["idx"] = raw_dataset["test"]["idx"]
    result["label"] = test_output.predictions.argmax(axis=-1)
    result.to_csv(submission_path, index=False)

In [None]:
# Run the full pipeline: training, evaluation, and submission-file generation.
main()