In [None]:
!pip install datasets transformers["ja"] wandb

In [None]:
# Work from the project folder so the relative data and checkpoint paths used below resolve.
# NOTE(review): the path lives under /content/drive, so it presumably requires Google Drive
# to be mounted before this cell runs — confirm.
%cd "/content/drive/MyDrive/Colab Notebooks/introduction_to_huggingface"

In [None]:
import gc
import getpass
import os
import shutil
import warnings

import datasets
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    set_seed,
)
import yaml

# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
def create_df(path, data_size=None):
    """Read a tab-separated file into a DataFrame ready for dataset creation.

    Args:
        path: Location of the TSV file (anything ``pd.read_csv`` accepts).
        data_size: If given, keep only the first ``data_size`` rows.

    Returns:
        The loaded frame, with a ``star_rating`` column (when present)
        renamed to ``labels``.

    NOTE(review): the rating values are passed through unchanged; confirm they
    already match the class indices the model expects.
    """
    frame = pd.read_csv(path, sep="\t")
    if data_size is not None:
        frame = frame.iloc[:data_size]
    # Renaming is a no-op when the column is absent, so no explicit check is needed.
    return frame.rename(columns={"star_rating": "labels"})

def create_dataset(df, tokenizer, tokenizer_kwargs=None):
    """Convert a review DataFrame into a tokenized ``datasets.Dataset``.

    Args:
        df: Frame containing a ``review_body`` text column and, optionally,
            a ``labels`` column. All other columns are ignored.
        tokenizer: Callable tokenizer applied to batches of review texts.
        tokenizer_kwargs: Optional keyword arguments forwarded to the
            tokenizer call (e.g. padding / truncation settings).

    Returns:
        A dataset holding the tokenizer outputs plus ``labels`` when the
        input frame had them; the raw text column is dropped.
    """
    source_col = "review_body"
    if tokenizer_kwargs is None:
        tokenizer_kwargs = {}

    def tokenize_function(example, tokenizer, tokenizer_kwargs=None):
        # Guard against None so the helper is safe even when called with its own default.
        return tokenizer(example[source_col], **(tokenizer_kwargs or {}))

    keep_cols = [source_col, "labels"] if "labels" in df.columns else [source_col]
    dataset = datasets.Dataset.from_pandas(df[keep_cols])

    # from_pandas only materializes "__index_level_0__" when the frame carries a
    # non-default index (e.g. after a shuffled train/test split). Removing a column
    # that does not exist makes map() raise, so drop only what is actually there.
    removable = [
        col for col in (source_col, "__index_level_0__") if col in dataset.column_names
    ]
    dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=removable,
        fn_kwargs={"tokenizer": tokenizer, "tokenizer_kwargs": tokenizer_kwargs},
    )
    return dataset

In [None]:
def compute_metrics(eval_pred):
    """Compute micro-averaged F1 and accuracy from model outputs.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the trainer.

    Returns:
        Dict with keys ``"f1"`` and ``"acc"``.
    """
    logits, labels = eval_pred
    # squeeze() drops stray singleton axes, but it also collapses a single-example
    # batch from (1, num_classes) to (num_classes,); atleast_2d restores the batch
    # axis so the argmax over axis 1 stays valid.
    logits = np.atleast_2d(np.squeeze(logits))
    preds = np.argmax(logits, axis=1)
    return {"f1": f1_score(labels, preds, average="micro"), "acc": accuracy_score(labels, preds)}

In [None]:
# Experiment configuration, kept as YAML so every tunable value lives in one place.
#
# SECURITY: the Weights & Biases API key is deliberately NOT stored in this notebook.
# It is resolved at run time below. Any key that was previously committed in this
# file should be treated as leaked and revoked.
#
# NOTE(review): the `trainer` section is forwarded verbatim to TrainingArguments and
# the install cell does not pin library versions, so confirm every key is still
# accepted by the installed transformers release.
config_str = """
path: 
    data_path: "data/amazon_reveiws_and_ratings.tsv"
    checkpoint_path: "checkpoint/bert-base-japanese-v2"

model_name: "cl-tohoku/bert-base-japanese-v2"

num_labels: 5

tokenizer:
    padding: "do_not_pad"
    truncation: True
    max_length: 256

trainer:
    evaluation_strategy: "epoch"
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 16
    learning_rate: 2.0e-5
    weight_decay: 0.01
    num_train_epochs: 5
    lr_scheduler_type: "linear"
    warmup_steps: 0
    log_level: "warning"
    logging_strategy: "steps"
    logging_steps: 100
    save_strategy: "epoch"
    save_steps: 1
    save_total_limit: 1
    fp16: True
    eval_steps: 1
    dataloader_num_workers: 2
    load_best_model_at_end: True
    metric_for_best_model: "eval_loss"
    greater_is_better: False
    report_to: "wandb"

wandb:
    project: "introduction_to_huggingface"

early_stopping_patience: 2

seed: 42
"""

config = yaml.safe_load(config_str)

# Take the W&B key from the environment when it is already set (e.g. on a re-run),
# otherwise prompt for it without echoing. It is stored under the same config entry
# as before so downstream cells keep working unchanged.
config["wandb"]["api_key"] = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")

In [None]:
# Release cached GPU memory and collect garbage left over from a previous run.
torch.cuda.empty_cache()
gc.collect()

# Expose the Weights & Biases settings through environment variables.
os.environ.update(
    WANDB_API_KEY=config["wandb"]["api_key"],
    WANDB_PROJECT=config["wandb"]["project"],
)

# Seed before splitting so the whole run is repeatable.
set_seed(config["seed"])

# Load only the first 300 rows, then split them: one third is held out and
# that held-out part is halved into validation and test sets.
all_df = create_df(config["path"]["data_path"], data_size=300)
train_df, valid_test_df = train_test_split(
    all_df, random_state=config["seed"], test_size=2/6
)
valid_df, test_df = train_test_split(
    valid_test_df, random_state=config["seed"], test_size=1/2
)

tokenizer = AutoTokenizer.from_pretrained(config["model_name"])

# Tokenize the three splits with identical tokenizer settings.
train_dataset, valid_dataset, test_dataset = (
    create_dataset(split_df, tokenizer, tokenizer_kwargs=config["tokenizer"])
    for split_df in (train_df, valid_df, test_df)
)


In [None]:
# Fine-tune the pretrained classifier on the training split, evaluating on the
# validation split with early stopping.
model = AutoModelForSequenceClassification.from_pretrained(config["model_name"], num_labels=config["num_labels"])

# Stop training once the monitored metric has not improved for the configured
# number of evaluations.
early_stopping_callback = EarlyStoppingCallback(config["early_stopping_patience"])

# Intermediate checkpoints go to a scratch directory that is removed at the end of this cell.
temp_dir = "checkpoints/temp"
training_args = TrainingArguments(
    output_dir=temp_dir,
    seed=config["seed"],
    run_name=config["model_name"],
    **config["trainer"]
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)
trainer.train()

eval_result = trainer.evaluate(valid_dataset)

# Persist the final model BEFORE discarding the intermediate checkpoints: if the
# save fails, the checkpoints on disk are still there to recover from.
trainer.save_model(config["path"]["checkpoint_path"])
shutil.rmtree(temp_dir)

In [None]:
# Display the metrics returned by trainer.evaluate() on the validation split.
eval_result

In [None]:
# Reload the tokenizer and model from the saved checkpoint directory, so the
# prediction uses the artifacts on disk rather than the in-memory training objects.
saved_model_dir = config["path"]["checkpoint_path"]
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(
    saved_model_dir,
    num_labels=config["num_labels"],
)

# A Trainer built without explicit arguments is used here only to run prediction.
trainer = Trainer(model=model, tokenizer=tokenizer)
test_result = trainer.predict(test_dataset)

# Highest-scoring class index for each test example.
prediction = np.argmax(test_result.predictions, axis=1)

In [None]:
# Display the full output of trainer.predict() on the test split.
test_result

In [None]:
# Display the predicted class index (argmax over the model outputs) for each test example.
prediction