In [1]:
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to the project's `src` package are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Turn off Jedi-based tab completion in IPython.
%config Completer.use_jedi = False
# Move up one directory; the relative paths used later in this notebook
# ("./logs", "./cache/...", "outputs/...") are resolved from there.
# NOTE(review): not idempotent - re-running this cell moves up one more level.
%cd ..

/jupyter-lab/repo/bigstar


# Import Modules

In [2]:
import os

# Process-wide settings, applied before torch / transformers are imported below.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "1",  # GPU index exposed to this kernel
        "TOKENIZERS_PARALLELISM": "false",
        "TRANSFORMERS_VERBOSITY": "error",
    }
)

import pickle
from collections import Counter
from functools import partial
from pathlib import Path
from typing import Dict, Iterable, Tuple

import joblib
import matplotlib.pyplot as plt
import mecab_ko as mecab
import nlpaug.augmenter.word as naw
import numpy as np
import pandas as pd
import scipy.special as sp
import seaborn as sns
import torch
from nlpaug.util import Action
from scipy.sparse import csr_matrix
from scipy.special import softmax
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from src.base_trainer import get_ckpt_path, get_model_hparams, load_model_state
from src.baseline.models import BaselineModel, BaselineModelWithMLAttention
from src.baseline.trainer import BaselineTrainerModel
from src.datasets import LotteQADataset, collate_fn
from src.eda import EDA
from src.utils import (
    delete_list_elements,
    filter_arguments,
    get_label_encoder,
    get_n_samples,
)
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoTokenizer,
    MarianMTModel,
)
from transformers.models.marian.convert_marian_to_pytorch import (
    load_config_from_state_dict,
)
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

plt.rcParams["font.family"] = "NanumGothic"

# Functions

In [3]:
def predict(run_id, **dataset_kwargs):
    """Evaluate a trained baseline checkpoint on a ``LotteQADataset``.

    Restores the best checkpoint and the model hyper-parameters logged under
    ``./logs`` for ``run_id``, runs batched inference over the dataset built
    from ``dataset_kwargs``, prints the aggregate scores and returns them
    together with per-class statistics.

    Args:
        run_id: Identifier of the training run inside ``./logs``.
        **dataset_kwargs: Forwarded unchanged to ``LotteQADataset``.

    Returns:
        dict with the keys
        ``df`` (per-class support / true positives / predicted counts),
        ``matrix`` (confusion matrix, rows = ground truth),
        ``f1_micro``, ``prec_macro``, ``recall_macro``, ``f1_macro``,
        ``prec_weighted``, ``recall_weighted``, ``f1_weighted``,
        ``prec``, ``recall``, ``f1`` (per-class arrays),
        ``gt`` (encoded ground truth), ``predictions`` (argmax class ids) and
        ``probs`` (raw model outputs; no softmax is applied here).
    """
    ckpt_path = get_ckpt_path(log_dir="./logs", run_id=run_id, load_best=True)
    model_hparams = get_model_hparams(
        "./logs", run_id, BaselineTrainerModel.MODEL_HPARAMS
    )
    le = get_label_encoder("./cache/label_encoder.joblib")

    model_cls = (
        BaselineModel
        if model_hparams["model_name"] == "Baseline"
        else BaselineModelWithMLAttention
    )

    model = model_cls(
        num_labels=len(le.classes_), **filter_arguments(model_hparams, model_cls)
    )

    # Checkpoint keys carry a "model." prefix that the bare model does not use.
    load_model_state(model, ckpt_path, substitution=(r"^model\.", ""))

    # Fall back to the CPU when no GPU is visible instead of crashing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_cuda = device.type == "cuda"

    test_dataset = LotteQADataset(**dataset_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_hparams["pretrained_model_name"])
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=16,
        pin_memory=use_cuda,  # pinned memory only helps host-to-GPU copies
        num_workers=4,
        collate_fn=partial(
            collate_fn,
            tokenizer=tokenizer,
            le=le,
            max_length=model_hparams["max_length"],
        ),
    )

    model.to(device)
    model.eval()
    probs = []

    # The batch labels are not needed: ground truth is taken from the dataset.
    for batch_x, _ in tqdm(test_dataloader):
        batch_x = {k: v.to(device) for k, v in batch_x.items()}
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=use_cuda):
            outputs = model(batch_x)
        # Convert explicitly instead of handing torch tensors to numpy.
        probs.append(outputs.detach().cpu().numpy())

    probs = np.concatenate(probs)
    predictions = probs.argmax(axis=-1)

    gt = le.transform(test_dataset.y)
    # Every encoded class id, so that per-class outputs always line up with
    # ``le.classes_`` even when a class is absent from this dataset.
    label_ids = np.arange(len(le.classes_))

    f1_micro = f1_score(gt, predictions, average="micro")
    prec_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        gt,
        predictions,
        average="macro",
        zero_division=0,
    )
    prec_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
        gt,
        predictions,
        average="weighted",
        zero_division=0,
    )
    prec, recall, f1, _ = precision_recall_fscore_support(
        gt, predictions, labels=label_ids, average=None, zero_division=0
    )

    print(f"f1        (micro): {f1_micro:.4f}")
    print(f"prec      (macro): {prec_macro:.4f}")
    print(f"recall    (macro): {recall_macro:.4f}")
    print(f"f1        (macro): {f1_macro:.4f}")
    print(f"prec   (weighted): {prec_weighted:.4f}")
    print(f"recall (weighted): {recall_weighted:.4f}")
    print(f"f1     (weighted): {f1_weighted:.4f}")

    # Without ``labels`` the matrix only covers classes that occur in
    # gt/predictions, which breaks the DataFrame below (length mismatch).
    matrix = confusion_matrix(gt, predictions, labels=label_ids)

    data = {
        "intent": le.classes_,
        "support": matrix.sum(axis=-1),  # row sums: samples per true class
        "# to tp": np.diag(matrix),  # correctly classified samples per class
        "# of predicted": matrix.sum(axis=0),  # column sums: predictions per class
    }

    df = pd.DataFrame(data)

    return {
        "df": df,
        "matrix": matrix,
        "f1_micro": f1_micro,
        "prec_macro": prec_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,  # was computed and printed but never returned
        "prec_weighted": prec_weighted,
        "recall_weighted": recall_weighted,
        "f1_weighted": f1_weighted,
        "prec": prec,
        "recall": recall,
        "f1": f1,
        "gt": gt,
        "predictions": predictions,
        "probs": probs,
    }


def plot_label_distribution(df, key="인텐트", columns=None):
    """Draw a bar chart of how often each value of ``df[key]`` occurs.

    Args:
        df: DataFrame holding the label column.
        key: Name of the label column to count.
        columns: Optional labels to select (via ``.loc``) from the counts;
            when ``None`` every label is plotted.
    """
    label_counts = df[key].value_counts()
    shown = label_counts if columns is None else label_counts.loc[columns]

    plt.figure(figsize=(16, 8))
    sns.barplot(x=shown.index, y=shown.to_numpy())

# Analysis

In [4]:
# joblib.load unpickles the file, so only load artifacts you produced yourself.
results = joblib.load("outputs/pred.joblib")
results1, results2, results3 = (
    results[0],  # Baseline prediction (test set)
    results[1],  # Prediction trained with augmented train+back.v9 (test set)
    results[2],  # Prediction trained with original (back.v1)
)

In [6]:
# Number of evaluated samples behind each of the three result sets.
tuple(len(res["gt"]) for res in (results1, results2, results3))

(84814, 84814, 424220)

In [9]:
# Weighted F1 of each of the three result sets, in the same order as above.
tuple(res["f1_weighted"] for res in (results1, results2, results3))

(0.9312974972660338, 0.9295696294139754, 0.6060449461970544)

# GPT2

In [669]:
# Load the tokenizer and the causal language model for the same checkpoint
# name; `from_pretrained` resolves it via the Hugging Face hub or local cache.
pretrained_model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

In [707]:
# Encode the generation prompt as PyTorch tensors and display the encoding
# (input ids and attention mask) as the cell output.
text = "주문"
inputs = tokenizer(text, return_tensors='pt')
inputs

{'input_ids': tensor([[16812]]), 'attention_mask': tensor([[1]])}

In [None]:
# `device` only exists as a local inside predict(); define it at notebook scope
# so this cell also works on a fresh kernel, falling back to the CPU without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Keep the encoded prompt on the same device as the model, otherwise
# generation fails with a device mismatch.
inputs = inputs.to(device)

In [726]:
%%time
gen_ids = model.generate(
    **inputs,
    max_length=128,
    repetition_penalty=2.0,
)
generated = tokenizer.batch_decode(gen_ids)
print(generated[0])

주문했다.
이날 오전 9시30분께 서울 종로구 세종로 정부중앙청사 별관 2층 회의실에서 열린 이 전 대통령과의 독대에서는 김백준 총무기획관이 "대통령께서 (김대중)전 대통령의 서거에 대해 애도를 표하고 유족들에게 위로의 말씀을 드린다"고 말했다.
이어 "이번 사태는 김대중 대통령이 지난해 12월19일 대국민담화에서 밝힌 대로 국민통합과 국가발전을 위해 혼신의 노력을 다하는 과정에서 발생한 것"이라며 이같이 밝혔다.
그는 또 박근혜 한나라당 대표의 '독도 방문과 관련,'박 대표가 직접 나서야 한다는 주장에 대해서는 일축
CPU times: user 2min 17s, sys: 77.2 ms, total: 2min 17s
Wall time: 3.87 s


In [723]:
# List the vocabulary ordered by token id, limited to the lowest ids (where the
# special and reserved tokens sit) instead of dumping the whole vocabulary.
sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])[:60]

[('<s>', 0),
 ('</s>', 1),
 ('<usr>', 2),
 ('<pad>', 3),
 ('<sys>', 4),
 ('<unk>', 5),
 ('<mask>', 6),
 ('<d>', 7),
 ('</d>', 8),
 ('<unused0>', 9),
 ('<unused1>', 10),
 ('<unused2>', 11),
 ('<unused3>', 12),
 ('<unused4>', 13),
 ('<unused5>', 14),
 ('<unused6>', 15),
 ('<unused7>', 16),
 ('<unused8>', 17),
 ('<unused9>', 18),
 ('<unused10>', 19),
 ('<unused11>', 20),
 ('<unused12>', 21),
 ('<unused13>', 22),
 ('<unused14>', 23),
 ('<unused15>', 24),
 ('<unused16>', 25),
 ('<unused17>', 26),
 ('<unused18>', 27),
 ('<unused19>', 28),
 ('<unused20>', 29),
 ('<unused21>', 30),
 ('<unused22>', 31),
 ('<unused23>', 32),
 ('<unused24>', 33),
 ('<unused25>', 34),
 ('<unused26>', 35),
 ('<unused27>', 36),
 ('<unused28>', 37),
 ('<unused29>', 38),
 ('<unused30>', 39),
 ('<unused31>', 40),
 ('<unused32>', 41),
 ('<unused33>', 42),
 ('<unused34>', 43),
 ('<unused35>', 44),
 ('<unused36>', 45),
 ('<unused37>', 46),
 ('<unused38>', 47),
 ('<unused39>', 48),
 ('<unused40>', 49),
 ('<unused41>', 50),