<a href="https://colab.research.google.com/github/tchih11/qiita_eda/blob/main/notebooks/03_eda_verification_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 各種インストール

In [1]:
!git clone https://tchih11:@github.com/tchih11/qiita_eda.git

Cloning into 'qiita_eda'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 41 (delta 15), reused 6 (delta 1), pack-reused 0[K
Unpacking objects: 100% (41/41), done.


In [2]:
# early stopping 用
import os
!git clone https://github.com/Bjarten/early-stopping-pytorch.git
os.rename('early-stopping-pytorch','early_stopping_pytorch')

Cloning into 'early-stopping-pytorch'...
remote: Enumerating objects: 92, done.[K
remote: Total 92 (delta 0), reused 0 (delta 0), pack-reused 92[K
Unpacking objects: 100% (92/92), done.


In [3]:
%%capture
# Install the packages used below; %%capture hides the installer output.
# transformers is pinned to 3.5.1; fugashi and ipadic are unpinned.
# NOTE(review): fugashi / ipadic are presumably needed by BertJapaneseTokenizer — confirm, and consider pinning them too.
!pip install transformers==3.5.1
!pip install fugashi
!pip install ipadic

In [4]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from early_stopping_pytorch.pytorchtools import EarlyStopping
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import BertForSequenceClassification
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer

In [5]:
# Identifier of the pretrained checkpoint; also passed to BertForSequenceClassification.from_pretrained in modeling().
pretrained_model = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
# Tokenizer loaded for the same checkpoint; read as a global by tokenizer_512().
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=15683.0, style=ProgressStyle(descriptio…




# 関数の定義


In [17]:
def tokenizer_512(input_text):
    """
    Tokenize one text and return its token ids.

    Reads the notebook-level globals ``tokenizer`` and ``max_length``;
    both must be defined before the first call.

    Args:
        input_text (str): text to tokenize

    Returns:
        Tensor: first row of the ``input_ids`` produced by ``encode_plus``
        (padding / truncation to ``max_length`` is requested)
    """
    encoded = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Only the first row of the returned ids is needed for a single text.
    batch_ids = encoded["input_ids"]
    return batch_ids[0]

def make_torch_dataset(df, text_col, label_col, tokenizer):
    """
    Tokenize the texts of a pandas DataFrame and wrap them, together with
    their labels, in a PyTorch TensorDataset.

    Args:
        df (DataFrame): data to convert
        text_col (str): name of the column that holds the texts
        label_col (str): name of the column that holds the ground-truth labels
        tokenizer (function): callable applied to every text; must return a Tensor
            (all results need the same number of elements to be concatenated)

    Returns:
        TensorDataset: (token ids with one row per text, labels)
    """
    label_values = df[label_col]

    # Each tokenized text is reshaped to a single row, then all rows are stacked.
    id_rows = [encoded.view(1, -1) for encoded in df[text_col].apply(tokenizer)]
    id_matrix = torch.cat(id_rows, dim=0)

    label_tensor = torch.tensor(list(label_values))
    return TensorDataset(id_matrix, label_tensor)


def stratified_train_test_split_split(df,label_col,test_size=0.2):
    """
    Split a DataFrame into train / eval parts, stratified on the label column.

    The split uses a fixed ``random_state`` of 0, so it is reproducible.
    The input DataFrame is not modified: a re-indexed copy is split instead.

    Args:
        df (DataFrame): data to split
        label_col (str): name of the column that holds the ground-truth labels
            (used for stratification)
        test_size (float, optional): fraction of rows assigned to the eval part. Defaults to 0.2.

    Returns:
        tuple(DataFrame, DataFrame): (train_df, eval_df), indexed by the
        positions of the rows in the re-indexed copy
    """
    # Re-index a copy instead of mutating the caller's frame in place.
    df = df.reset_index(drop=True)
    train_df, eval_df = train_test_split(
        df,
        stratify=df[label_col],
        random_state=0,
        test_size=test_size,  # previously hard-coded to 0.2, ignoring the argument
    )
    return train_df, eval_df
    

def train_model(net, dl_train, dl_eval, device, criterion, optimizer, patience=3, batch_size=16, n_epochs=20):
    """
    Train a model with early stopping.

    Each epoch runs one optimisation pass over ``dl_train`` and one evaluation
    pass over ``dl_eval``, prints the mean train / validation loss, and hands the
    validation loss to ``EarlyStopping``. After the loop the weights stored in
    ``checkpoint.pt`` are loaded back into ``net``.

    Args:
        net (model): model to train; ``net(data)[0]`` is passed to ``criterion``
        dl_train (DataLoader): DataLoader for training; batches are (inputs, labels)
        dl_eval (DataLoader): DataLoader for validation; batches are (inputs, labels)
        device (device): device the model and the batches are moved to
        criterion (criterion): loss function
        optimizer (optim): optimizer
        patience (int, optional): passed to EarlyStopping as its patience. Defaults to 3.
        batch_size (int, optional): unused; kept for backward compatibility
            (the batch size is determined by the DataLoaders). Defaults to 16.
        n_epochs (int, optional): maximum number of epochs. Defaults to 20.

    Returns:
        model: ``net`` with the state dict from ``checkpoint.pt`` loaded
    """

    net.to(device)

    early_stopping = EarlyStopping(patience=patience, delta=0.005, verbose=True)
    # Width used to right-align the epoch counter in the progress message.
    epoch_len = len(str(n_epochs))

    for epoch in tqdm(range(1, n_epochs + 1),total=n_epochs,position=0 ,leave=True, desc="train"):
        # ---- training pass ----
        net.train()
        train_losses = []
        for batch in dl_train:
            data = batch[0].to(device)  # texts
            target = batch[1].to(device)  # labels
            optimizer.zero_grad()
            output = net(data)[0]
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # ---- validation pass ----
        net.eval()
        valid_losses = []
        # No backward pass happens here, so autograd is switched off; the loss
        # values are unchanged but no computation graph is built or kept.
        with torch.no_grad():
            for batch in dl_eval:
                data = batch[0].to(device)  # texts
                target = batch[1].to(device)  # labels
                output = net(data)[0]
                loss = criterion(output, target)
                valid_losses.append(loss.item())

        train_loss = np.average(train_losses)
        valid_loss = np.average(valid_losses)

        print_msg = (f'[{epoch:>{epoch_len}}/{n_epochs:>{epoch_len}}] ' +
                     f'train_loss: {train_loss:.5f} ' +
                     f'valid_loss: {valid_loss:.5f}')

        print(print_msg)

        early_stopping(valid_loss, net)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # NOTE(review): 'checkpoint.pt' is presumably the file EarlyStopping writes
    # when the validation loss improves — confirm against pytorchtools.
    net.load_state_dict(torch.load('checkpoint.pt'))

    return net

def valid_test(net_trained, device, dl_test):
    """
    Apply a trained model to the test data and compute its accuracy.

    Args:
        net_trained (model): trained model; ``net_trained(data)[0]`` holds the class scores
        device (device): device the model and the batches are moved to
        dl_test (DataLoader): DataLoader for the test data; batches are (inputs, labels)

    Returns:
        Tensor: accuracy on the test data (correct predictions / dataset size), as a double
    """
    net_trained.eval()
    net_trained.to(device)

    n_correct = 0
    for batch in tqdm(dl_test, position=0, leave=True, desc="predict"):
        inputs = batch[0].to(device)  # texts
        labels = batch[1].to(device)  # labels

        # Inference only: no gradients are needed.
        with torch.no_grad():
            scores = net_trained(inputs)[0]
            predicted = torch.max(scores, 1)[1]  # index of the highest score per row
            n_correct = n_correct + (predicted == labels).sum()

    n_samples = len(dl_test.dataset)
    return n_correct.double() / n_samples

In [7]:
# Entry point: train on one dataset and evaluate on the test set
def modeling(train_eval_df,test_df,num_epochs=20):
    """
    Fine-tune the pretrained BERT classifier on ``train_eval_df`` and return
    its accuracy on ``test_df``.

    Reads the notebook-level globals ``batch_size``, ``pretrained_model`` and
    (through ``tokenizer_512``) ``tokenizer`` / ``max_length``.

    Args:
        train_eval_df (DataFrame): modeling data with "text" and "label_index" columns;
            split into train / eval parts inside this function
        test_df (DataFrame): test data with "text" and "label_index" columns
        num_epochs (int, optional): maximum number of training epochs,
            forwarded to ``train_model``. Defaults to 20.

    Returns:
        float: accuracy on the test data
    """

    # Device selection (GPU when available)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Stratified split of the modeling data into train and eval
    train_df, eval_df = stratified_train_test_split_split(train_eval_df,label_col="label_index")

    # Convert to datasets
    dataset_train = make_torch_dataset(train_df, "text", "label_index", tokenizer_512)
    dataset_eval = make_torch_dataset(eval_df, "text", "label_index", tokenizer_512)
    dataset_test = make_torch_dataset(test_df, "text", "label_index", tokenizer_512)

    # Build the dataloaders (seed fixed beforehand for reproducibility)
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(1)
    dl_train = DataLoader(dataset_train,batch_size = batch_size)
    dl_eval = DataLoader(dataset_eval,batch_size = batch_size)
    dl_test = DataLoader(dataset_test,batch_size = batch_size)

    # Build the model (the number of output labels is hard-coded to 9)
    net = BertForSequenceClassification.from_pretrained(pretrained_model, num_labels=9)
    net.train()

    # Freeze everything, then unfreeze only the last encoder layer and the classifier head
    for param in net.parameters():
        param.requires_grad = False
    for param in net.bert.encoder.layer[-1].parameters():
        param.requires_grad = True
    for param in net.classifier.parameters():
        param.requires_grad = True

    # Optimizer: separate learning rates for the two trainable parts
    optimizer = optim.Adam([
        {'params': net.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
        {'params': net.classifier.parameters(), 'lr': 1e-4}
    ])

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Training; num_epochs is forwarded so that the argument actually takes effect
    net_trained = train_model(net, dl_train, dl_eval, device, criterion, optimizer, n_epochs=num_epochs)

    # Evaluate on the test data
    epoch_acc = valid_test(net_trained, device, dl_test)

    return epoch_acc.item()

# モデリング、検証
train dataは  
*   データ数 = [500, 1000, 2000]  
*   alpha = [0.05, 0.10]  
*   aug_num（コード中の変数名は `num_agg`）= [0, 1, 4, 8, 16]  

の全組み合わせを試すため、for文を回す。  
※回りきらない可能性があるため、データ数は[500, 1000]と[2000]に分けるのがよい

In [9]:
# Parameter settings
batch_size = 16  # read as a global by modeling() when building the DataLoaders
max_length = 512  # read as a global by tokenizer_512() (padding / truncation length)

# Load the test data
test_df = pd.read_pickle("/content/qiita_eda/data/test_df.pkl")
test_df.columns = ["text","label_index"]  # column names that modeling() looks up

# Training-data settings
pickles_path = [
                "/content/qiita_eda/data/train_eval_eda_2000_5_16_gzip.pkl", # dataset created with alpha=0.05
                "/content/qiita_eda/data/train_eval_eda_2000_10_16_gzip.pkl" # dataset created with alpha=0.10
                ]                
alpha_list = [5,10]  # alpha labels, paired positionally with pickles_path in the experiment loop
sampling_list = [500, 1000]  # numbers of texts to sample
# Alternative setting for a separate run (the markdown note above suggests splitting the runs):
# sampling_list = [2000]
num_agg_list = [0,1,4,8,16]  # numbers of augmented variants to sample per run (0 = no EDA)

In [None]:
# Result table: one row per (sampling_n, alpha, num_agg) run
result = pd.DataFrame(columns=["sampling_n","alpha","num_agg","accuracy","size"])

for path, alpha in zip(pickles_path, alpha_list):

    # Load the dataset that was created with this alpha.
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    train_eval_all = pd.read_pickle(path, compression="gzip").reset_index(drop=True)

    unique_text_ids = list(train_eval_all.text_id.unique())
    # aug_id of the original (raw_flg == 1) rows; the first value found is used.
    raw_aug_id = train_eval_all.loc[train_eval_all.raw_flg==1,"aug_id"].unique()[0]

    for sampling_n in sampling_list:

        # Restrict the data to sampling_n randomly chosen texts (fixed seed)
        random.seed(0)
        text_id_list = random.sample(unique_text_ids, sampling_n)
        text_id_sampled = train_eval_all[train_eval_all.text_id.isin(text_id_list)]

        # aug_ids of the augmented (raw_flg == 0) rows; identical for every
        # num_agg, so they are collected once per sample instead of per run.
        aug_ids = list(text_id_sampled.loc[text_id_sampled.raw_flg==0,"aug_id"].unique())

        for num_agg in num_agg_list:

            # num_agg=0 (no EDA) is run only once, for the first alpha
            if alpha != alpha_list[0] and num_agg == 0:
                continue

            # Pick num_agg augmented variants (fixed seed) plus the original texts
            random.seed(0)
            aug_id_list = random.sample(aug_ids, num_agg)
            aug_id_list.append(raw_aug_id)  # add the original texts
            # Independent copy, so that any in-place change made downstream
            # neither touches nor warns about the slice of text_id_sampled.
            train_eval_df = text_id_sampled.loc[
                text_id_sampled.aug_id.isin(aug_id_list), ["text", "label_index"]
            ].copy()

            # Train and compute the test accuracy
            acc = modeling(train_eval_df, test_df)

            # Store the result and show the table so far
            idx = len(result)
            result.loc[idx] = [sampling_n, alpha, num_agg, acc, len(train_eval_df)]
            display(result)