In [1]:
import copy
import random

import numpy as np
import pandas as pd
import torch
from scipy import stats
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer

from util import calc_accuracy, calc_f1, init_device, load_params
from util.bert import sentence_to_loader

In [2]:
# ランダムシード初期化
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
device = init_device()

GPU available: cuda


In [3]:
# パラメータ読み込み
print("Loading parameters...")
params = load_params("/workspace/amazon_review/config/params_mmd.json")
params["batch_size"] = 4

Loading parameters...
 ja_train_path:	 /workspace/data/dataset_ja_train.json
 ja_dev_path:	 /workspace/data/dataset_ja_dev.json
 ja_test_path:	 /workspace/data/dataset_ja_test.json
 en_train_path:	 /workspace/data/dataset_en_train.json
 en_dev_path:	 /workspace/data/dataset_en_dev.json
 en_test_path:	 /workspace/data/dataset_en_test.json
 ja_vector_path:	 /workspace/amazon_review/weight/japanese_fasttext_vectors.vec
 is_developing:	 True
 source_category:	 home
 target_category:	 wireless
 target_ratio:	 0.5
 lambda:	 0.3
 use_pretrained_vector:	 False
 token_max_length:	 256
 batch_size:	 32
 emb_dim:	 300
 class_num:	 2
 criterion:	 CrossEntropyLoss
 lr:	 1e-05
 optimizer:	 Adam
 epochs:	 10
 trial_count:	 10


In [4]:
# データセット読み込み
train_df = pd.read_json(params["ja_train_path"], orient="record", lines=True)
if params["is_developing"]:
    train_df = train_df.sample(n=10000, random_state=1)
dev_df = pd.read_json(params["ja_dev_path"], orient="record", lines=True)
test_df = pd.read_json(params["ja_test_path"], orient="record", lines=True)

In [5]:
# sourceカテゴリーとtargetカテゴリーを分ける
train_source_df = train_df[train_df["product_category"] == params["source_category"]]
dev_source_df = dev_df[dev_df["product_category"] == params["source_category"]]
test_source_df = test_df[test_df["product_category"] == params["source_category"]]
train_target_df = train_df[train_df["product_category"] == params["target_category"]]
dev_target_df = dev_df[dev_df["product_category"] == params["target_category"]]
test_target_df = test_df[test_df["product_category"] == params["target_category"]]

In [6]:
# クラスラベル設定
for df in [train_source_df, dev_source_df, test_source_df, train_target_df, dev_target_df, test_target_df]:
    # 3以上かを予測する場合
    df["class"] = 0
    df["class"][df["stars"] > 3] = 1

    # 5クラス分類する場合
    # df["class"] = df["stars"] - 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [7]:
# トークン化
model_name = "cl-tohoku/bert-base-japanese-v2"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [8]:
# dataloader作成
train_source_dataloader = sentence_to_loader(
    train_source_df.review_body.values,
    train_source_df["class"].values,
    tokenizer,
    params["batch_size"],
    shuffle=True,
)
dev_source_dataloader = sentence_to_loader(
    dev_source_df.review_body.values, dev_source_df["class"].values, tokenizer, params["batch_size"], shuffle=False
)
# test_source_dataloader = sentence_to_loader(
#     test_source_df.review_body.values,
#     test_source_df["class"].values,
#     tokenizer,
#     params["batch_size"],
#     shuffle=False,
# )
train_target_dataloader = sentence_to_loader(
    train_target_df.review_body.values,
    train_target_df["class"].values,
    tokenizer,
    params["batch_size"],
    shuffle=True,
)
# dev_target_dataloader = sentence_to_loader(
#     dev_target_df.review_body.values, dev_target_df["class"].values, tokenizer, params["batch_size"], shuffle=False
# )
test_target_dataloader = sentence_to_loader(
    test_target_df.review_body.values,
    test_target_df["class"].values,
    tokenizer,
    params["batch_size"],
    shuffle=False,
)

In [9]:
# BERTモデル構築
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=params["class_num"],
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32768, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
# 最適化とスケジューラー
# 論文で推奨されているハイパーパラメータを使用
optimizer = torch.optim.AdamW(model.parameters(), lr=6e-6, eps=1e-8)
epochs = 3

In [11]:
# 訓練
for epoch in range(epochs):
    print(f"\n======== Epoch {epoch+1} / {epochs} ========\nTraining")

    total_train_loss = 0
    model.train()

    for step, (input_id_batch, input_mask_batch, label_batch) in tqdm(
        enumerate(train_source_dataloader), total=len(train_source_dataloader)
    ):
        input_id_batch = input_id_batch.to(device).to(torch.int64)
        input_mask_batch = input_mask_batch.to(device).to(torch.int64)
        label_batch = label_batch.to(device).to(torch.int64)

        model.zero_grad()
        result = model(input_id_batch, token_type_ids=None, attention_mask=input_mask_batch, labels=label_batch)
        total_train_loss += result.loss.item()
        result.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_source_dataloader)
    print(f"\n\tAverage training loss: {avg_train_loss:.2f}")

    # 検証データに対する予測
    print("\nRunning Validation")
    total_dev_loss = 0
    total_dev_accuracy = 0
    total_dev_f1 = 0
    model.eval()

    for step, (input_id_batch, input_mask_batch, label_batch) in tqdm(
        enumerate(dev_source_dataloader), total=len(dev_source_dataloader)
    ):
        input_id_batch = input_id_batch.to(device).to(torch.int64)
        input_mask_batch = input_mask_batch.to(device).to(torch.int64)
        label_batch = label_batch.to(device).to(torch.int64)

        with torch.no_grad():
            result = model(input_id_batch, token_type_ids=None, attention_mask=input_mask_batch, labels=label_batch)

        total_dev_loss += result.loss.item()
        logit_array = result.logits.detach().cpu().numpy()
        label_array = label_batch.cpu().numpy()
        total_dev_accuracy += calc_accuracy(label_array, logit_array)
        total_dev_f1 += calc_f1(label_array, logit_array)

    avg_dev_loss = total_dev_loss / len(dev_source_dataloader)
    print(f"\tDev Loss: {avg_dev_loss:.3f}")

    avg_dev_accuracy = total_dev_accuracy / len(dev_source_dataloader)
    print(f"\tAccuracy: {avg_dev_accuracy:.3f}")

    avg_dev_f1 = total_dev_f1 / len(dev_source_dataloader)
    print(f"\tF1: {avg_dev_f1:.3f}")

  0%|          | 0/229 [00:00<?, ?it/s]


Training


100%|██████████| 229/229 [01:34<00:00,  2.42it/s]
  1%|          | 1/112 [00:00<00:15,  7.02it/s]


	Average training loss: 0.68

Running Validation


100%|██████████| 112/112 [00:13<00:00,  8.14it/s]
  0%|          | 0/229 [00:00<?, ?it/s]

	Dev Loss: 0.644
	Accuracy: 63.393
	F1: 56.939

Training


100%|██████████| 229/229 [01:34<00:00,  2.42it/s]
  1%|          | 1/112 [00:00<00:16,  6.83it/s]


	Average training loss: 0.56

Running Validation


100%|██████████| 112/112 [00:13<00:00,  8.09it/s]
  0%|          | 0/229 [00:00<?, ?it/s]

	Dev Loss: 0.618
	Accuracy: 70.759
	F1: 55.944

Training


100%|██████████| 229/229 [01:34<00:00,  2.42it/s]
  1%|          | 1/112 [00:00<00:15,  7.07it/s]


	Average training loss: 0.45

Running Validation


100%|██████████| 112/112 [00:13<00:00,  8.11it/s]

	Dev Loss: 0.804
	Accuracy: 72.991
	F1: 62.730





In [12]:
# ブートストラップで複数回実行する
print("\ntargetでFineTuning開始")
# 事前学習したモデルを保持
# メモリを共有しないためにdeepcopyを使用する
model_pretrained = copy.deepcopy(model.cpu())


targetでFineTuning開始


In [13]:
params["target_ratio"] = [0.01, 0.05, 0.1, 0.3, 0.5]

for target_ratio in params["target_ratio"]:
    print("------------------------------")
    print(f"target_ratio = {target_ratio}")
    print("------------------------------")

    accuracy_list = []
    f1_list = []

    for count in range(params["trial_count"]):
        print(f"\n{count+1}回目の試行")

        # targetでFineTuningする準備
        # target_ratioで指定した比率までtargetのデータ数を減らす
        source_num = train_source_df.shape[0]
        target_num = int(source_num * target_ratio)
        if target_num > train_target_df.shape[0]:
            print("Target ratio is too large.")
            exit()
        train_target_df_sample = train_target_df.sample(target_num, replace=False)
        print(f"Source num: {source_num}, Target num: {target_num}")

        # targetのデータローダー作成
        train_target_dataloader = sentence_to_loader(
            train_target_df_sample.review_body.values,
            train_target_df_sample["class"].values,
            tokenizer,
            params["batch_size"],
            shuffle=True,
        )

        # 事前学習したモデルをロード
        model = copy.deepcopy(model_pretrained).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=6e-6, eps=1e-8)

        # targetでFineTuning
        for epoch in range(epochs):
            print(f"======== Epoch {epoch+1} / {epochs} ========")

            total_train_loss = 0
            model.train()

            for step, (input_id_batch, input_mask_batch, label_batch) in enumerate(train_target_dataloader):
                input_id_batch = input_id_batch.to(device).to(torch.int64)
                input_mask_batch = input_mask_batch.to(device).to(torch.int64)
                label_batch = label_batch.to(device).to(torch.int64)

                model.zero_grad()
                result = model(
                    input_id_batch, token_type_ids=None, attention_mask=input_mask_batch, labels=label_batch
                )
                total_train_loss += result.loss.item()
                result.loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()

            avg_train_loss = total_train_loss / len(train_target_dataloader)
            print(f"Training Target Loss: {avg_train_loss:.2f}")

        # テスト
        total_test_loss = 0
        total_test_accuracy = 0
        total_test_f1 = 0
        model.eval()

        for step, (input_id_batch, input_mask_batch, label_batch) in enumerate(test_target_dataloader):
            input_id_batch = input_id_batch.to(device).to(torch.int64)
            input_mask_batch = input_mask_batch.to(device).to(torch.int64)
            label_batch = label_batch.to(device).to(torch.int64)

            with torch.no_grad():
                result = model(
                    input_id_batch, token_type_ids=None, attention_mask=input_mask_batch, labels=label_batch
                )

            total_test_loss += result.loss.item()
            logit_array = result.logits.detach().cpu().numpy()
            label_array = label_batch.cpu().numpy()
            total_test_accuracy += calc_accuracy(label_array, logit_array)
            total_test_f1 += calc_f1(label_array, logit_array)

        avg_test_loss = total_test_loss / len(test_target_dataloader)
        print(f"\nTest Target Loss: {avg_test_loss:.2f}")

        avg_test_accuracy = total_test_accuracy / len(test_target_dataloader)
        accuracy_list.append(avg_test_accuracy)
        print(f"Test Target Accuracy: {avg_test_accuracy:.2f}")

        avg_test_f1 = total_test_f1 / len(test_target_dataloader)
        f1_list.append(avg_test_f1)
        print(f"Test Target F1: {avg_test_f1:.2f}")

    accuracy_interval = stats.t.interval(
        alpha=0.95, df=len(accuracy_list) - 1, loc=np.mean(accuracy_list), scale=stats.sem(accuracy_list)
    )
    f1_interval = stats.t.interval(alpha=0.95, df=len(f1_list) - 1, loc=np.mean(f1_list), scale=stats.sem(f1_list))
    print("\n\t\tMean, Std, 95% interval (bottom, up)")
    print(
        f"Accuracy\t{np.mean(accuracy_list):.2f}, {np.std(accuracy_list, ddof=1):.2f}, {accuracy_interval[0]:.2f}, {accuracy_interval[1]:.2f}"
    )
    print(
        f"F1 Score\t{np.mean(f1_list):.2f}, {np.std(f1_list, ddof=1):.2f}, {f1_interval[0]:.2f}, {f1_interval[1]:.2f}"
    )

------------------------------
target_ratio = 0.01
------------------------------

1回目の試行
Source num: 915, Target num: 9
Training Target Loss: 1.22
Training Target Loss: 0.18
Training Target Loss: 0.13

Test Target Loss: 0.99
Test Target Accuracy: 60.54
Test Target F1: 45.68

2回目の試行
Source num: 915, Target num: 9
Training Target Loss: 1.04
Training Target Loss: 0.35
Training Target Loss: 0.20

Test Target Loss: 0.78
Test Target Accuracy: 67.89
Test Target F1: 51.50

3回目の試行
Source num: 915, Target num: 9
Training Target Loss: 1.84
Training Target Loss: 0.38
Training Target Loss: 0.24

Test Target Loss: 1.00
Test Target Accuracy: 59.31
Test Target F1: 45.96

4回目の試行
Source num: 915, Target num: 9
Training Target Loss: 0.70
Training Target Loss: 0.78
Training Target Loss: 0.16

Test Target Loss: 0.87
Test Target Accuracy: 66.18
Test Target F1: 49.84

5回目の試行
Source num: 915, Target num: 9
Training Target Loss: 0.06
Training Target Loss: 0.03
Training Target Loss: 0.02

Test Target Loss: 0.9

KeyboardInterrupt: 