<a href="https://colab.research.google.com/github/t-nakatani/signate_stu22/blob/main/signate_stu_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[KaggleタイタニックのNameだけで予測精度80%超えた話(BERT/TF2.0)](https://qiita.com/pocokhc/items/56273f40f57679f25341)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from google.colab import drive
drive.mount('/content/drive')
ROOT = "/content/drive/MyDrive/Colab Notebooks/signate/"

#学習用データと評価用データの読み込み
train = pd.read_csv(os.path.join(ROOT, "train_cleaned.csv"))
test = pd.read_csv(os.path.join(ROOT, "test_cleaned.csv"))

jobdic = dict(zip([1, 2, 3, 4], ['DataScientist', 'ML Engineer', 'Software Engineer', 'Consultant']))

Mounted at /content/drive


In [2]:
!pip install -q transformers==3

[K     |████████████████████████████████| 754 kB 7.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 42.5 MB/s 
[K     |████████████████████████████████| 3.0 MB 35.7 MB/s 
[K     |████████████████████████████████| 880 kB 53.8 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [3]:
import numpy as np
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaModel
from torch import optim
from torch import cuda
import time
from matplotlib import pyplot as plt
from tqdm import tqdm

In [4]:
# Datasetの定義
class CreateDataset(Dataset):
  def __init__(self, X, y, tokenizer, max_len):
    self.X = X
    self.y = y
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):  # len(Dataset)で返す値を指定
    return len(self.X)

  def __getitem__(self, index):  # Dataset[index]で返す値を指定
    text = self.X[index]
    inputs = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      pad_to_max_length=True,
      truncation=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']
    labels = self.y[index]

    return {
      'ids': torch.LongTensor(ids),
      'mask': torch.LongTensor(mask),
      'labels': torch.Tensor(labels)
    }

In [5]:
# BERT分類モデルの定義
class BERTClass(torch.nn.Module):
  def __init__(self, drop_rate, otuput_size):
    super().__init__()
    self.bert = RobertaModel.from_pretrained('roberta-base')
    self.drop = torch.nn.Dropout(drop_rate)
    self.fc = torch.nn.Linear(768, otuput_size)  # BERTの出力に合わせて768次元を指定

  def forward(self, ids, mask):
    _, out = self.bert(ids, attention_mask=mask)
    out = self.fc(self.drop(out))
    return out

In [6]:
def calculate_loss_and_accuracy(model, criterion, loader, device):
  """ 損失・正解率を計算"""
  model.eval()
  loss = 0.0
  total = 0
  correct = 0
  with torch.no_grad():
    for data in tqdm(loader):

      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      labels = data['labels'].to(device)


      outputs = model(ids, mask)
      loss += criterion(outputs, labels).item()

      pred = torch.argmax(outputs, dim=-1).cpu().numpy() # バッチサイズの長さの予測ラベル配列
      labels = torch.argmax(labels, dim=-1).cpu().numpy()  # バッチサイズの長さの正解ラベル配列
      total += len(labels)
      correct += (pred == labels).sum().item()

  return loss / len(loader), correct / total

In [7]:
DROP_RATE = 0.1
OUTPUT_SIZE = 4
BATCH_SIZE = 16
NUM_EPOCHS = 6
LEARNING_RATE = 1e-6
VERSION = 'v5'

In [8]:
import sklearn.metrics
def train_model(
        df_train,       # 学習用のデータ
        text_column,    # 対象のカラム名
        target_column,  # 目的変数のカラム名
        df_valid=None,  # 検証用データ
        df_val_test=None,       # 予測用データ
        model_file_prefix="",  # 保存時のファイル名識別子
        epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
    ):
  
    # model
    model_path = "{}_{}.pth".format(model_file_prefix, VERSION)
    model = BERTClass(DROP_RATE, OUTPUT_SIZE)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)
    device = 'cuda' if cuda.is_available() else 'cpu'

    # dataset
    df_val_test[target_column].fillna(0)

    max_len = 400
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    dataset_train = CreateDataset(df_train[text_column].values, pd.get_dummies(df_train[target_column]).values, tokenizer, max_len)
    dataset_val = CreateDataset(df_valid[text_column].values, pd.get_dummies(df_valid[target_column]).values, tokenizer, max_len)
    dataset_test = CreateDataset(df_val_test[text_column].values, pd.get_dummies(df_val_test[target_column]).values, tokenizer, max_len)

    # train
    print('\n================  start train  ================')
    best_acc = 0
    model = BERTClass(DROP_RATE, OUTPUT_SIZE).to(device)

    dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
    dataloader_valid = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=True)
    dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)


    log_train = []
    log_valid = []
    for epoch in range(NUM_EPOCHS):
      s_time = time.time() # 開始時刻の記録
      model.train()

      for data in tqdm(dataloader_train):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        labels = data['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(ids, mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

      loss_train, acc_train = calculate_loss_and_accuracy(model, criterion, dataloader_train, device)
      loss_valid, acc_valid = calculate_loss_and_accuracy(model, criterion, dataloader_valid, device)
      log_train.append([loss_train, acc_train])
      log_valid.append([loss_valid, acc_valid])
      if best_acc < acc_valid:
        best_acc = acc_valid
        torch.save(model, os.path.join(ROOT, model_path))
        print(f'best-model saved (epoch: {epoch + 1})')

      e_time = time.time() # 終了時刻の記録
      print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f} ({(e_time - s_time):.4f}sec)')
    metric = log_valid
    print('================  end train  ================\n')

    # predict
    model = torch.load(os.path.join(ROOT, model_path))
    model.eval()
    pred_y_list = []
    emb_list = []
    with torch.no_grad():
      for data in tqdm(dataloader_test):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        labels = data['labels'].to(device)

        output = model.forward(ids, mask)
        pred = torch.argmax(output, dim=-1).cpu().numpy()

        pred_y_list.extend(pred)
        emb_list.extend(output.cpu().numpy())
    return metric, pred_y_list, emb_list

In [9]:
import sklearn.model_selection

text_column, target_column, n_splits = ('description', 'jobflag', 3)

df = pd.concat([train, test], ignore_index=True, sort=False)
df_train = df[df[target_column].notnull()]
df_test = df[df[target_column].isnull()]

df_train_idx = df_train.index

# store result
df_pred = pd.DataFrame(df.index, columns=["index"]).set_index("index")
df_emb = pd.DataFrame(df.index, columns=["index"]).set_index("index")
df_emb_pred = None
metric_list = []

# cross validation
kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1234)
for i, (train_idx, test_idx) in enumerate(kf.split(df_train, df_train[target_column])):
  df_train_sub = df_train.iloc[train_idx]
  df_test_sub = df_train.iloc[test_idx]

  df_val_test = pd.concat([df_test_sub, df_test], ignore_index=True, sort=False)

  model_file_prefix = "cv_{}".format(i)

  # train
  metric, pred_y_list, emb_list = train_model(
      df_train=df_train_sub, 
      text_column=text_column,
      target_column=target_column, 
      df_valid=df_test_sub,
      df_val_test=df_val_test,
      model_file_prefix=model_file_prefix,
  )
  metric_list.append(metric)
  
  result_name = "result_{}".format(i)
  df_pred.loc[df_train_idx[test_idx], result_name] = pred_y_list[:len(test_idx)]
  df_pred.loc[df_test.index, result_name] = pred_y_list[len(test_idx):]

  a = pd.DataFrame(emb_list[:len(test_idx)], index=df_train_idx[test_idx])
  df_emb = df_emb.combine_first(a)

  if df_emb_pred is None:
      df_emb_pred = pd.DataFrame(emb_list[len(test_idx):], index=df_test.index)
  else:
      df_emb_pred += emb_list[len(test_idx):]


  pred_y = df_pred.mean(axis=1)

  df_emb_pred /= n_splits
  df_emb = df_emb.combine_first(df_emb_pred)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]




100%|██████████| 64/64 [00:42<00:00,  1.52it/s]
100%|██████████| 64/64 [00:13<00:00,  4.63it/s]
100%|██████████| 32/32 [00:06<00:00,  4.61it/s]


best-model saved (epoch: 1)
epoch: 1, loss_train: 1.4137, accuracy_train: 0.0832, loss_valid: 1.4127, accuracy_valid: 0.0771 (72.8189sec)


100%|██████████| 64/64 [00:39<00:00,  1.62it/s]
100%|██████████| 64/64 [00:13<00:00,  4.63it/s]
100%|██████████| 32/32 [00:06<00:00,  4.63it/s]


epoch: 2, loss_train: 1.4127, accuracy_train: 0.0832, loss_valid: 1.4133, accuracy_valid: 0.0771 (60.2023sec)


 53%|█████▎    | 34/64 [00:21<00:18,  1.60it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x7fd2e28e7550>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tqdm/std.py", line 1210, in __iter__
    self.close()
  File "/usr/local/lib/python3.7/dist-packages/tqdm/std.py", line 1316, in close
    self.display(pos=0)
  File "/usr/local/lib/python3.7/dist-packages/tqdm/std.py", line 1509, in display
    self.sp(self.__str__() if msg is None else msg)
  File "/usr/local/lib/python3.7/dist-packages/tqdm/std.py", line 350, in print_status
    fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0)))
  File "/usr/local/lib/python3.7/dist-packages/tqdm/std.py", line 344, in fp_write
    fp_flush()
  File "/usr/local/lib/python3.7/dist-packages/tqdm/utils.py", line 145, in inner
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/iostream.py", line 341, in flush
    self.pub_thread.schedule(self._flush)
  File "/us

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-cbdc6c6d5df6>", line 34, in <module>
    model_file_prefix=model_file_prefix,
  File "<ipython-input-8-bec056adf2db>", line 52, in train_model
    outputs = model(ids, mask)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "<ipython-input-5-cd20ed7c89a7>", line 10, in forward
    _, out = self.bert(ids, attention_mask=mask)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/transformers/modeling_bert.py", line 753, in forward
    input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds


KeyboardInterrupt: ignored

In [None]:
df_emb_pred += pd.DataFrame(emb_list[len(test_idx):])

In [None]:
df_emb_pred.dropna(how='any')

In [None]:
len(emb_list[len(test_idx):])

In [None]:
127*16

In [None]:
152*16

In [10]:
from sklearn.model_selection import train_test_split

y = pd.get_dummies(train.jobflag).values
# y_test = pd.get_dummies(test.jobflag).values

X_train, X_val, y_train, y_val = train_test_split(train['description'].values, y, test_size=0.2,random_state=109) 

# Datasetの作成
max_len = 400
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
dataset_train = CreateDataset(X_train, y_train, tokenizer, max_len)
dataset_val = CreateDataset(X_val, y_val, tokenizer, max_len)
dataset_test = CreateDataset(test['description'], [-1]*len(test['description']), tokenizer, max_len)

for var in dataset_train[0]:
  print(f'{var}: {dataset_train[0][var]}')

ids: tensor([    0, 49628,   877,    19,  5154,  1956,   474,  2567,  1521, 10516,
         5497,  5154,  1767, 20420,   877, 10691,  5497,  2390,  3827, 44542,
        35817,   670,  4358,  2417,   258,  1982, 14580,  1313,  2472, 28094,
         6784, 24989,  5941,  5530,     2,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,   

In [11]:
# パラメータの設定
DROP_RATE = 0.1
OUTPUT_SIZE = 4
BATCH_SIZE = 16
NUM_EPOCHS = 10
LEARNING_RATE = 1e-6

# モデルの定義
model = BERTClass(DROP_RATE, OUTPUT_SIZE)

# 損失関数の定義
criterion = torch.nn.CrossEntropyLoss()

# オプティマイザの定義
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# デバイスの指定
device = 'cuda' if cuda.is_available() else 'cpu'

In [15]:
def train__():
  best_acc = 0
  model.to(device)

  # dataloaderの作成
  dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)
  dataloader_valid = DataLoader(dataset_val, batch_size=BATCH_SIZE, shuffle=False)

  # 学習
  log_train = []
  log_valid = []
  for epoch in range(NUM_EPOCHS):
    # 開始時刻の記録
    s_time = time.time()


    model.train()
    for data in tqdm(dataloader_train):
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      labels = data['labels'].to(device)

      optimizer.zero_grad()

      outputs = model(ids, mask)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    # 損失と正解率の算出
    loss_train, acc_train = calculate_loss_and_accuracy(model, criterion, dataloader_train, device)
    loss_valid, acc_valid = calculate_loss_and_accuracy(model, criterion, dataloader_valid, device)
    log_train.append([loss_train, acc_train])
    log_valid.append([loss_valid, acc_valid])
    if best_acc < acc_valid:
      best_acc = acc_valid
      
      weight_name = "v5_test.pth"
      torch.save(model, os.path.join(ROOT, weight_name))
      print(f'best-model saved (epoch: {epoch + 1})')

    # 終了時刻の記録
    e_time = time.time()

    # ログを出力
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}, {(e_time - s_time):.4f}sec') 


In [16]:
train__()

100%|██████████| 76/76 [00:49<00:00,  1.53it/s]
100%|██████████| 76/76 [00:16<00:00,  4.56it/s]
100%|██████████| 19/19 [00:04<00:00,  4.57it/s]


best-model saved (epoch: 1)
epoch: 1, loss_train: 0.8114, accuracy_train: 0.7153, loss_valid: 0.8362, accuracy_valid: 0.6875, 72.8426sec


100%|██████████| 76/76 [00:49<00:00,  1.53it/s]
100%|██████████| 76/76 [00:16<00:00,  4.60it/s]
100%|██████████| 19/19 [00:04<00:00,  4.57it/s]


best-model saved (epoch: 2)
epoch: 2, loss_train: 0.7294, accuracy_train: 0.7467, loss_valid: 0.7842, accuracy_valid: 0.7072, 72.3325sec


100%|██████████| 76/76 [00:49<00:00,  1.53it/s]
100%|██████████| 76/76 [00:16<00:00,  4.60it/s]
100%|██████████| 19/19 [00:04<00:00,  4.57it/s]


best-model saved (epoch: 3)
epoch: 3, loss_train: 0.6673, accuracy_train: 0.7665, loss_valid: 0.7554, accuracy_valid: 0.7105, 72.3127sec


  3%|▎         | 2/76 [00:01<01:11,  1.04it/s]


KeyboardInterrupt: ignored