<a href="https://colab.research.google.com/github/t-nakatani/signate_stu22/blob/main/signate_stu_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[PyTorch Lightning の API を勉強しよう](https://qiita.com/ground0state/items/c1d705ca2ee329cdfae4)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from google.colab import drive
drive.mount('/content/drive')
ROOT = "/content/drive/MyDrive/Colab Notebooks/signate/"

#学習用データと評価用データの読み込み
train = pd.read_csv(os.path.join(ROOT, "train_cleaned.csv"))
test = pd.read_csv(os.path.join(ROOT, "test_cleaned.csv"))

jobdic = dict(zip([1, 2, 3, 4], ['DataScientist', 'ML Engineer', 'Software Engineer', 'Consultant']))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -q transformers==3
!pip install -q pytorch-lightning

In [3]:
import numpy as np
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from transformers import RobertaTokenizerFast, RobertaModel
from torch import optim
from torch import cuda
import time
from matplotlib import pyplot as plt
from tqdm import tqdm

In [4]:
cfg = {
    'base_model': 'roberta-base',
    'num_class': 4,
    'drop_rate': 0.1,
    'max_length': 400,
    'batch_size': 16,
    'epochs': 8,
    'learning_rate': 1e-06,
    'device': 'cuda' if cuda.is_available() else 'cpu'}

In [5]:
# Datasetの定義
class CreateDataset(Dataset):
  def __init__(self, X, y, tokenizer, max_len):
    self.X = X
    self.y = y
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):  # len(Dataset)で返す値を指定
    return len(self.y)

  def __getitem__(self, index):  # Dataset[index]で返す値を指定
    text = self.X[index]
    inputs = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      pad_to_max_length=True,
      truncation=True
    )
    ids = inputs['input_ids']
    mask = inputs['attention_mask']

    return {
      'ids': torch.LongTensor(ids),
      'mask': torch.LongTensor(mask),
      'labels': torch.Tensor(self.y[index])
    }

In [6]:
from sklearn.model_selection import train_test_split

y = pd.get_dummies(train.jobflag).values
# y_test = pd.get_dummies(test.jobflag).values

X_train, X_val, y_train, y_val = train_test_split(train['description'].values, y, test_size=0.2,random_state=109) 

# Datasetの作成
max_len = cfg['max_length']
tokenizer = RobertaTokenizerFast.from_pretrained(cfg['base_model'])
dataset_train = CreateDataset(X_train, y_train, tokenizer, max_len)
dataset_val = CreateDataset(X_val, y_val, tokenizer, max_len)
# dataset_test = CreateDataset(test['description'], [-1]*len(test['description']), tokenizer, max_len)

for var in dataset_train[0]:
  print(f'{var}: {dataset_train[0][var]}')

ids: tensor([    0, 49628,   877,    19,  5154,  1956,   474,  2567,  1521, 10516,
         5497,  5154,  1767, 20420,   877, 10691,  5497,  2390,  3827, 44542,
        35817,   670,  4358,  2417,   258,  1982, 14580,  1313,  2472, 28094,
         6784, 24989,  5941,  5530,     2,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,   

In [8]:
train_cfg = {
    'batch_size': cfg['batch_size'],
    'shuffle': True,
    'num_workers': 1,
    'pin_memory':True
}

val_cfg = {'batch_size': cfg['batch_size'],
    'shuffle': False,
    'num_workers': 1,
    'pin_memory':True
}

training_loader = DataLoader(dataset_train, **train_cfg)
val_loader = DataLoader(dataset_val, **val_cfg)

In [9]:
# BERT分類モデルの定義
class BERTClass(pl.LightningModule):
  def __init__(self, drop_rate, otuput_size):
    super().__init__()
    self.bert = RobertaModel.from_pretrained(cfg['base_model'])
    self.drop = torch.nn.Dropout(drop_rate)
    self.fc = torch.nn.Linear(768, otuput_size)  # BERTの出力に合わせて768次元を指定

  def forward(self, ids, mask):
    _, out = self.bert(ids, attention_mask=mask)
    out = self.fc(self.drop(out))
    return out

  def training_step(self, batch, batch_idx):
        # training_step defined the train loop.
        # It is independent of forward
        ids = batch['ids']
        mask = batch['mask']
        labels = batch['labels']
        outputs = model(ids, mask)

        loss = torch.nn.CrossEntropyLoss(outputs, labels)
        # Logging to TensorBoard by default
        self.log('train_loss', loss)
        return loss

  def configure_optimizers(self):
      optimizer = torch.optim.AdamW(self.parameters(), lr=cfg['learning_rate'])
      return optimizer

In [10]:
def calculate_loss_and_accuracy(model, criterion, loader, device):
  """ 損失・正解率を計算"""
  model.eval()
  loss = 0.0
  total = 0
  correct = 0
  with torch.no_grad():
    for data in tqdm(loader):
      # デバイスの指定
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      labels = data['labels'].to(device)
      outputs = model(ids, mask)

      loss += criterion(outputs, labels).item()

      pred = torch.argmax(outputs, dim=-1).cpu().numpy() # バッチサイズの長さの予測ラベル配列
      labels = torch.argmax(labels, dim=-1).cpu().numpy()  # バッチサイズの長さの正解ラベル配列
      total += len(labels)
      correct += (pred == labels).sum().item()

  return loss / len(loader), correct / total

In [11]:
device = cfg['device']
model = BERTClass(cfg['drop_rate'], cfg['num_class']).to(device)

In [12]:
trainer = pl.Trainer()
trainer.fit(model, training_loader, val_loader)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
Missing logger folder: /content/lightning_logs

  | Name | Type         | Params
--------------------------------------
0 | bert | RobertaModel | 124 M 
1 | drop | Dropout      | 0     
2 | fc   | Linear       | 3.1 K 
--------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.595   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

RuntimeError: ignored

In [None]:
# 正解率の算出
def predict(model, dataset, device):
  # Dataloaderの作成
  loader = DataLoader(dataset, batch_size=8, shuffle=False)

  model.eval()
  total = 0
  preds = []
  with torch.no_grad():
    for data in tqdm(loader):
      # デバイスの指定
      ids = data['ids'].to(device)
      mask = data['mask'].to(device)
      labels = data['labels'].to(device)

      # 順伝播 + 予測値の取得 + 正解数のカウント
      outputs = model.forward(ids, mask)
      pred = torch.argmax(outputs, dim=-1).cpu().numpy()
      preds.append(pred)

  return preds
dataset_test = CreateDataset(test['description'], [0]*len(test['description']), tokenizer, max_len)
model = torch.load(os.path.join(ROOT, 'v3_best.pth'))
preds = predict(model, dataset_test, device)

In [None]:
flatten = [p+1 for arr in preds for p in arr]
df_submit = pd.DataFrame([[id, pred] for id, pred in zip(test.id, flatten)])
df_submit.to_csv(os.path.join(ROOT, 'v3_submit.csv'), index=None, header=None)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
for dataset_, y_ in zip([dataset_train, dataset_val], [y_train, y_val]):
  preds_ = predict(model, dataset_, device)
  preds_ = [p+1 for arr in preds_ for p in arr]
  labels = [np.argmax(y)+1 for y in y_]
  print('\n', confusion_matrix(preds_, labels))
  print("Classification Report: ")
  print(classification_report(labels, preds_))