<a href="https://colab.research.google.com/github/tomonari-masada/course2022-nlp/blob/main/assignment05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 課題5

## (A) fasttextの単語埋め込みを使ったモデル

In [1]:
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# Seed the global NumPy and PyTorch random number generators with one fixed value.
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Directory holding the .npy files read below.
PATH = '/content/drive/MyDrive/2022Courses/nlp/imdb/'

# For each split, read the feature array and the label array from disk.
# Both dictionaries are keyed by the split name ('train' / 'test').
texts, labels = {}, {}
for tag in ('train', 'test'):
  sources = (
    (texts, f'{PATH}{tag}.npy'),
    (labels, f'{PATH}{tag}_labels.npy'),
  )
  for store, filename in sources:
    with open(filename, 'rb') as f:
      store[tag] = np.load(f)

In [3]:
# Replace the entries of both splits with torch tensors built from them
# (torch.tensor copies the data it is given).
texts.update({tag: torch.tensor(texts[tag]) for tag in ('train', 'test')})
labels.update({tag: torch.tensor(labels[tag]) for tag in ('train', 'test')})

In [4]:
class MyDataset(Dataset):
  """Map-style dataset that pairs each row of ``X`` with the matching entry of ``y``.

  Args:
    X: Indexable collection of samples exposing ``.shape[0]`` (e.g. a tensor).
    y: Indexable, sized collection of targets, one per row of ``X``.

  Raises:
    ValueError: If ``X`` and ``y`` do not contain the same number of entries.
  """

  def __init__(self, X, y):
    # Fail fast on misaligned inputs: otherwise a shorter ``y`` only breaks
    # later during iteration, and a longer ``y`` is silently truncated.
    n_samples = X.shape[0]
    if len(y) != n_samples:
      raise ValueError(
        f'X and y must have the same length, got {n_samples} and {len(y)}'
      )
    self.X = X
    self.y = y

  def __len__(self):
    """Return the number of samples (size of the first dimension of ``X``)."""
    return self.X.shape[0]

  def __getitem__(self, index):
    """Return the ``(sample, target)`` pair stored at ``index``."""
    return self.X[index], self.y[index]

In [5]:
# Wrap each split in a dataset object.
train_dataset, test_dataset = (
  MyDataset(texts[tag], labels[tag]) for tag in ('train', 'test')
)

# Hold out one fifth (rounded down) of the training data for validation;
# the remainder is used for training.
n_total = len(train_dataset)
valid_size = n_total // 5
train_size = n_total - valid_size
split_train_, split_valid_ = random_split(
  train_dataset, lengths=[train_size, valid_size]
)

In [6]:
BATCH_SIZE = 128  # number of samples per mini-batch for every loader

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(split_train_, shuffle=True, batch_size=BATCH_SIZE)
valid_loader, test_loader = (
  DataLoader(subset, batch_size=BATCH_SIZE)
  for subset in (split_valid_, test_dataset)
)

### モデルを変更

In [7]:
class TextSentiment(nn.Module):
  """Feed-forward classifier made of four Linear layers, each preceded by a LayerNorm.

  NOTE(review): forward() applies no activation function between the Linear
  layers; LayerNorm is the only non-affine operation. Confirm this is intended.

  Args:
    emsize: Size of the last dimension of the input.
    num_class: Number of values produced per sample by the final layer.
    hdim: Width of the hidden layers.
  """

  def __init__(self, emsize, num_class, hdim=1024):
    super().__init__()
    # Linear layers, listed in the order forward() applies them.
    self.fc_in = nn.Linear(in_features=emsize, out_features=hdim)
    self.fc1 = nn.Linear(in_features=hdim, out_features=hdim)
    self.fc2 = nn.Linear(in_features=hdim, out_features=hdim)
    self.fc_out = nn.Linear(in_features=hdim, out_features=num_class)
    # Each normalization is sized to the input of the Linear layer it feeds.
    self.ln_in = nn.LayerNorm(emsize)
    self.ln1 = nn.LayerNorm(hdim)
    self.ln2 = nn.LayerNorm(hdim)
    self.ln_out = nn.LayerNorm(hdim)

  def forward(self, x):
    """Pass ``x`` through every (LayerNorm, Linear) pair and return the final output."""
    stages = (
      (self.ln_in, self.fc_in),
      (self.ln1, self.fc1),
      (self.ln2, self.fc2),
      (self.ln_out, self.fc_out),
    )
    for norm, linear in stages:
      x = linear(norm(x))
    return x

In [8]:
EMSIZE = texts['train'].shape[1]  # dimensionality of the embedding vectors
NUM_CLASS = np.unique(labels['train']).size  # number of distinct classes

# Build the classifier with 2048-wide hidden layers and move it to the device.
model = TextSentiment(EMSIZE, NUM_CLASS, hdim=2048)
model = model.to(device)

In [9]:
# Cross-entropy loss with its default settings.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, starting at a learning rate of 1e-5.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)
# Multiply the learning rate by gamma once 10 and again once 50 scheduler steps
# have been taken (gamma=0.1 is the library default, spelled out here).
scheduler = torch.optim.lr_scheduler.MultiStepLR(
  optimizer, milestones=[10, 50], gamma=0.1
)

### 訓練を行なう関数
* lossの監視も追加。

In [10]:
def train(dataloader):
  """Run one optimization pass over ``dataloader`` and report its metrics.

  Relies on the module-level ``model``, ``criterion``, ``optimizer`` and
  ``device``.

  Args:
    dataloader: Iterable yielding ``(input, target)`` batches.

  Returns:
    Tuple ``(accuracy, mean_loss)``, each averaged over every sample seen.
  """
  model.train()
  loss_sum, n_correct, n_seen = 0.0, 0.0, 0
  for inputs, targets in dataloader:
    inputs = inputs.to(device)
    targets = targets.to(device)
    logits = model(inputs)
    batch_loss = criterion(logits, targets)
    batch_loss.backward()
    optimizer.step()
    # Clear the gradients right after the update so the next batch starts clean.
    optimizer.zero_grad()
    batch_size = len(targets)
    # Weight the batch loss by the batch size so the final division yields
    # a per-sample average.
    loss_sum += batch_loss.item() * batch_size
    n_correct += (logits.argmax(1) == targets).sum().item()
    n_seen += batch_size  # tally used for the reported averages
  return n_correct / n_seen, loss_sum / n_seen

### 評価を行なう関数
* lossの監視も追加。

In [11]:
def evaluate(dataloader):
  """Measure accuracy and mean loss of ``model`` on ``dataloader`` without training.

  Relies on the module-level ``model``, ``criterion`` and ``device``.

  Args:
    dataloader: Iterable yielding ``(input, target)`` batches.

  Returns:
    Tuple ``(accuracy, mean_loss)``, each averaged over every sample seen.
  """
  model.eval()
  loss_sum, n_correct, n_seen = 0.0, 0.0, 0
  # Gradient tracking is not needed for evaluation, so disable it for the whole pass.
  with torch.no_grad():
    for inputs, targets in dataloader:
      inputs = inputs.to(device)
      targets = targets.to(device)
      logits = model(inputs)
      batch_size = len(targets)
      # Weight the batch loss by the batch size so the final division yields
      # a per-sample average.
      loss_sum += criterion(logits, targets).item() * batch_size
      n_correct += (logits.argmax(1) == targets).sum().item()
      n_seen += batch_size
  return n_correct / n_seen, loss_sum / n_seen

### 訓練と評価の実施

In [12]:
EPOCHS = 100  # number of passes over the training split

# Train for EPOCHS epochs, checkpointing the weights whenever the validation
# loss reaches a new minimum, and log one summary line per epoch.
best_val_loss = float('inf')
for epoch in range(1, EPOCHS + 1):
  epoch_start_time = time.time()
  # Capture the learning rate this epoch actually trains with. Reading it after
  # scheduler.step() would report the NEXT epoch's rate, which is wrong on the
  # epochs where the scheduler decays it.
  epoch_lr = scheduler.get_last_lr()[0]
  train_acc, train_loss = train(train_loader)
  val_acc, val_loss = evaluate(valid_loader)
  if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save(model.state_dict(), "model.pt")
  # Advance the learning-rate schedule once per epoch, after the optimizer updates.
  scheduler.step()
  print(f'epoch {epoch:3d} || '
        f'lr={epoch_lr:.3e} | '
        f'time: {time.time() - epoch_start_time:5.2f}s || '
        f'train loss {train_loss:.4f} | '
        f'train accuracy {train_acc:8.3f} || '
        f'val loss {val_loss:.4f} | '
        f'val accuracy {val_acc:8.3f}'
        )

epoch   1 || lr=1.000e-05 | time:  3.68s || train loss 0.5011 | train accuracy    0.756 || val loss 0.3902 | val accuracy    0.826
epoch   2 || lr=1.000e-05 | time:  1.17s || train loss 0.3826 | train accuracy    0.832 || val loss 0.3533 | val accuracy    0.846
epoch   3 || lr=1.000e-05 | time:  1.16s || train loss 0.3598 | train accuracy    0.844 || val loss 0.3485 | val accuracy    0.852
epoch   4 || lr=1.000e-05 | time:  1.17s || train loss 0.3565 | train accuracy    0.847 || val loss 0.3447 | val accuracy    0.851
epoch   5 || lr=1.000e-05 | time:  1.10s || train loss 0.3561 | train accuracy    0.846 || val loss 0.3499 | val accuracy    0.848
epoch   6 || lr=1.000e-05 | time:  1.09s || train loss 0.3458 | train accuracy    0.851 || val loss 0.3728 | val accuracy    0.838
epoch   7 || lr=1.000e-05 | time:  1.10s || train loss 0.3433 | train accuracy    0.853 || val loss 0.3560 | val accuracy    0.844
epoch   8 || lr=1.000e-05 | time:  1.08s || train loss 0.3422 | train accuracy    0

* ハイパーパラメータのチューニングが済んだら、テストセットで評価する。

In [13]:
# Restore the checkpoint written by the training loop (lowest validation loss).
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU runtime also loads on a CPU-only runtime.
model.load_state_dict(torch.load("model.pt", map_location=device))
# Only the accuracy is reported; the loss returned by evaluate() is discarded.
test_acc, _ = evaluate(test_loader)
print(f'test accuracy {test_acc:8.3f}')

test accuracy    0.857
