In [1]:
# Print the current working directory, then change into the chapter directory.
# The %cd target is a relative path, so it resolves against the directory shown by !pwd.
!pwd
%cd drive/MyDrive/nlp100/chapter09

/content
/content/drive/MyDrive/nlp100/chapter09


In [2]:
# knock50

import pandas as pd
from sklearn.model_selection import train_test_split
# Fixed seed for both splits so that train/valid/test (and everything derived
# from them, such as the vocabulary) are identical on every run.
RANDOM_STATE = 42

# FORMAT: ID \t TITLE \t URL \t PUBLISHER \t CATEGORY \t STORY \t HOSTNAME \t TIMESTAMP
df = pd.read_csv("newsCorpora.csv", sep="\t", header=None, names=["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"])

# Keep only the articles from the selected publishers
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
df = df[df['PUBLISHER'].isin(publishers)]
# Keep only the TITLE and CATEGORY columns
df = df[["TITLE", "CATEGORY"]]

# Shuffle and split: 80% train, then the held-out 20% is halved into test / valid
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=RANDOM_STATE)
test, valid = train_test_split(test, test_size=0.5, shuffle=True, random_state=RANDOM_STATE)

# Save each split as a tab-separated file without index or header row
train.to_csv("train.txt", sep="\t", index=False, header=False)
valid.to_csv("valid.txt", sep="\t", index=False, header=False)
test.to_csv("test.txt", sep="\t", index=False, header=False)

# Report the per-category counts of each split
print("train\n", train["CATEGORY"].value_counts())
print("valid\n", valid["CATEGORY"].value_counts())
print("test\n", test["CATEGORY"].value_counts())

train
 b    4456
e    4253
t    1227
m     736
Name: CATEGORY, dtype: int64
valid
 b    591
e    496
t    153
m     94
Name: CATEGORY, dtype: int64
test
 b    580
e    530
t    144
m     80
Name: CATEGORY, dtype: int64


In [3]:
# knock80
from collections import defaultdict
import string

# Translation table that turns every ASCII punctuation character into a space
table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

# Count how often each whitespace-separated word occurs in the training titles
d = defaultdict(int)  # missing keys start at 0
for title in train['TITLE']:
  for token in title.translate(table).split():
    d[token] += 1

# (word, count) pairs, most frequent first
d = sorted(d.items(), key=lambda item: item[1], reverse=True)

# Assign IDs by frequency rank (1 = most frequent); words seen only once get no ID
id_dict = {}
for rank, (token, freq) in enumerate(d, start=1):
  if freq > 1:
    id_dict[token] = rank

# Built once instead of on every call: maps each ASCII punctuation character to a space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

def tokenizer(text, id_dict=id_dict, unk=0):
  """Convert a text into a list of word IDs.

  Punctuation is replaced by spaces, the text is split on whitespace, and each
  word is looked up in ``id_dict``.

  Args:
    text: String to tokenize.
    id_dict: Mapping from word to integer ID. The default is the module-level
      ``id_dict`` as it existed when this function was defined.
    unk: ID returned for words that are not keys of ``id_dict``.

  Returns:
    List of integer IDs, one per word (empty if the text contains no words).
  """
  # dict.get avoids a KeyError for out-of-vocabulary words and yields `unk` instead
  return [id_dict.get(word, unk) for word in text.translate(_PUNCT_TO_SPACE).split()]

# Sanity check: show the second training title and its ID sequence
text = train['TITLE'].iloc[1]
print(text, tokenizer(text), sep='\n')

Spanish Bonds Rise With Italy's as Month-High Yield Lures Buyers
[1159, 351, 207, 21, 2088, 2, 6, 213, 161, 2089, 5200, 3545]


In [21]:
# knock81
import torch
from torch import nn
from torch.utils.data import Dataset
# RNNの作成
# モデルの構築
import random
import torch
from torch import nn
import torch.utils.data as data

class RNN(nn.Module):
    """Sequence classifier: embedding -> LSTM -> linear layer on the last time step.

    Despite the class name, the recurrent layer is an ``nn.LSTM``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Embedding dimension.
        padding_idx: ID passed to ``nn.Embedding`` as ``padding_idx``.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of output logits.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, emb_size, padding_idx, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        # num_layers was previously accepted but never forwarded, so it had no effect.
        self.rnn = nn.LSTM(emb_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0=None):
        """Return logits for a batch of ID sequences.

        Args:
            x: Integer tensor of token IDs, indexed as (batch, time).
            h0: Optional initial state handed straight to ``nn.LSTM``
                (``None`` lets the LSTM use its default initial state).

        Returns:
            Tensor of logits with one row per batch element.
        """
        x = self.emb(x)
        x, _ = self.rnn(x, h0)
        # Classify from the output at the final time step only
        x = x[:, -1, :]
        logits = self.fc(x)
        return logits

# Hyperparameters
EMB_SIZE = 300
HIDDEN_SIZE = 50
OUTPUT_SIZE = 4
NUM_LAYERS = 1
PADDING_IDX = len(set(id_dict.values())) + 1  # first ID after the dictionary IDs
VOCAB_SIZE = PADDING_IDX + 1  # dictionary IDs + unknown + padding ID

# Build the model
model = RNN(
    vocab_size=VOCAB_SIZE,
    emb_size=EMB_SIZE,
    padding_idx=PADDING_IDX,
    hidden_size=HIDDEN_SIZE,
    output_size=OUTPUT_SIZE,
    num_layers=NUM_LAYERS,
)

# Run the untrained model on the second training title as a batch of one
text = train['TITLE'].iloc[1]
x = torch.tensor([tokenizer(text)], dtype=torch.int64)
print(x)
print(x.size())
print(torch.softmax(model(x), dim=-1))

tensor([[4228,   56,  196,  192,    1,  235,  367,    1, 2343,  298]])
torch.Size([1, 10])
tensor([[0.2786, 0.2446, 0.2233, 0.2534]], grad_fn=<SoftmaxBackward0>)


In [23]:
# Peek at the TITLE column of the training split
print(train.loc[:, 'TITLE'])

406661    Nasa's 2020 Mars Rover mission revealed: A dev...
47892             NYT CEO: We Have to Get Back to Ad Growth
11856     Keith Richards' Children's Book Inspired By Hi...
299697    Pregnant Kourtney Kardashian flashes her midri...
405264    UPDATE 2-Kerry presses India on global trade d...
                                ...                        
300896    UPDATE 1-Britain's cost agency not ready to ba...
316902    White House: no change to US policy on crude o...
363818    Russia's Lavrov to talk South Stream pipeline ...
401048    COLUMN-Fed to widen Main St/Wall St gap: James...
137983    Home > Prince > Prince Re-signs With Warner Br...
Name: TITLE, Length: 10672, dtype: object


In [24]:
# knock82

# Map each category letter to an integer class label and wrap the result as a tensor
category_dict = dict(b=0, t=1, e=2, m=3)
Y_train, Y_valid, Y_test = (
    torch.from_numpy(split['CATEGORY'].map(category_dict).values)
    for split in (train, valid, test)
)

class NewsDataset(data.Dataset):
    """Dataset yielding ``(token_ids, label)`` pairs for news titles.

    Args:
        X: Object whose ``'TITLE'`` entry exposes ``.values`` (e.g. a DataFrame
            with a TITLE column).
        y: Indexable collection of labels, aligned with the titles by position.
        phase: Stored on the instance; not read anywhere in this class.
    """

    def __init__(self, X, y, phase='train'):
        self.X = X['TITLE']
        self.y = y
        self.phase = phase

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the tokenized title and the label at position ``idx``."""
        # Explicit dtype: without it a title that tokenizes to an empty list
        # would become a float32 tensor, which nn.Embedding cannot index with.
        inputs = torch.tensor(tokenizer(self.X.values[idx]), dtype=torch.int64)
        return inputs, self.y[idx]

# One dataset per split
train_dataset = NewsDataset(train, Y_train, phase='train')
valid_dataset = NewsDataset(valid, Y_valid, phase='val')
test_dataset = NewsDataset(test, Y_test, phase='val')

# Show the ID-tensor size and the label of one sample from each split
idx = 0
for dataset in (train_dataset, valid_dataset, test_dataset):
    sample_inputs, sample_label = dataset[idx]
    print(sample_inputs.size())
    print(sample_label)

torch.Size([13])
tensor(1)
torch.Size([10])
tensor(2)
torch.Size([11])
tensor(2)


In [4]:
# knock81
import torch
from torch import nn

class RNN(nn.Module):
  """Sequence classifier: embedding -> tanh RNN -> linear layer on the last time step."""

  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='tanh', batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    """Return logits of size (batch_size, output_size) for a batch of ID sequences."""
    # The batch size is remembered on the instance because init_hidden reads it
    self.batch_size = x.size(0)
    h0 = self.init_hidden(x.device)  # all-zero initial hidden state
    embedded = self.emb(x)  # (batch_size, seq_len, emb_size)
    states, _ = self.rnn(embedded, h0)  # (batch_size, seq_len, hidden_size)
    last_state = states[:, -1, :]
    return self.fc(last_state)  # (batch_size, output_size)

  def init_hidden(self, device):
    """Create a zero tensor of size (1, batch_size, hidden_size) on ``device``."""
    return torch.zeros((1, self.batch_size, self.hidden_size), device=device)

from torch.utils.data import Dataset

class CreateDataset(Dataset):
  """Dataset pairing texts with labels; items are dicts of int64 tensors."""

  def __init__(self, X, y, tokenizer):
    self.X, self.y, self.tokenizer = X, y, tokenizer

  def __len__(self):
    """Value returned by len(dataset): the number of labels."""
    return len(self.y)

  def __getitem__(self, index):
    """Value returned by dataset[index]: tokenized text and label as tensors."""
    token_ids = self.tokenizer(self.X.values[index])
    sample = {}
    sample['inputs'] = torch.tensor(token_ids, dtype=torch.int64)
    sample['labels'] = torch.tensor(self.y[index], dtype=torch.int64)
    return sample

# Integer label arrays; an unexpected category letter raises KeyError
category_dict = dict(b=0, t=1, e=2, m=3)
y_train, y_valid, y_test = (
  split['CATEGORY'].map(lambda letter: category_dict[letter]).values
  for split in (train, valid, test)
)

# One Dataset per split, all sharing the same tokenizer
dataset_train, dataset_valid, dataset_test = (
  CreateDataset(split['TITLE'], labels, tokenizer)
  for split, labels in ((train, y_train), (valid, y_valid), (test, y_test))
)

# Hyperparameters.
# id_dict assigns IDs 1..N and tokenizer returns 0 for unknown words, so IDs 0..N
# are all in use. The padding ID therefore has to be N + 1 and the embedding table
# needs N + 2 rows; using N as the padding ID would make the word with ID N share
# the padding row of the embedding.
VOCAB_SIZE = len(set(id_dict.values())) + 2  # dictionary IDs + unknown + padding ID
EMB_SIZE = 300
PADDING_IDX = len(set(id_dict.values())) + 1
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50

# Build the model
model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE)

# Class probabilities predicted by the untrained model for the first 10 training samples
for i in range(10):
  X = dataset_train[i]['inputs']
  # unsqueeze(0) adds the batch dimension expected by the model
  print(torch.softmax(model(X.unsqueeze(0)), dim=-1))

tensor([[0.1756, 0.2869, 0.3298, 0.2077]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2283, 0.1276, 0.2835, 0.3606]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1995, 0.3004, 0.2377, 0.2624]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2422, 0.1170, 0.3871, 0.2537]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3368, 0.3066, 0.1375, 0.2191]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2198, 0.2599, 0.1523, 0.3680]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3548, 0.1676, 0.1060, 0.3715]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3118, 0.2248, 0.2427, 0.2206]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2766, 0.2314, 0.2090, 0.2830]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3157, 0.2440, 0.2217, 0.2186]], grad_fn=<SoftmaxBackward0>)


In [None]:
# knock82

from torch.utils.data import DataLoader
import time
from torch import optim

def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
  """Compute the average loss and the accuracy of ``model`` over ``dataset``.

  The dataset is iterated one sample at a time, in order, with gradients
  disabled. This function does not switch the model to evaluation mode; the
  caller is expected to do so if needed.

  Args:
    model: Callable mapping a batch of inputs to logits.
    dataset: Dataset whose items are dicts with 'inputs' and 'labels' tensors.
    device: Device the tensors are moved to before the forward pass.
    criterion: Loss function called as ``criterion(outputs, labels)``. When it
      is None no loss is accumulated and the returned loss is 0.0.

  Returns:
    Tuple ``(loss, accuracy)``: the summed per-sample loss divided by
    ``len(dataset)``, and the fraction of samples predicted correctly.
  """
  dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
  loss = 0.0
  total = 0
  correct = 0
  with torch.no_grad():
    for batch in dataloader:
      # Move the sample to the target device
      inputs = batch['inputs'].to(device)
      labels = batch['labels'].to(device)

      # Forward pass
      outputs = model(inputs)

      # Accumulate the loss only when a criterion was supplied
      if criterion is not None:
        loss += criterion(outputs, labels).item()

      # Count correct predictions
      pred = torch.argmax(outputs, dim=-1)
      total += len(inputs)
      correct += (pred == labels).sum().item()

  return loss / len(dataset), correct / total


def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=None, device=None):
  """Train ``model`` and return per-epoch loss/accuracy logs.

  After every epoch the model is evaluated on both datasets, a checkpoint is
  written to ``checkpoint{epoch + 1}.pt`` in the current directory, and a log
  line is printed. Training stops early once the validation loss has failed to
  decrease for three consecutive epochs.

  Args:
    dataset_train: Training dataset; items are dicts with 'inputs' and 'labels'.
    dataset_valid: Validation dataset with the same item format.
    batch_size: Mini-batch size of the training DataLoader.
    model: Model to train; moved to ``device`` in place.
    criterion: Loss function called as ``criterion(outputs, labels)``.
    optimizer: Optimizer that updates the model parameters.
    num_epochs: Maximum number of epochs; also the period given to the cosine
      learning-rate scheduler.
    collate_fn: Optional collate function for the training DataLoader.
    device: Device used for the model and the batches.

  Returns:
    Dict with keys 'train' and 'valid', each a list of ``[loss, accuracy]``
    pairs, one per completed epoch.
  """
  # Move the model to the target device
  model.to(device)

  # Training DataLoader (evaluation builds its own loaders in calculate_loss_and_accuracy)
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

  # Cosine annealing of the learning rate down to 1e-5
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs, eta_min=1e-5, last_epoch=-1)

  log_train = []
  log_valid = []
  for epoch in range(num_epochs):
    # Record the start time of the epoch
    s_time = time.time()

    # Training mode
    model.train()
    for batch in dataloader_train:
      # Reset the gradients
      optimizer.zero_grad()

      # Forward pass + backpropagation + parameter update
      inputs = batch['inputs'].to(device)
      labels = batch['labels'].to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    # Evaluation mode
    model.eval()

    # Loss and accuracy on both datasets
    loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
    loss_valid, acc_valid = calculate_loss_and_accuracy(model, dataset_valid, device, criterion=criterion)
    log_train.append([loss_train, acc_train])
    log_valid.append([loss_valid, acc_valid])

    # Save a checkpoint for this epoch
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')

    e_time = time.time()
    # Print the log line
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}, {(e_time - s_time):.4f}sec')

    # Stop when the validation loss has not decreased for three consecutive epochs
    if epoch > 2 and log_valid[epoch - 3][0] <= log_valid[epoch - 2][0] <= log_valid[epoch - 1][0] <= log_valid[epoch][0]:
      break

    # Advance the scheduler by one step
    scheduler.step()

  return {'train': log_train, 'valid': log_valid}

import numpy as np
from matplotlib import pyplot as plt

def visualize_logs(log):
  """Plot train/valid loss (left panel) and accuracy (right panel) per epoch.

  ``log`` is a dict with keys 'train' and 'valid', each a sequence of
  [loss, accuracy] pairs.
  """
  fig, ax = plt.subplots(1, 2, figsize=(15, 5))
  # Column 0 of each log entry is the loss, column 1 the accuracy
  for column, metric in enumerate(('loss', 'accuracy')):
    panel = ax[column]
    for split in ('train', 'valid'):
      panel.plot(np.array(log[split]).T[column], label=split)
    panel.set_xlabel('epoch')
    panel.set_ylabel(metric)
    panel.legend()
  plt.show()

# Hyperparameters.
# id_dict assigns IDs 1..N and tokenizer returns 0 for unknown words, so IDs 0..N
# are all in use. The padding ID therefore has to be N + 1 and the embedding table
# needs N + 2 rows; using N as the padding ID would make the word with ID N share
# the padding row of the embedding.
VOCAB_SIZE = len(set(id_dict.values())) + 2  # dictionary IDs + unknown + padding ID
EMB_SIZE = 300
PADDING_IDX = len(set(id_dict.values())) + 1
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
LEARNING_RATE = 1e-3
BATCH_SIZE = 1
NUM_EPOCHS = 10

# Build the model
model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE)

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Train the model
log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS)

epoch: 1, loss_train: 1.1119, accuracy_train: 0.5183, loss_valid: 1.1287, accuracy_valid: 0.4948, 42.0904sec
