In [1]:
import re
import spacy
import pandas as pd
import torch
from torch import nn

df = pd.read_csv("./id.csv", index_col=0)
vocab_dim = df["ID"].max()

nlp = spacy.load("en_core_web_sm")

def search_id(word: str) -> int:
  try:
    return df.loc[[word]]["ID"].values[0]
  except:
    return 0

def tokenizer(title: str):
  words = []

  code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
  text = code_regex.sub('', title.rstrip().lower())

  doc = nlp(text)

  for token in doc:
    id = search_id(token.text)
    words.append(id)

  return words


# 重み初期化関数の定義
def init_weights(m):
  if isinstance(m, nn.RNN):
    for name, param in m.named_parameters():
      if 'weight_ih' in name:
        nn.init.xavier_uniform_(param.data)
      elif 'weight_hh' in name:
        nn.init.orthogonal_(param.data)
      elif 'bias' in name:
        nn.init.constant_(param.data, 0)

In [2]:
import os
import numpy as np
from dotenv import load_dotenv
from gensim.models import KeyedVectors

load_dotenv()
FILE_DIR = os.getenv('FILE_DIR')
model = KeyedVectors.load_word2vec_format(f"{FILE_DIR}/GoogleNews-vectors-negative300.bin", binary=True)

VOCAB_SIZE = vocab_dim + 2
EMB_SIZE = 300

weights = np.zeros((VOCAB_SIZE, EMB_SIZE))
words_in_pretrained = 0

weights.shape

target = df[df["COUNT"] > 1].index.tolist()

for i, word in enumerate(target):
  try:
    weights[i] = model[word]
    words_in_pretrained += 1
  except KeyError:
    weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))
  
weights = torch.from_numpy(weights.astype((np.float32)))

print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')

学習済みベクトル利用単語数: 6790 / 8195


In [3]:
from torch.utils import data

class NewsDataset(data.Dataset):
  """
  newsのDatasetクラス
  
  Attributes
  ----------------------------
  X : データフレーム
    単語ベクトルの平均をまとめたテンソル
  y : テンソル
    カテゴリをラベル化したテンソル
  phase : 'train' or 'val'
    学習か訓練かを設定する
  """
  
  def __init__(self, X, y, phase='train'):
    self.X = X['TITLE']
    self.y = y
    self.phase = phase
  
  def __len__(self):
    """全データサイズを返す"""
    return len(self.y)
  
  def __getitem__(self, idx):
    """idxに対応するテンソル形式のデータとラベルを取得"""
    inputs = torch.tensor(tokenizer(self.X[idx]))
    return inputs, self.y[idx]

def category_to_label(category: str):
  if 'b' in category:
    return 0
  elif 't' in category:
    return 1
  elif 'e' in category:
    return 2
  elif 'm' in category:
    return 3
  else:
    return -1
  
def collate_fn(batch):
  sequences = [x[0] for x in batch]
  labels = torch.LongTensor([x[1] for x in batch])
  x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
  return x, labels


data_dir = "../chapter06"
train = pd.read_csv(f"{data_dir}/train.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
valid = pd.read_csv(f"{data_dir}/valid.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
test = pd.read_csv(f"{data_dir}/test.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])

train_Y = torch.tensor(train["CATEGORY"].map(category_to_label).values)
valid_Y = torch.tensor(valid["CATEGORY"].map(category_to_label).values)
test_Y = torch.tensor(test["CATEGORY"].map(category_to_label).values)

train_dataset = NewsDataset(train, train_Y, phase='train')
valid_dataset = NewsDataset(valid, valid_Y, phase='val')
test_dataset = NewsDataset(test, test_Y, phase='val')

batch_size = 64

train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

dataloader = {
  "train": train_dataloader,
  "val": valid_dataloader,
  "test": test_dataloader,
}

In [10]:
from torch import optim


class MyRNN(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, output_size, padding_idx, emb_weights, num_layers=3):
    super(MyRNN, self).__init__()

    self.embeddings = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
    self.fc = nn.Linear(hidden_dim * 2, output_size)

  def forward(self, x):
    x = self.embeddings(x)
    x, h_T = self.rnn(x)
    x = x[:, -1, :]
    x = self.fc(x)
    pred = torch.softmax(x, dim=-1)
    return pred
  
PADDING_SIZE = vocab_dim + 1
net = MyRNN(300, 50, 4, PADDING_SIZE, weights)
net.apply(init_weights)

net.train()
loss_fn = nn.CrossEntropyLoss()
op = optim.SGD(net.parameters(), lr=0.1)

In [11]:
from tqdm import tqdm

epochs = 5

for i in range(epochs):
  print("------")
  print(f"Epoch {i}/{epochs}")
  
  for phase in ["train", "val"]:
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    if phase == "train":
      net.train()
    else:
      net.eval()

    for data, label in tqdm(dataloader[phase]):
      op.zero_grad()

      with torch.set_grad_enabled(phase == "train"):
        outputs = net(data)
        loss = loss_fn(outputs, label)
        _, pred = torch.max(outputs, 1)

        if phase == "train":
          loss.backward()
          op.step()

        epoch_loss += loss.item() * data.size(0)
        epoch_acc += torch.sum(pred == label.data)
    
    size = len(dataloader[phase].dataset)
    print(f"train loss: {epoch_loss / size}, acc: {epoch_acc / size}")

------
Epoch 0/5


100%|██████████| 167/167 [01:36<00:00,  1.72it/s]


train loss: 1.2363731310368478, acc: 0.4927929639816284


100%|██████████| 21/21 [00:12<00:00,  1.66it/s]


train loss: 1.196171892617277, acc: 0.535179615020752
------
Epoch 1/5


100%|██████████| 167/167 [01:33<00:00,  1.78it/s]


train loss: 1.1792436732374654, acc: 0.5571883320808411


100%|██████████| 21/21 [00:12<00:00,  1.74it/s]


train loss: 1.1301102681074313, acc: 0.6077844500541687
------
Epoch 2/5


100%|██████████| 167/167 [01:31<00:00,  1.82it/s]


train loss: 1.1452478590372215, acc: 0.5930362939834595


100%|██████████| 21/21 [00:10<00:00,  1.93it/s]


train loss: 1.1118580859578298, acc: 0.6272454857826233
------
Epoch 3/5


100%|██████████| 167/167 [01:31<00:00,  1.83it/s]


train loss: 1.1242155403021523, acc: 0.6134406328201294


100%|██████████| 21/21 [00:10<00:00,  1.92it/s]


train loss: 1.136133408831979, acc: 0.602544903755188
------
Epoch 4/5


100%|██████████| 167/167 [01:30<00:00,  1.84it/s]


train loss: 1.1097684358666784, acc: 0.6298202872276306


100%|██████████| 21/21 [00:10<00:00,  1.94it/s]

train loss: 1.1086364791778747, acc: 0.632485032081604



