In [1]:
import pandas as pd

df = pd.read_csv("./id.csv", index_col=0)
df

Unnamed: 0,COUNT,ID
to,2860,1
in,1911,2
the,1594,3
of,1429,4
for,1345,5
...,...,...
humankind,1,0
sole,1,0
curator,1,0
healthkit,1,0


In [2]:
import re
import spacy
import torch
from torch import nn


nlp = spacy.load("en_core_web_sm")

def search_id(word: str) -> int:
  try:
    return df.loc[[word]]["ID"].values[0]
  except:
    return 0

def tokenizer(title: str):
  words = []

  code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
  text = code_regex.sub('', title.rstrip().lower())

  doc = nlp(text)

  for token in doc:
    id = search_id(token.text)
    words.append(id)

  return words


# 重み初期化関数の定義
def init_weights(m):
  if isinstance(m, nn.RNN):
    for name, param in m.named_parameters():
      if 'weight_ih' in name:
        nn.init.xavier_uniform_(param.data)
      elif 'weight_hh' in name:
        nn.init.orthogonal_(param.data)
      elif 'bias' in name:
        nn.init.constant_(param.data, 0)

In [3]:
import os
from dotenv import load_dotenv
load_dotenv()

FILE_DIR = os.getenv('FILE_DIR')

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format(f"{FILE_DIR}/GoogleNews-vectors-negative300.bin", binary=True)

In [4]:
import numpy as np

vocab_dim = df["ID"].max()

VOCAB_SIZE = vocab_dim + 2
EMB_SIZE = 300

weights = np.zeros((VOCAB_SIZE, EMB_SIZE))
words_in_pretrained = 0

weights.shape

target = df[df["COUNT"] > 1].index.tolist()

for i, word in enumerate(target):
  try:
    weights[i] = model[word]
    words_in_pretrained += 1
  except KeyError:
    weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))
  
weights = torch.from_numpy(weights.astype((np.float32)))

print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')
print(weights.size())

学習済みベクトル利用単語数: 6790 / 8195
torch.Size([8195, 300])


In [7]:
from torch.utils import data

class NewsDataset(data.Dataset):
  """
  newsのDatasetクラス
  
  Attributes
  ----------------------------
  X : データフレーム
    単語ベクトルの平均をまとめたテンソル
  y : テンソル
    カテゴリをラベル化したテンソル
  phase : 'train' or 'val'
    学習か訓練かを設定する
  """
  
  def __init__(self, X, y, phase='train'):
    self.X = X['TITLE']
    self.y = y
    self.phase = phase
  
  def __len__(self):
    """全データサイズを返す"""
    return len(self.y)
  
  def __getitem__(self, idx):
    """idxに対応するテンソル形式のデータとラベルを取得"""
    inputs = torch.tensor(tokenizer(self.X[idx]))
    return inputs, self.y[idx]

def category_to_label(category: str):
  if 'b' in category:
    return 0
  elif 't' in category:
    return 1
  elif 'e' in category:
    return 2
  elif 'm' in category:
    return 3
  else:
    return -1
  
def collate_fn(batch):
  sequences = [x[0] for x in batch]
  labels = torch.LongTensor([x[1] for x in batch])
  x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
  return x, labels

In [9]:
data_dir = "../chapter06"
train = pd.read_csv(f"{data_dir}/train.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
valid = pd.read_csv(f"{data_dir}/valid.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
test = pd.read_csv(f"{data_dir}/test.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])

train_Y = torch.tensor(train["CATEGORY"].map(category_to_label).values)
valid_Y = torch.tensor(valid["CATEGORY"].map(category_to_label).values)
test_Y = torch.tensor(test["CATEGORY"].map(category_to_label).values)

train_dataset = NewsDataset(train, train_Y, phase='train')
valid_dataset = NewsDataset(valid, valid_Y, phase='val')
test_dataset = NewsDataset(test, test_Y, phase='val')

batch_size = 64

train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = data.DataLoader( test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

dataloader = {
  "train": train_dataloader,
  "val": valid_dataloader,
  "test": test_dataloader,
}

In [14]:
class MyRNN(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, output_size, padding_idx, emb_weights):
    super(MyRNN, self).__init__()

    self.embeddings = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, output_size)

  def forward(self, x):
    x = self.embeddings(x)
    x, h_T = self.rnn(x)
    x = x[:, -1, :]
    x = self.fc(x)
    pred = torch.softmax(x, dim=-1)
    return pred

In [15]:
PADDING_SIZE = vocab_dim + 1
net = MyRNN(300, 50, 4, PADDING_SIZE, weights)
net.apply(init_weights)

MyRNN(
  (embeddings): Embedding(8195, 300, padding_idx=8194)
  (rnn): RNN(300, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)

In [16]:
from torch import optim

net.train()
loss_fn = nn.CrossEntropyLoss()
op = optim.SGD(net.parameters(), lr=0.1)

In [17]:
from tqdm import tqdm

epochs = 5

for i in range(epochs):
  print("------")
  print(f"Epoch {i}/{epochs}")
  
  for phase in ["train", "val"]:
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    if phase == "train":
      net.train()
    else:
      net.eval()

    for data, label in tqdm(dataloader[phase]):
      op.zero_grad()

      with torch.set_grad_enabled(phase == "train"):
        outputs = net(data)
        loss = loss_fn(outputs, label)
        _, pred = torch.max(outputs, 1)

        if phase == "train":
          loss.backward()
          op.step()

        epoch_loss += loss.item() * data.size(0)
        epoch_acc += torch.sum(pred == label.data)
    
    size = len(dataloader[phase].dataset)
    print(f"train loss: {epoch_loss / size}, acc: {epoch_acc / size}")

------
Epoch 0/5


100%|██████████| 167/167 [01:05<00:00,  2.55it/s]


train loss: 1.2579062035002542, acc: 0.4707038700580597


100%|██████████| 21/21 [00:08<00:00,  2.47it/s]


train loss: 1.2239873473515768, acc: 0.5067365169525146
------
Epoch 1/5


100%|██████████| 167/167 [01:12<00:00,  2.29it/s]


train loss: 1.190047638642123, acc: 0.5436166524887085


100%|██████████| 21/21 [00:08<00:00,  2.43it/s]


train loss: 1.1789815425872803, acc: 0.553892195224762
------
Epoch 2/5


100%|██████████| 167/167 [01:12<00:00,  2.31it/s]


train loss: 1.1660256835683283, acc: 0.569917619228363


100%|██████████| 21/21 [00:09<00:00,  2.29it/s]


train loss: 1.2275004572497157, acc: 0.5194610953330994
------
Epoch 3/5


100%|██████████| 167/167 [01:14<00:00,  2.24it/s]


train loss: 1.1435274648023697, acc: 0.5931299328804016


100%|██████████| 21/21 [00:08<00:00,  2.34it/s]


train loss: 1.1188278248210153, acc: 0.6220059990882874
------
Epoch 4/5


100%|██████████| 167/167 [01:14<00:00,  2.24it/s]


train loss: 1.1214013403607237, acc: 0.6151254177093506


100%|██████████| 21/21 [00:08<00:00,  2.38it/s]

train loss: 1.1148523941725315, acc: 0.6190119981765747



