In [1]:
import re
import spacy
import pandas as pd
import torch
from torch.utils import data
from torch import nn
from knock81 import MyRNN


df = pd.read_csv("./id.csv")

vocab_dim = df["ID"].max()

nlp = spacy.load("en_core_web_sm")

def search_id(word: str) -> int:
  try:
    return df.loc[[word]]["ID"].values[0]
  except:
    return 0


def tokenizer(title: str):
  words = []

  code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
  text = code_regex.sub('', title.rstrip().lower())

  doc = nlp(text)

  for token in doc:
    id = search_id(token.text)
    words.append(id)

  return words


# 重み初期化関数の定義
def init_weights(m):
  if isinstance(m, nn.RNN):
    for name, param in m.named_parameters():
      if 'weight_ih' in name:
        nn.init.xavier_uniform_(param.data)
      elif 'weight_hh' in name:
        nn.init.orthogonal_(param.data)
      elif 'bias' in name:
        nn.init.constant_(param.data, 0)


class NewsDataset(data.Dataset):
  """
  newsのDatasetクラス
  
  Attributes
  ----------------------------
  X : データフレーム
      単語ベクトルの平均をまとめたテンソル
  y : テンソル
      カテゴリをラベル化したテンソル
  phase : 'train' or 'val'
      学習か訓練かを設定する
  """
  def __init__(self, X, y, phase='train'):
    self.X = X['TITLE']
    self.y = y
    self.phase = phase
  
  def __len__(self):
    """全データサイズを返す"""
    return len(self.y)
  
  def __getitem__(self, idx):
    """idxに対応するテンソル形式のデータとラベルを取得"""
    inputs = torch.tensor(tokenizer(self.X[idx]))
    return inputs, self.y[idx]


def category_to_label(category: str):
  if 'b' in category:
    return 0
  elif 't' in category:
    return 1
  elif 'e' in category:
    return 2
  elif 'm' in category:
    return 3
  else:
    return -1


data_dir = "../chapter06"
train = pd.read_csv(f"{data_dir}/train.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
valid = pd.read_csv(f"{data_dir}/valid.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
test = pd.read_csv(f"{data_dir}/test.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])

train_Y = torch.tensor(train["CATEGORY"].map(category_to_label).values)
valid_Y = torch.tensor(valid["CATEGORY"].map(category_to_label).values)
test_Y = torch.tensor(test["CATEGORY"].map(category_to_label).values)

train_dataset = NewsDataset(train, train_Y, phase='train')
valid_dataset = NewsDataset(valid, valid_Y, phase='val')
test_dataset = NewsDataset(test, test_Y, phase='val')

model = MyRNN(300, 50, vocab_dim, 4)
model.apply(init_weights)


MyRNN(
  (embeddings): Embedding(8193, 300)
  (rnn): RNN(300, 50)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)

In [2]:
def collate_fn(batch):
  sequences = [x[0] for x in batch]
  labels = torch.LongTensor([x[1] for x in batch])
  x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
  return x, labels

In [3]:
batch_size = 64

train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = data.DataLoader( test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

dataloader = {
  "train": train_dataloader,
  "val": valid_dataloader,
  "test": test_dataloader,
}

# 動作確認
batch_iter = iter(dataloader['train'])
inputs, labels = next(batch_iter)
print(inputs)
print(labels)

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
tensor([1, 0, 1, 1, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 2, 0, 2, 2, 2, 3, 1, 0, 0, 2,
        0, 2, 2, 2, 0, 2, 0, 3, 0, 0, 0, 2, 0, 3, 0, 2, 0, 0, 3, 1, 2, 0, 0, 0,
        2, 2, 2, 3, 0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2])


In [4]:
from torch import optim

model.train()
loss_fn = nn.CrossEntropyLoss()
op = optim.SGD(model.parameters(), lr=0.1)

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
  print(torch.cuda.get_device_name())

print("使用デバイス:", device)
model.to(device)

使用デバイス: cpu


MyRNN(
  (embeddings): Embedding(8193, 300)
  (rnn): RNN(300, 50)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)

In [7]:
from tqdm import tqdm

epochs = 5

for i in range(epochs):
  print("------")
  print(f"Epoch {i}/{epochs}")
  
  for phase in ["train", "val"]:
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    if phase == "train":
      model.train()
    else:
      model.eval()

    for data, label in tqdm(dataloader[phase]):
      data = data.to(device)
      lebel = label.to(device)
      op.zero_grad()

      with torch.set_grad_enabled(phase == "train"):
        outputs = model(data)
        loss = loss_fn(outputs, label)
        _, pred = torch.max(outputs, 1)

        if phase == "train":
          loss.backward()
          op.step()

        epoch_loss += loss.item() * data.size(0)
        epoch_acc += torch.sum(pred == label.data)
    
    size = len(dataloader[phase].dataset)
    print(f"train loss: {epoch_loss / size}, acc: {epoch_acc / size}")

------
Epoch 0/5


100%|██████████| 167/167 [03:32<00:00,  1.27s/it]


train loss: 1.2674920687912912, acc: 0.4154810905456543


100%|██████████| 21/21 [00:25<00:00,  1.19s/it]


train loss: 1.2667445543997302, acc: 0.40793412923812866
------
Epoch 1/5


100%|██████████| 167/167 [03:27<00:00,  1.24s/it]


train loss: 1.260049956026081, acc: 0.42231374979019165


100%|██████████| 21/21 [00:27<00:00,  1.30s/it]


train loss: 1.2650611165040981, acc: 0.40793412923812866
------
Epoch 2/5


100%|██████████| 167/167 [03:49<00:00,  1.38s/it]


train loss: 1.2604771079485089, acc: 0.41688504815101624


100%|██████████| 21/21 [00:23<00:00,  1.11s/it]


train loss: 1.261752892397121, acc: 0.40793412923812866
------
Epoch 3/5


100%|██████████| 167/167 [03:20<00:00,  1.20s/it]


train loss: 1.260044274996033, acc: 0.41201797127723694


100%|██████████| 21/21 [00:25<00:00,  1.22s/it]


train loss: 1.2629908780137935, acc: 0.40793412923812866
------
Epoch 4/5


100%|██████████| 167/167 [03:32<00:00,  1.27s/it]


train loss: 1.2600767748581698, acc: 0.4166978597640991


100%|██████████| 21/21 [00:24<00:00,  1.14s/it]

train loss: 1.260904187927703, acc: 0.40793412923812866



