# モデルの初期化

In [1]:
import pandas as pd

df = pd.read_csv("./id.csv")

vocab_dim = df["ID"].max()
vocab_dim

8193

In [2]:
from torch import nn

# 重み初期化関数の定義
def init_weights(m):
  if isinstance(m, nn.RNN):
    for name, param in m.named_parameters():
      if 'weight_ih' in name:
        nn.init.xavier_uniform_(param.data)
      elif 'weight_hh' in name:
        nn.init.orthogonal_(param.data)
      elif 'bias' in name:
        nn.init.constant_(param.data, 0)

In [3]:
from knock81 import MyRNN

model = MyRNN(300, 50, vocab_dim, 4)
model.apply(init_weights)

MyRNN(
  (embeddings): Embedding(8193, 300)
  (rnn): RNN(300, 50)
  (fc): Linear(in_features=50, out_features=4, bias=True)
)

# 学習データの作成

## id変換器の定義（from knock80）

In [4]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")

def search_id(word: str) -> int:
  try:
    return df.loc[[word]]["ID"].values[0]
  except:
    return 0

def tokenizer(title: str):
  words = []

  code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
  text = code_regex.sub('', title.rstrip().lower())

  doc = nlp(text)

  for token in doc:
    id = search_id(token.text)
    words.append(id)

  return words

## データセットの作成

In [5]:
import torch
from torch.utils import data

class NewsDataset(data.Dataset):
  """
  newsのDatasetクラス
  
  Attributes
  ----------------------------
  X : データフレーム
      単語ベクトルの平均をまとめたテンソル
  y : テンソル
      カテゴリをラベル化したテンソル
  phase : 'train' or 'val'
      学習か訓練かを設定する
  """
  def __init__(self, X, y, phase='train'):
    self.X = X['TITLE']
    self.y = y
    self.phase = phase
  
  def __len__(self):
    """全データサイズを返す"""
    return len(self.y)
  
  def __getitem__(self, idx):
    """idxに対応するテンソル形式のデータとラベルを取得"""
    inputs = torch.tensor(tokenizer(self.X[idx]))
    return inputs, self.y[idx]


In [6]:
import pandas as pd

data_dir = "../chapter06"
train = pd.read_csv(f"{data_dir}/train.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
valid = pd.read_csv(f"{data_dir}/valid.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
test = pd.read_csv(f"{data_dir}/test.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])

In [7]:
def category_to_label(category: str):
  if 'b' in category:
    return 0
  elif 't' in category:
    return 1
  elif 'e' in category:
    return 2
  elif 'm' in category:
    return 3
  else:
    return -1

train_Y = torch.tensor(train["CATEGORY"].map(category_to_label).values)
valid_Y = torch.tensor(valid["CATEGORY"].map(category_to_label).values)
test_Y = torch.tensor(test["CATEGORY"].map(category_to_label).values)

In [8]:
train_dataset = NewsDataset(train, train_Y, phase='train')
valid_dataset = NewsDataset(valid, valid_Y, phase='val')
test_dataset = NewsDataset(test, test_Y, phase='val')

In [9]:
torch.manual_seed(1)

train_dataloader = data.DataLoader(train_dataset, batch_size=1, shuffle=True)
valid_dataloader = data.DataLoader(valid_dataset, batch_size=1, shuffle=False)
test_dataloader = data.DataLoader(test_dataset, batch_size=1, shuffle=False)

dataloader = {
  "train": train_dataloader,
  "val": valid_dataloader,
  "test": test_dataloader,
}

In [10]:
from torch import optim

model.train()
loss_fn = nn.CrossEntropyLoss()
op = optim.SGD(model.parameters(), lr=0.1)

In [11]:
from tqdm import tqdm

epochs = 5

for i in range(epochs):
  print("------")
  print(f"Epoch {i}/{epochs}")
  
  for phase in ["train", "val"]:
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    if phase == "train":
      model.train()
    else:
      model.eval()

    for data, label in tqdm(dataloader[phase]):
      op.zero_grad()

      with torch.set_grad_enabled(phase == "train"):
        outputs = model(data)
        loss = loss_fn(outputs, label)
        _, pred = torch.max(outputs, 1)

        if phase == "train":
          loss.backward()
          op.step()

        epoch_loss += loss.item() * data.size(0)
        epoch_acc += torch.sum(pred == label.data)
    
    size = len(dataloader[phase].dataset)
    print(f"train loss: {epoch_loss / size}, acc: {epoch_acc / size}")


------
Epoch 0/5


100%|██████████| 10684/10684 [05:31<00:00, 32.26it/s]


train loss: 1.2876507903263392, acc: 0.40630850195884705


100%|██████████| 1336/1336 [00:35<00:00, 38.01it/s]


train loss: 1.2706913043728132, acc: 0.40793412923812866
------
Epoch 1/5


100%|██████████| 10684/10684 [05:01<00:00, 35.39it/s]


train loss: 1.2698354373282594, acc: 0.41220515966415405


100%|██████████| 1336/1336 [00:33<00:00, 40.03it/s]


train loss: 1.2614200067912746, acc: 0.40793412923812866
------
Epoch 2/5


100%|██████████| 10684/10684 [05:09<00:00, 34.55it/s]


train loss: 1.2834007936564333, acc: 0.40359416604042053


100%|██████████| 1336/1336 [00:35<00:00, 38.09it/s]


train loss: 1.2925025453556798, acc: 0.40793412923812866
------
Epoch 3/5


100%|██████████| 10684/10684 [04:50<00:00, 36.83it/s]


train loss: 1.2666718505911505, acc: 0.41894420981407166


100%|██████████| 1336/1336 [00:29<00:00, 45.01it/s]


train loss: 1.2828105116586486, acc: 0.40793412923812866
------
Epoch 4/5


100%|██████████| 10684/10684 [04:37<00:00, 38.50it/s]


train loss: 1.2665421485186783, acc: 0.40846124291419983


100%|██████████| 1336/1336 [00:29<00:00, 44.68it/s]

train loss: 1.260601889765905, acc: 0.40793412923812866





In [12]:
torch.save(model.state_dict(), "knock82.model.pt")