In [1]:
import re
import spacy
import pandas as pd
import torch
from torch import nn

df = pd.read_csv("./id.csv", index_col=0)
vocab_dim = df["ID"].max()

nlp = spacy.load("en_core_web_sm")

def search_id(word: str) -> int:
  try:
    return df.loc[[word]]["ID"].values[0]
  except:
    return 0

def tokenizer(title: str):
  words = []

  code_regex = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
  text = code_regex.sub('', title.rstrip().lower())

  doc = nlp(text)

  for token in doc:
    id = search_id(token.text)
    words.append(id)

  return words


# 重み初期化関数の定義
def init_weights(m):
  if isinstance(m, nn.RNN):
    for name, param in m.named_parameters():
      if 'weight_ih' in name:
        nn.init.xavier_uniform_(param.data)
      elif 'weight_hh' in name:
        nn.init.orthogonal_(param.data)
      elif 'bias' in name:
        nn.init.constant_(param.data, 0)

In [2]:
import os
import numpy as np
from dotenv import load_dotenv
from gensim.models import KeyedVectors

load_dotenv()
FILE_DIR = os.getenv('FILE_DIR')
model = KeyedVectors.load_word2vec_format(f"{FILE_DIR}/GoogleNews-vectors-negative300.bin", binary=True)

VOCAB_SIZE = vocab_dim + 2
EMB_SIZE = 300

weights = np.zeros((VOCAB_SIZE, EMB_SIZE))
words_in_pretrained = 0

weights.shape

target = df[df["COUNT"] > 1].index.tolist()

for i, word in enumerate(target):
  try:
    weights[i] = model[word]
    words_in_pretrained += 1
  except KeyError:
    weights[i] = np.random.normal(scale=0.1, size=(EMB_SIZE,))
  
weights = torch.from_numpy(weights.astype((np.float32)))

print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')

学習済みベクトル利用単語数: 6790 / 8195


In [3]:
from torch.utils import data

class NewsDataset(data.Dataset):
  """
  newsのDatasetクラス
  
  Attributes
  ----------------------------
  X : データフレーム
    単語ベクトルの平均をまとめたテンソル
  y : テンソル
    カテゴリをラベル化したテンソル
  phase : 'train' or 'val'
    学習か訓練かを設定する
  """
  
  def __init__(self, X, y, phase='train'):
    self.X = X['TITLE']
    self.y = y
    self.phase = phase
  
  def __len__(self):
    """全データサイズを返す"""
    return len(self.y)
  
  def __getitem__(self, idx):
    """idxに対応するテンソル形式のデータとラベルを取得"""
    inputs = torch.tensor(tokenizer(self.X[idx]))
    return inputs, self.y[idx]

def category_to_label(category: str):
  if 'b' in category:
    return 0
  elif 't' in category:
    return 1
  elif 'e' in category:
    return 2
  elif 'm' in category:
    return 3
  else:
    return -1
  
def collate_fn(batch):
  sequences = [x[0] for x in batch]
  labels = torch.LongTensor([x[1] for x in batch])
  x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
  return x, labels


data_dir = "../chapter06"
train = pd.read_csv(f"{data_dir}/train.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
valid = pd.read_csv(f"{data_dir}/valid.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
test = pd.read_csv(f"{data_dir}/test.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])

train_Y = torch.tensor(train["CATEGORY"].map(category_to_label).values)
valid_Y = torch.tensor(valid["CATEGORY"].map(category_to_label).values)
test_Y = torch.tensor(test["CATEGORY"].map(category_to_label).values)

train_dataset = NewsDataset(train, train_Y, phase='train')
valid_dataset = NewsDataset(valid, valid_Y, phase='val')
test_dataset = NewsDataset(test, test_Y, phase='val')

batch_size = 128

train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

dataloader = {
  "train": train_dataloader,
  "val": valid_dataloader,
  "test": test_dataloader,
}

In [4]:
from torch import optim
from torch.nn import functional as F

class MyCNN(nn.Module):
  def __init__(self, vocab_dim, embedding_dim, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights):
    super(MyCNN, self).__init__()

    self.embeddings = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    self.conv = nn.Conv2d(1, out_channels, (kernel_heights, embedding_dim), stride, (padding, 0))
    self.drop = nn.Dropout(0.4)
    self.fc = nn.Linear(out_channels, output_size)

  def forward(self, x):
    emb = self.embeddings(x).unsqueeze(1)
    conv = self.conv(emb)
    act = F.relu(conv.squeeze(3))
    max_pool = F.max_pool1d(act, act.size()[2])
    logits = self.fc(self.drop(max_pool.squeeze(2)))
    pred = torch.softmax(logits, dim=-1)
    return pred
  
PADDING_SIZE = vocab_dim + 1
net = MyCNN(VOCAB_SIZE, 300, PADDING_SIZE, 4, 500, 2, 1, 1, weights)
net.apply(init_weights)

net.train()
loss_fn = nn.CrossEntropyLoss()
op = optim.Adam(net.parameters(), lr=0.0005)

In [5]:
from tqdm import tqdm

epochs = 30

for i in range(epochs):
  print("------")
  print(f"Epoch {i}/{epochs}")
  
  for phase in ["train", "val"]:
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    if phase == "train":
      net.train()
    else:
      net.eval()

    for data, label in tqdm(dataloader[phase]):
      op.zero_grad()

      with torch.set_grad_enabled(phase == "train"):
        outputs = net(data)
        loss = loss_fn(outputs, label)
        _, pred = torch.max(outputs, 1)

        if phase == "train":
          loss.backward()
          op.step()

        epoch_loss += loss.item() * data.size(0)
        epoch_acc += torch.sum(pred == label.data)
    
    size = len(dataloader[phase].dataset)
    print(f"train loss: {epoch_loss / size}, acc: {epoch_acc / size}")

------
Epoch 0/30


100%|██████████| 84/84 [01:28<00:00,  1.05s/it]


train loss: 1.232096990232207, acc: 0.5476413369178772


100%|██████████| 11/11 [00:10<00:00,  1.06it/s]


train loss: 1.1454628513244811, acc: 0.6729041934013367
------
Epoch 1/30


100%|██████████| 84/84 [01:23<00:00,  1.00it/s]


train loss: 1.0989501796291812, acc: 0.6969299912452698


100%|██████████| 11/11 [00:09<00:00,  1.12it/s]


train loss: 1.0631369023979782, acc: 0.7208083868026733
------
Epoch 2/30


100%|██████████| 84/84 [01:31<00:00,  1.08s/it]


train loss: 1.0430060110035035, acc: 0.734369158744812


100%|██████████| 11/11 [00:11<00:00,  1.00s/it]


train loss: 1.0293077073411314, acc: 0.7410179376602173
------
Epoch 3/30


100%|██████████| 84/84 [01:23<00:00,  1.01it/s]


train loss: 1.0110939258673985, acc: 0.7558966875076294


100%|██████████| 11/11 [00:09<00:00,  1.11it/s]


train loss: 1.0143391764806415, acc: 0.7544910311698914
------
Epoch 4/30


100%|██████████| 84/84 [01:19<00:00,  1.06it/s]


train loss: 0.9910702548282327, acc: 0.7715275287628174


100%|██████████| 11/11 [00:09<00:00,  1.14it/s]


train loss: 0.9990244450683365, acc: 0.7604790329933167
------
Epoch 5/30


100%|██████████| 84/84 [01:24<00:00,  1.00s/it]


train loss: 0.9761441630711943, acc: 0.7828528881072998


100%|██████████| 11/11 [00:10<00:00,  1.07it/s]


train loss: 0.9898803027090198, acc: 0.7679640650749207
------
Epoch 6/30


100%|██████████| 84/84 [01:25<00:00,  1.02s/it]


train loss: 0.9669148511808047, acc: 0.7902470827102661


100%|██████████| 11/11 [00:10<00:00,  1.02it/s]


train loss: 0.9837630027068589, acc: 0.7747005820274353
------
Epoch 7/30


100%|██████████| 84/84 [01:35<00:00,  1.13s/it]


train loss: 0.9573679306184875, acc: 0.7978285551071167


100%|██████████| 11/11 [00:12<00:00,  1.11s/it]


train loss: 0.9778951692010114, acc: 0.779940128326416
------
Epoch 8/30


100%|██████████| 84/84 [01:21<00:00,  1.03it/s]


train loss: 0.9498701494581822, acc: 0.8024148344993591


100%|██████████| 11/11 [00:09<00:00,  1.13it/s]


train loss: 0.9739152110979229, acc: 0.783682644367218
------
Epoch 9/30


100%|██████████| 84/84 [01:19<00:00,  1.06it/s]


train loss: 0.9458998877963785, acc: 0.8055035471916199


100%|██████████| 11/11 [00:09<00:00,  1.16it/s]


train loss: 0.9717406309293416, acc: 0.7791916131973267
------
Epoch 10/30


100%|██████████| 84/84 [01:18<00:00,  1.07it/s]


train loss: 0.9400809556971057, acc: 0.8078435063362122


100%|██████████| 11/11 [00:09<00:00,  1.17it/s]


train loss: 0.9674323707997442, acc: 0.783682644367218
------
Epoch 11/30


100%|██████████| 84/84 [01:18<00:00,  1.08it/s]


train loss: 0.9373065810085727, acc: 0.8107450604438782


100%|██████████| 11/11 [00:09<00:00,  1.15it/s]


train loss: 0.9688203309824367, acc: 0.7806886434555054
------
Epoch 12/30


100%|██████████| 84/84 [01:18<00:00,  1.08it/s]


train loss: 0.9344750976616057, acc: 0.8114938139915466


100%|██████████| 11/11 [00:09<00:00,  1.14it/s]


train loss: 0.964791969981736, acc: 0.783682644367218
------
Epoch 13/30


100%|██████████| 84/84 [01:18<00:00,  1.07it/s]


train loss: 0.9326598091948484, acc: 0.8128042221069336


100%|██████████| 11/11 [00:09<00:00,  1.15it/s]


train loss: 0.9628081810688545, acc: 0.7844311594963074
------
Epoch 14/30


100%|██████████| 84/84 [01:17<00:00,  1.08it/s]


train loss: 0.9308826456530538, acc: 0.8130849599838257


100%|██████████| 11/11 [00:09<00:00,  1.19it/s]


train loss: 0.9618251255886283, acc: 0.783682644367218
------
Epoch 15/30


100%|██████████| 84/84 [01:16<00:00,  1.09it/s]


train loss: 0.929263728537929, acc: 0.8141145706176758


100%|██████████| 11/11 [00:09<00:00,  1.17it/s]


train loss: 0.9607708643296522, acc: 0.7859281301498413
------
Epoch 16/30


100%|██████████| 84/84 [01:18<00:00,  1.07it/s]


train loss: 0.9278063786159245, acc: 0.8147697448730469


100%|██████████| 11/11 [00:09<00:00,  1.13it/s]


train loss: 0.9600454543878932, acc: 0.7859281301498413
------
Epoch 17/30


100%|██████████| 84/84 [01:20<00:00,  1.04it/s]


train loss: 0.9268802691827058, acc: 0.8148633241653442


100%|██████████| 11/11 [00:09<00:00,  1.14it/s]


train loss: 0.9595906745173974, acc: 0.7844311594963074
------
Epoch 18/30


100%|██████████| 84/84 [01:20<00:00,  1.05it/s]


train loss: 0.9263604206121266, acc: 0.8152377605438232


100%|██████████| 11/11 [00:10<00:00,  1.09it/s]


train loss: 0.960204136942675, acc: 0.7844311594963074
------
Epoch 19/30


100%|██████████| 84/84 [01:17<00:00,  1.08it/s]


train loss: 0.9256904291463942, acc: 0.8152377605438232


100%|██████████| 11/11 [00:09<00:00,  1.15it/s]


train loss: 0.9577290165924026, acc: 0.7859281301498413
------
Epoch 20/30


100%|██████████| 84/84 [01:17<00:00,  1.08it/s]


train loss: 0.9233667784663722, acc: 0.8159865140914917


100%|██████████| 11/11 [00:09<00:00,  1.17it/s]


train loss: 0.9661362042684041, acc: 0.7881736755371094
------
Epoch 21/30


100%|██████████| 84/84 [01:17<00:00,  1.08it/s]


train loss: 0.9144492100035818, acc: 0.8292773962020874


100%|██████████| 11/11 [00:09<00:00,  1.17it/s]


train loss: 0.9534327237905856, acc: 0.80613774061203
------
Epoch 22/30


100%|██████████| 84/84 [01:23<00:00,  1.01it/s]


train loss: 0.8992304649303041, acc: 0.8613815307617188


100%|██████████| 11/11 [00:09<00:00,  1.16it/s]


train loss: 0.9509662129208, acc: 0.8166167736053467
------
Epoch 23/30


100%|██████████| 84/84 [01:17<00:00,  1.08it/s]


train loss: 0.8857912787869824, acc: 0.8762635588645935


100%|██████████| 11/11 [00:09<00:00,  1.20it/s]


train loss: 0.9335193940979278, acc: 0.81886225938797
------
Epoch 24/30


100%|██████████| 84/84 [01:28<00:00,  1.05s/it]


train loss: 0.8727286725354882, acc: 0.8900224566459656


100%|██████████| 11/11 [00:11<00:00,  1.07s/it]


train loss: 0.9282348520027663, acc: 0.8293412923812866
------
Epoch 25/30


100%|██████████| 84/84 [01:26<00:00,  1.03s/it]


train loss: 0.8606997380003149, acc: 0.9011605978012085


100%|██████████| 11/11 [00:10<00:00,  1.07it/s]


train loss: 0.9224079433315528, acc: 0.833832323551178
------
Epoch 26/30


100%|██████████| 84/84 [01:23<00:00,  1.01it/s]


train loss: 0.8546328294906488, acc: 0.9054661393165588


100%|██████████| 11/11 [00:10<00:00,  1.10it/s]


train loss: 0.9208123587562652, acc: 0.832335352897644
------
Epoch 27/30


100%|██████████| 84/84 [01:17<00:00,  1.08it/s]


train loss: 0.8478139333848157, acc: 0.9098652005195618


100%|██████████| 11/11 [00:09<00:00,  1.10it/s]


train loss: 0.9134753307182631, acc: 0.8443113565444946
------
Epoch 28/30


100%|██████████| 84/84 [01:26<00:00,  1.03s/it]


train loss: 0.8427537778306659, acc: 0.91407710313797


100%|██████████| 11/11 [00:10<00:00,  1.10it/s]


train loss: 0.9118584016126073, acc: 0.841317355632782
------
Epoch 29/30


100%|██████████| 84/84 [01:27<00:00,  1.04s/it]


train loss: 0.8389262971651327, acc: 0.9155746698379517


100%|██████████| 11/11 [00:09<00:00,  1.14it/s]

train loss: 0.9094903251368128, acc: 0.8368263244628906



