In [1]:
import re
import spacy
import pandas as pd
import torch
from torch import nn

# Vocabulary table read from ./id.csv; the first CSV column becomes the index.
# search_id() looks rows up by index label, so the index is presumably the word itself — confirm.
df = pd.read_csv("./id.csv", index_col=0)
# Largest value in the "ID" column; the embedding-table size is derived from it in a later cell.
vocab_dim = df["ID"].max()

# spaCy English pipeline; tokenizer() uses it to split a title into tokens.
nlp = spacy.load("en_core_web_sm")

def search_id(word: str) -> int:
  """Return the value of the "ID" column of ``df`` for ``word``.

  Parameters
  ----------
  word : str
    Index label to look up in the module-level ``df``.

  Returns
  -------
  int
    The first "ID" value stored under ``word``, or 0 when ``word`` is not
    present in the index.

  Only the missing-label case (``KeyError``) is mapped to 0; any other
  exception, including ``KeyboardInterrupt``, propagates to the caller
  instead of being silently swallowed.
  """
  try:
    return df.loc[[word]]["ID"].values[0]
  except KeyError:
    # Label not in the index: treat the word as unknown.
    return 0

def tokenizer(title: str):
  """Convert a title into a list of vocabulary IDs.

  The title is stripped of trailing whitespace and lower-cased, every
  character matched by the punctuation character class below is removed,
  the remaining text is split into tokens by the module-level ``nlp``
  pipeline, and each token's text is passed to ``search_id``.

  Parameters
  ----------
  title : str
    Raw title text.

  Returns
  -------
  list
    One ``search_id`` result per token, in token order.
  """
  # ASCII and full-width punctuation / bracket characters to delete.
  punctuation = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
  cleaned = punctuation.sub('', title.rstrip().lower())

  return [search_id(token.text) for token in nlp(cleaned)]


# Weight-initialisation function (passed to Module.apply)
def init_weights(m):
  """Re-initialise the parameters of an ``nn.RNN`` module in place.

  Modules of any other type are left untouched. For an ``nn.RNN``:

  * parameters whose name contains ``weight_ih`` get Xavier-uniform values,
  * parameters whose name contains ``weight_hh`` get an orthogonal matrix,
  * parameters whose name contains ``bias`` are set to 0.

  Parameters
  ----------
  m : nn.Module
    Module to inspect.
  """
  if not isinstance(m, nn.RNN):
    return

  for name, param in m.named_parameters():
    tensor = param.data
    if 'weight_ih' in name:
      nn.init.xavier_uniform_(tensor)
    elif 'weight_hh' in name:
      nn.init.orthogonal_(tensor)
    elif 'bias' in name:
      nn.init.constant_(tensor, 0)

In [2]:
import os
import numpy as np
from dotenv import load_dotenv
from gensim.models import KeyedVectors

load_dotenv()
# Directory that holds the pretrained word2vec binary, taken from the environment (.env supported).
FILE_DIR = os.getenv('FILE_DIR')
model = KeyedVectors.load_word2vec_format(f"{FILE_DIR}/GoogleNews-vectors-negative300.bin", binary=True)

# One row per vocabulary ID (0..vocab_dim) plus one extra last row (vocab_dim + 1),
# which the model cell passes to the embedding layer as padding_idx.
VOCAB_SIZE = vocab_dim + 2
EMB_SIZE = 300

# Rows that are never assigned below stay all-zero.
weights = np.zeros((VOCAB_SIZE, EMB_SIZE))
words_in_pretrained = 0

# Words that occur more than once, together with the ID stored for each of them.
frequent = df[df["COUNT"] > 1]
target = frequent.index.tolist()

# The row index must be the word's own ID: tokenizer() turns tokens into IDs via
# search_id(), and the embedding layer uses those IDs as row numbers. Indexing by
# enumerate() position instead misaligns rows whenever IDs do not start at 0 or
# the table is not sorted by ID.
for word, word_id in zip(target, frequent["ID"].tolist()):
  try:
    weights[word_id] = model[word]
    words_in_pretrained += 1
  except KeyError:
    # Word missing from the pretrained vectors: small random initialisation.
    weights[word_id] = np.random.normal(scale=0.1, size=(EMB_SIZE,))

weights = torch.from_numpy(weights.astype((np.float32)))

print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')

学習済みベクトル利用単語数: 6790 / 8195


In [3]:
from torch.utils import data

class NewsDataset(data.Dataset):
  """
  Dataset of news titles and their labels.

  Attributes
  ----------------------------
  X : pandas.Series
    The 'TITLE' column of the data frame passed to the constructor.
  y : tensor
    Labels, indexed by position; its length defines the dataset size.
  phase : 'train' or 'val'
    Marks the split as training or validation. Stored only; no method of
    this class reads it.
  """

  def __init__(self, X, y, phase='train'):
    self.X = X['TITLE']
    self.y = y
    self.phase = phase

  def __len__(self):
    """Return the total number of samples."""
    return len(self.y)

  def __getitem__(self, idx):
    """Return the token-ID tensor and the label at position ``idx``."""
    # Positional access: a plain ``self.X[idx]`` is a label lookup and picks the
    # wrong row (or raises KeyError) when the frame has no default 0..n-1 index.
    title = self.X.iloc[idx]
    inputs = torch.tensor(tokenizer(title))
    return inputs, self.y[idx]

def category_to_label(category: str):
  """Map a category string to an integer class label.

  The markers 'b', 't', 'e' and 'm' are tried in that order using a
  substring test; the first marker contained in ``category`` decides the
  label (0, 1, 2 or 3 respectively).

  Returns
  -------
  int
    The label, or -1 when none of the markers occurs in ``category``.
  """
  for label, marker in enumerate('btem'):
    if marker in category:
      return label
  return -1
  
def collate_fn(batch, padding_value=None):
  """Collate (sequence, label) pairs into one padded batch.

  Parameters
  ----------
  batch : list
    Items of the form ``(sequence_tensor, label)``.
  padding_value : int, optional
    Value written into the padded positions. When omitted, ``vocab_dim + 1``
    is used, i.e. the index the model cell hands to the embedding layer as
    ``padding_idx``. The ``pad_sequence`` default of 0 must not be used here
    because 0 is the ID ``search_id`` returns for unknown words.

  Returns
  -------
  tuple
    ``(x, labels)``: ``x`` holds the sequences padded to the longest one in
    the batch with the batch dimension first; ``labels`` is a LongTensor.
  """
  if padding_value is None:
    padding_value = int(vocab_dim) + 1
  sequences = [item[0] for item in batch]
  labels = torch.LongTensor([item[1] for item in batch])
  x = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=padding_value)
  return x, labels


# Directory containing the train / valid / test split files.
data_dir = "../chapter06"
# Each split is a tab-separated file without a header row: first column category, second column title.
train = pd.read_csv(f"{data_dir}/train.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
valid = pd.read_csv(f"{data_dir}/valid.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])
test = pd.read_csv(f"{data_dir}/test.txt", sep="\t", header=None, names=['CATEGORY', 'TITLE'])

# Integer class labels per split (category_to_label yields -1 for an unrecognised category).
train_Y = torch.tensor(train["CATEGORY"].map(category_to_label).values)
valid_Y = torch.tensor(valid["CATEGORY"].map(category_to_label).values)
test_Y = torch.tensor(test["CATEGORY"].map(category_to_label).values)

# Datasets that tokenize titles on access; the test split is tagged phase='val' as well.
train_dataset = NewsDataset(train, train_Y, phase='train')
valid_dataset = NewsDataset(valid, valid_Y, phase='val')
test_dataset = NewsDataset(test, test_Y, phase='val')

batch_size = 64

# Only the training loader shuffles; collate_fn pads each batch to its longest sequence.
train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Loaders keyed by split name.
dataloader = {
  "train": train_dataloader,
  "val": valid_dataloader,
  "test": test_dataloader,
}

In [5]:
from torch.nn import functional as F

class MyCNN(nn.Module):
  """Single-layer convolutional text classifier over pretrained embeddings.

  Pipeline: embedding lookup -> 2-D convolution spanning the full embedding
  width -> ReLU -> max-pooling over all positions -> dropout (p=0.4) ->
  linear layer -> softmax.
  """

  def __init__(self, vocab_dim, embedding_dim, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights):
    """
    Parameters
    ----------
    vocab_dim : int
      Not read by this class; the table size comes from ``emb_weights``.
    embedding_dim : int
      Width of the convolution kernel (expected to equal the embedding size
      of ``emb_weights``, otherwise ``squeeze(3)`` in forward is a no-op).
    padding_idx : int
      Forwarded to ``nn.Embedding.from_pretrained``.
    output_size : int
      Number of output classes.
    out_channels : int
      Number of convolution filters.
    kernel_heights : int
      Number of consecutive tokens covered by one filter.
    stride : int
      Convolution stride.
    padding : int
      Zero padding applied along the sequence axis only.
    emb_weights : torch.Tensor
      Pretrained embedding matrix. ``from_pretrained`` is called with its
      default ``freeze`` setting, so the embeddings are not trained.
    """
    super().__init__()

    self.embeddings = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    self.conv = nn.Conv2d(
      in_channels=1,
      out_channels=out_channels,
      kernel_size=(kernel_heights, embedding_dim),
      stride=stride,
      padding=(padding, 0),
    )
    self.drop = nn.Dropout(p=0.4)
    self.fc = nn.Linear(in_features=out_channels, out_features=output_size)

  def forward(self, x):
    """Return class probabilities for a batch of token-ID sequences.

    NOTE(review): the output is already softmax-normalised; if the training
    loop applies a loss that expects raw logits, that should be confirmed.
    """
    # Add a channel axis so the embedded batch can go through Conv2d.
    embedded = self.embeddings(x).unsqueeze(1)
    # The kernel covers the whole embedding width, leaving a size-1 last axis to drop.
    feature_maps = F.relu(self.conv(embedded).squeeze(3))
    # Max over every remaining position: one value per filter.
    pooled = F.max_pool1d(feature_maps, feature_maps.size(2)).squeeze(2)
    logits = self.fc(self.drop(pooled))
    return torch.softmax(logits, dim=-1)
  
# Index handed to the embedding layer as padding_idx: one past the largest vocabulary ID.
PADDING_SIZE = vocab_dim + 1
net = MyCNN(
  vocab_dim=VOCAB_SIZE,
  embedding_dim=300,
  padding_idx=PADDING_SIZE,
  output_size=4,
  out_channels=100,
  kernel_heights=3,
  stride=1,
  padding=1,
  emb_weights=weights,
)
# init_weights only acts on nn.RNN modules, and MyCNN contains none, so this call changes no parameters.
net.apply(init_weights)
net

MyCNN(
  (embeddings): Embedding(8195, 300, padding_idx=8194)
  (conv): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
  (drop): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)