# 第8章: ニューラルネット

第6章で取り組んだニュース記事のカテゴリ分類を題材として，ニューラルネットワークでカテゴリ分類モデルを実装する．なお，この章ではPyTorch, TensorFlow, Chainerなどの機械学習プラットフォームを活用せよ．

In [102]:
import pandas as pd
import numpy as np
import csv
import re
import gensim
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## 70. 単語ベクトルの和による特徴量

In [4]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip

--2021-05-18 00:57:11--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/x-httpd-php]
Saving to: ‘NewsAggregatorDataset.zip’


2021-05-18 00:57:14 (12.7 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203/29224203]

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [13]:
df = pd.read_table("newsCorpora.csv", header=None)
df.set_axis(['ID','TITLE','URL','PUBLISHER','CATEGORY','STORY','HOSTNAME','TIMESTAMP'], axis=1, inplace=True)
df = df.set_index('ID')

extractPublisher = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]
df = df.loc[df['PUBLISHER'].isin(extractPublisher)]

df = df.sample(frac=1, random_state=42)
train, valid, test = np.split(df, [int(.8*len(df)), int(.9*len(df))])

In [91]:
def cat_convert(df,filename):
  cat_list_num = list(map(lambda x:{'b':0,
                               't':1,
                               'e':2,
                               'm':3}[x], df["CATEGORY"]))
  vec = torch.tensor(cat_list_num)
  torch.save(vec, filename)
  return vec

Ytrain = cat_convert(train, "Ytrain.pt")
Yvalid = cat_convert(valid, "Yvalid.pt")
Ytest = cat_convert(test, "Ytest.pt")

In [47]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

In [92]:
def preprocess(text):
  text = text.lower()
  text = re.sub(r"[.?!-/:;\"]",'',text)
  text = re.sub(r"[0-9]","",text)
  return text

def title_convert(df, filename):
  titles = df.TITLE
  title_processed = [preprocess(x) for x in titles]
  Xmatrix = []
  for title in title_processed:
      vec_list = [model[word] for word in title.split() if word in model]
      result = sum(vec_list)/len(vec_list)
      Xmatrix.append(result)
  matrix = torch.tensor(Xmatrix)
  torch.save(matrix, filename)
  return matrix

Xtrain = title_convert(train, "Xtrain.pt")
Xvalid = title_convert(valid, "Xvalid.pt")
Xtest = title_convert(test, "Xtest.pt")

## 71. 単層ニューラルネットワークによる予測

In [79]:
class NeuralNetwork(nn.Module):
  def __init__(self, input_size, output_size):
    super().__init__()
    self.fc = nn.Linear(input_size, output_size, bias=False)
  
  def forward(self, x):
    x = self.fc(x)
    return x

In [80]:
nn_model = NeuralNetwork(300, 4)

In [83]:
x1 = Xtrain[0]
y1_hat = nn_model(x1)
y1_hat = F.softmax(y1_hat, dim=-1)

x1_4 = Xtrain[:4]
Y_hat = nn_model(x1_4)
Y_hat = F.softmax(Y_hat,dim=-1)

In [86]:
print("y1: ", y1_hat)
print("Y: ", Y_hat)

y1:  tensor([0.2633, 0.2466, 0.2557, 0.2344], grad_fn=<SoftmaxBackward>)
Y:  tensor([[0.2633, 0.2466, 0.2557, 0.2344],
        [0.2604, 0.2365, 0.2546, 0.2484],
        [0.2483, 0.2531, 0.2474, 0.2512],
        [0.2508, 0.2534, 0.2564, 0.2394]], grad_fn=<SoftmaxBackward>)


## 72. 損失と勾配の計算

In [111]:
loss = nn.CrossEntropyLoss()

print('x1')
# nn_model.zero_grad()
loss_val = loss(nn_model(Xtrain[:1]), Ytrain[:1])
nn_model.zero_grad()
loss_val.backward()
print('loss: ',loss_val.item())
print('gradient: ', nn_model.fc.weight.grad)

print()

print('x1_4')
nn_model.zero_grad()
loss_val = loss(nn_model(Xtrain[:4]), Ytrain[:4])
loss_val.backward()
print('loss: ',loss_val.item())
print('gradient: ', nn_model.fc.weight.grad)

x1
loss:  0.4615009129047394
gradient:  tensor([[ 0.0133,  0.0129, -0.0017,  ...,  0.0031, -0.0183,  0.0122],
        [-0.0024, -0.0023,  0.0003,  ..., -0.0006,  0.0033, -0.0022],
        [-0.0098, -0.0095,  0.0013,  ..., -0.0023,  0.0135, -0.0090],
        [-0.0011, -0.0011,  0.0001,  ..., -0.0003,  0.0015, -0.0010]])

x1_4
loss:  0.45604491233825684
gradient:  tensor([[ 0.0020, -0.0092,  0.0025,  ..., -0.0071, -0.0126,  0.0058],
        [-0.0007, -0.0016, -0.0048,  ..., -0.0058,  0.0038, -0.0056],
        [-0.0011,  0.0109,  0.0026,  ...,  0.0133,  0.0081,  0.0005],
        [-0.0003, -0.0002, -0.0003,  ..., -0.0003,  0.0006, -0.0006]])


## 73. 確率的勾配降下法による学習

In [103]:
learning_rate = 0.01
n_iters = 100

optimizer = optim.SGD(nn_model.parameters(), lr=learning_rate)
dataset = TensorDataset(Xtrain, Ytrain)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

for epoch in range(n_iters):
  total_loss = 0
  for x, y in loader:
    optimizer.zero_grad()
    loss_value = loss(nn_model(x),y)
    loss_value.backward()
    optimizer.step()
    total_loss += loss_value.item()
  print(epoch, "Loss: ", total_loss/len(loader))

0 Loss:  0.7469321696685154
1 Loss:  0.5347062539660854
2 Loss:  0.4720564242519347
3 Loss:  0.4388999477232451
4 Loss:  0.41780008454485124
5 Loss:  0.4028401354253966
6 Loss:  0.3917135834041966
7 Loss:  0.38267476251028226
8 Loss:  0.3753227742265804
9 Loss:  0.36899962025507455
10 Loss:  0.3637679615466244
11 Loss:  0.35915568479385634
12 Loss:  0.3549647993638016
13 Loss:  0.3513511230579701
14 Loss:  0.348077141320929
15 Loss:  0.34505089412233986
16 Loss:  0.3423283442078807
17 Loss:  0.3398545049823668
18 Loss:  0.33747984060287545
19 Loss:  0.3353467882944791
20 Loss:  0.33338885565168175
21 Loss:  0.3314033477682825
22 Loss:  0.32965787524133694
23 Loss:  0.3280970786674602
24 Loss:  0.32649239637683675
25 Loss:  0.32511538716395244
26 Loss:  0.3236670662238755
27 Loss:  0.3222549095174427
28 Loss:  0.3209206235403993
29 Loss:  0.3198561786389379
30 Loss:  0.3187309418407338
31 Loss:  0.31770051930515547
32 Loss:  0.3166330592297352
33 Loss:  0.3155823742898499
34 Loss:  0.31

## 74. 正解率の計測

In [109]:
def acc(loader, model):
  sum_acc = 0
  for x, y in loader: 
    y_pred = nn_model(x)
    sum_acc += y_pred.argmax(1) == y
  return sum_acc/len(loader)

print('train accuracy: ',end='')
dataset = TensorDataset(Xtrain, Ytrain)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
print(acc(loader, model))

print('test accuracy: ',end='')
dataset = TensorDataset(Xtest, Ytest)
loader = DataLoader(dataset, batch_size=1, shuffle=True)
print(acc(loader, model))

train accuracy: tensor([0.9036])
test accuracy: tensor([0.8913])


## 75. 損失と正解率のプロット

## 76. チェックポイント

## 77. ミニバッチ化

## 78. GPU上での学習

## 79. 多層ニューラルネットワーク