In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**knock80**

In [3]:
##knock50
import pandas as pd
from sklearn.model_selection import train_test_split

#ファイルを読み込む
data = pd.read_csv('drive/MyDrive/chapter09/newsCorpora.csv', sep = '\t', header = None, names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

#事例（記事）を抽出する
data = data.loc[data['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

#分割する
##shuffle：分割する前dataをランダムにする
train, valid_test = train_test_split(data, test_size=0.2, shuffle=True, random_state=123, stratify=data['CATEGORY'])
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=valid_test['CATEGORY'])
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [4]:
from collections import defaultdict
import string

#単語の頻度集計
d = defaultdict(int) #辞書d
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))  #記号をスペースに置換する
for text in train['TITLE']:
  for word in text.translate(table).split():
    d[word]+= 1 #単語の頻度を増やす
d = sorted(d.items(), key=lambda x:x[1], reverse=True) #reverse:降順

#単語ID辞書の作成
word2id = {word: i+ 1 for i, (word, fre) in enumerate(d) if fre> 2}  #出現頻度が2回以上

print(f'ID数: {len(set(word2id.values()))}\n')
for key in list(word2id)[:10]:
    print(f'{key}: {word2id[key]}') #頻度上位10

ID数: 6666

to: 1
s: 2
in: 3
on: 4
UPDATE: 5
as: 6
US: 7
for: 8
of: 9
The: 10


In [5]:
#文章を入力として、その文中の単語を先頭からID化
def tokenizer(text, word2id=word2id, unk=0):
  table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
  return [word2id.get(word, unk) for word in text.translate(table).split()] #単語を辞書word2idから対応するIDに変換

#確認
text = train.iloc[1, train.columns.get_loc('TITLE')]
print(f'テキスト: {text}')
print(f'ID列: {tokenizer(text)}')

テキスト: FOREX-Dollar rises on US rate speculation after Yellen comments
ID列: [55, 59, 161, 4, 7, 234, 3530, 26, 97, 429]


**knock81**

In [6]:
#RNNモデルを構築
import torch
from torch import nn
class RNN (nn.Module):
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.emb = nn.Embedding(vocab_size, emb_size, padding_idx = padding_idx) #単語の埋め込み
    self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity = 'tanh', batch_first = True) #tanh: 双曲線正接関数
    self.fc = nn.Linear(hidden_size, output_size) #隠れ層から出力層への線形変換

  def forward(self,x): #順伝播の計算
    self.batch_size = x.size()[0]
    hidden = self.init_hidden(x.device)
    emb = self.emb(x) #emb.size()=(batch_size, seq_len, emb_size)
    out, hidden = self.rnn(emb, hidden) #out.size()=(batch_size, seq_len, hidden_size)
    out = self.fc(out[:, -1, :]) #最後の出力 #out.size()=(batch_size, output_size)
    return out

  def init_hidden(self, device): #初期隠れ層
    hidden = torch.zeros(1, self.batch_size, self.hidden_size, device=device)
    return hidden

In [7]:
from torch.utils.data import Dataset

class CreateDataset(Dataset):
  def __init__(self, X, y, tokenizer):
    self.X = X
    self.y = y
    self.tokenizer = tokenizer

  def __len__(self):  # len(Dataset)で返す値を指定
    return len(self.y)

  def __getitem__(self, index):  # Dataset[index]で返す値を指定
    text = self.X[index]
    inputs = self.tokenizer(text)

    return {
      'inputs': torch.tensor(inputs, dtype=torch.int64),
      'labels': torch.tensor(self.y[index], dtype=torch.int64)
    }

#ラベルベクトルの作成
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}
y_train = train['CATEGORY'].map(lambda x: category_dict[x]).values
y_valid = valid['CATEGORY'].map(lambda x: category_dict[x]).values
y_test = test['CATEGORY'].map(lambda x: category_dict[x]).values

#Datasetの作成
dataset_train = CreateDataset(train['TITLE'], y_train, tokenizer)
dataset_valid = CreateDataset(valid['TITLE'], y_valid, tokenizer)
dataset_test = CreateDataset(test['TITLE'], y_test, tokenizer)

print(f'len(Dataset)の出力: {len(dataset_train)}')
print('Dataset[index]の出力:')
for var in dataset_train[1]:
  print(f'  {var}: {dataset_train[1][var]}')

len(Dataset)の出力: 10672
Dataset[index]の出力:
  inputs: tensor([  55,   59,  161,    4,    7,  234, 3530,   26,   97,  429])
  labels: 0


In [8]:
#予測
#パラメータの設定
VOCAB_SIZE = len(set(word2id.values())) + 1  # 辞書のID数+paddingID
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50

#rnnモデル
model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE)

for i in range(10):
 X = dataset_train[i]['inputs']
 print(torch.softmax(model(X.unsqueeze(0)), dim=-1))

tensor([[0.2490, 0.3425, 0.1677, 0.2408]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1951, 0.1784, 0.4065, 0.2201]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3486, 0.3250, 0.1485, 0.1779]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1983, 0.2109, 0.1453, 0.4455]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1949, 0.2472, 0.3302, 0.2276]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3171, 0.2066, 0.1295, 0.3469]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2367, 0.3316, 0.2403, 0.1914]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2151, 0.1901, 0.1921, 0.4027]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3488, 0.1462, 0.2142, 0.2908]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2713, 0.1708, 0.2654, 0.2925]], grad_fn=<SoftmaxBackward0>)


**knock82**

In [9]:
from torch.utils.data import DataLoader
from torch import optim

##損失と正解率の計算
def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
  dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
  loss = 0.0
  total = 0
  correct = 0
  with torch.no_grad():
    for data in dataloader:
      inputs = data['inputs']
      labels = data['labels']

      outputs = model(inputs) #順伝播

      if criterion != None: #損失関数
        loss += criterion(outputs, labels).item()

      pred = torch.argmax(outputs, dim=-1) #accuracy
      total += len(inputs)
      correct += (pred == labels).sum().item()

  return loss / len(dataset), correct / total

In [10]:
#学習を実行し、損失・正解率を返す
def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=None, device=None):
  #dataloaderを作る
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
  dataloader_valid = DataLoader(dataset_valid, batch_size=1, shuffle=False)

  #スケジューラの設定
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs, eta_min=1e-5, last_epoch=-1)

  #学習
  for epoch in range(num_epochs):

    model.train() #訓練モードに設定
    for data in dataloader_train:
      optimizer.zero_grad() #勾配をゼロで初期化

      #順伝播+誤差逆伝播+重み更新
      inputs = data['inputs']
      labels = data['labels']
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    model.eval() #評価モードに設定

    #損失と正解率
    loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
    loss_valid, acc_valid = calculate_loss_and_accuracy(model, dataset_valid, device, criterion=criterion)

    #チェックポイントの保存
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}')

    if epoch > 10: #stop
      break

    scheduler.step() #スケジューラを1step進める


In [11]:
#parameters
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
LEARNING_RATE = 1e-3
BATCH_SIZE = 1
NUM_EPOCHS = 10

model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE) #model
criterion = nn.CrossEntropyLoss() #損失関数
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

#モデルの学習
train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS)

epoch: 1, loss_train: 1.1025, accuracy_train: 0.5272, loss_valid: 1.1351, accuracy_valid: 0.5007
epoch: 2, loss_train: 1.0191, accuracy_train: 0.5878, loss_valid: 1.0795, accuracy_valid: 0.5555
epoch: 3, loss_train: 0.9046, accuracy_train: 0.6596, loss_valid: 0.9852, accuracy_valid: 0.6259
epoch: 4, loss_train: 0.7693, accuracy_train: 0.7277, loss_valid: 0.8776, accuracy_valid: 0.6897
epoch: 5, loss_train: 0.6969, accuracy_train: 0.7510, loss_valid: 0.8413, accuracy_valid: 0.7069
epoch: 6, loss_train: 0.6138, accuracy_train: 0.7819, loss_valid: 0.7752, accuracy_valid: 0.7354
epoch: 7, loss_train: 0.5731, accuracy_train: 0.7955, loss_valid: 0.7469, accuracy_valid: 0.7444
epoch: 8, loss_train: 0.5302, accuracy_train: 0.8087, loss_valid: 0.7133, accuracy_valid: 0.7571
epoch: 9, loss_train: 0.5167, accuracy_train: 0.8136, loss_valid: 0.7076, accuracy_valid: 0.7534
epoch: 10, loss_train: 0.5100, accuracy_train: 0.8140, loss_valid: 0.7034, accuracy_valid: 0.7549


**knock83**

In [12]:
##系列の長さに基づいて自動的にパディングが行われます
class Padsequence():
  def __init__(self, padding_idx):
    self.padding_idx = padding_idx

  def __call__(self, batch): #ミニバッチを取り出す
    sorted_batch = sorted(batch, key=lambda x: x['inputs'].shape[0], reverse=True)
    sequences = [x['inputs'] for x in sorted_batch]
    sequences_padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=self.padding_idx)
    labels = torch.LongTensor([x['labels'] for x in sorted_batch])

    return {'inputs': sequences_padded, 'labels': labels}

In [30]:
##損失と正解率の計算
def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
  dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
  loss = 0.0
  total = 0
  correct = 0
  with torch.no_grad():
    for data in dataloader:
      inputs = data['inputs'].to(device)
      labels = data['labels'].to(device)

      outputs = model(inputs) #順伝播

      if criterion != None: #損失関数
        loss += criterion(outputs, labels).item()

      pred = torch.argmax(outputs, dim=-1) #accuracy
      total += len(inputs)
      correct += (pred == labels).sum().item()

  return loss / len(dataset), correct / total

In [31]:
#学習を実行し、損失・正解率を返す GPUを使う
def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=None, device=None):
  model.to(device)

  #dataloaderを作る
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
  dataloader_valid = DataLoader(dataset_valid, batch_size=1, shuffle=False)

  #スケジューラの設定
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs, eta_min=1e-5, last_epoch=-1)

  #学習
  for epoch in range(num_epochs):
    model.train() #訓練モードに設定
    for data in dataloader_train:
      optimizer.zero_grad() #勾配をゼロで初期化

      #順伝播+誤差逆伝播+重み更新
      inputs = data['inputs'].to(device)
      labels = data['labels'].to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    model.eval() #評価モードに設定

    #損失と正解率
    loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
    loss_valid, acc_valid = calculate_loss_and_accuracy(model, dataset_valid, device, criterion=criterion)

    #チェックポイントの保存
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}')

    if epoch > 10: #stop
      break

    scheduler.step() #スケジューラを1step進める


In [32]:
#parameters
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
LEARNING_RATE = 5e-2
BATCH_SIZE = 32
NUM_EPOCHS = 10

model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE) #model
criterion = nn.CrossEntropyLoss() #損失関数
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
device = torch.device('cuda')

#モデルの学習
train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

epoch: 1, loss_train: 1.2501, accuracy_train: 0.4097, loss_valid: 1.2362, accuracy_valid: 0.4325
epoch: 2, loss_train: 1.1827, accuracy_train: 0.4770, loss_valid: 1.1750, accuracy_valid: 0.4925
epoch: 3, loss_train: 1.1303, accuracy_train: 0.5284, loss_valid: 1.1455, accuracy_valid: 0.5135
epoch: 4, loss_train: 1.0680, accuracy_train: 0.5934, loss_valid: 1.1185, accuracy_valid: 0.5525
epoch: 5, loss_train: 0.9469, accuracy_train: 0.6845, loss_valid: 0.9872, accuracy_valid: 0.6492
epoch: 6, loss_train: 0.9487, accuracy_train: 0.6744, loss_valid: 1.0336, accuracy_valid: 0.6057
epoch: 7, loss_train: 0.8413, accuracy_train: 0.7220, loss_valid: 0.9294, accuracy_valid: 0.6657
epoch: 8, loss_train: 0.7838, accuracy_train: 0.7451, loss_valid: 0.8814, accuracy_valid: 0.6897
epoch: 9, loss_train: 0.7677, accuracy_train: 0.7489, loss_valid: 0.8727, accuracy_valid: 0.6934
epoch: 10, loss_train: 0.7742, accuracy_train: 0.7441, loss_valid: 0.8822, accuracy_valid: 0.6852


**knock84**

In [35]:
import numpy as np
from gensim.models import KeyedVectors

#学習済みモデルのロード
model = KeyedVectors.load_word2vec_format('drive/MyDrive/chapter09/GoogleNews-vectors-negative300.bin.gz', binary=True)

#学習済み単語ベクトルの取得
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300 #単語ベクトルの次元数
weights = np.zeros((VOCAB_SIZE, EMB_SIZE)) #学習済み単語ベクトルを格納する
words_in_pretrained = 0
for i, word in enumerate(word2id.keys()):
  try:
    weights[i] = model[word]
    words_in_pretrained += 1 #学習済み単語ベクトルの単語の数
  except KeyError:
    weights[i] = np.random.normal(scale=0.4, size=(EMB_SIZE,)) #ランダムな値で行を初期化する
weights = torch.from_numpy(weights.astype((np.float32))) #numpy配列からtorch.Tensorに変換

print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')
print(weights.size())

学習済みベクトル利用単語数: 6514 / 6667
torch.Size([6667, 300])


In [36]:
class RNN(nn.Module):
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=None, bidirectional=False):
    super().__init__()
    self.hidden_size = hidden_size #隠れ状態の次元数
    self.num_layers = num_layers
    self.num_directions = bidirectional + 1  #単方向：1、双方向：2
    if emb_weights != None:  #emb_weights: 埋め込み層の重み
      self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    else:
      self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    self.rnn = nn.RNN(emb_size, hidden_size, num_layers, nonlinearity='tanh', bidirectional=bidirectional, batch_first=True)
    self.fc = nn.Linear(hidden_size * self.num_directions, output_size) #線形

  def forward(self, x): #順方向の計算が行う
    self.batch_size = x.size()[0]
    hidden = self.init_hidden(x.device)  #h0のゼロベクトルを作成
    emb = self.emb(x)
    #emb.size() = (batch_size, seq_len, emb_size)
    out, hidden = self.rnn(emb, hidden)
    #out.size() = (batch_size, seq_len, hidden_size * num_directions)
    out = self.fc(out[:, -1, :])
    #out.size() = (batch_size, output_size)
    return out

  def init_hidden(self, device): #RNNの初期隠れ状態をゼロベクトルで初期化
    hidden = torch.zeros(self.num_layers * self.num_directions, self.batch_size, self.hidden_size, device=device)
    return hidden

In [37]:
# パラメータの設定
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
NUM_LAYERS = 1
LEARNING_RATE = 5e-2
BATCH_SIZE = 32
NUM_EPOCHS = 10

model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, emb_weights=weights)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
device = torch.device('cuda')

# モデルの学習
train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

epoch: 1, loss_train: 1.2088, accuracy_train: 0.4181, loss_valid: 1.2456, accuracy_valid: 0.4100
epoch: 2, loss_train: 1.1596, accuracy_train: 0.5013, loss_valid: 1.1949, accuracy_valid: 0.4805
epoch: 3, loss_train: 1.1097, accuracy_train: 0.5735, loss_valid: 1.1500, accuracy_valid: 0.5487
epoch: 4, loss_train: 1.1729, accuracy_train: 0.5436, loss_valid: 1.2139, accuracy_valid: 0.5315
epoch: 5, loss_train: 1.1181, accuracy_train: 0.5813, loss_valid: 1.1750, accuracy_valid: 0.5600
epoch: 6, loss_train: 1.0581, accuracy_train: 0.6095, loss_valid: 1.1249, accuracy_valid: 0.5795
epoch: 7, loss_train: 0.9887, accuracy_train: 0.6362, loss_valid: 1.0575, accuracy_valid: 0.6079
epoch: 8, loss_train: 0.9648, accuracy_train: 0.6453, loss_valid: 1.0312, accuracy_valid: 0.6012
epoch: 9, loss_train: 0.9645, accuracy_train: 0.6436, loss_valid: 1.0309, accuracy_valid: 0.6019
epoch: 10, loss_train: 0.9704, accuracy_train: 0.6422, loss_valid: 1.0357, accuracy_valid: 0.6019


**knock85**

In [38]:
#parameters
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
NUM_LAYERS = 2
LEARNING_RATE = 5e-2
BATCH_SIZE = 32
NUM_EPOCHS = 10

model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, emb_weights=weights, bidirectional=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
device = torch.device('cuda')

#モデルの学習
train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

epoch: 1, loss_train: 1.1617, accuracy_train: 0.4253, loss_valid: 1.1683, accuracy_valid: 0.4183
epoch: 2, loss_train: 1.1446, accuracy_train: 0.4884, loss_valid: 1.1841, accuracy_valid: 0.4610
epoch: 3, loss_train: 1.0685, accuracy_train: 0.5706, loss_valid: 1.1294, accuracy_valid: 0.5315
epoch: 4, loss_train: 1.0302, accuracy_train: 0.5933, loss_valid: 1.0909, accuracy_valid: 0.5600
epoch: 5, loss_train: 0.9864, accuracy_train: 0.6228, loss_valid: 1.0532, accuracy_valid: 0.5772
epoch: 6, loss_train: 0.9340, accuracy_train: 0.6477, loss_valid: 0.9913, accuracy_valid: 0.6117
epoch: 7, loss_train: 0.9235, accuracy_train: 0.6512, loss_valid: 0.9841, accuracy_valid: 0.6162
epoch: 8, loss_train: 0.9150, accuracy_train: 0.6554, loss_valid: 0.9756, accuracy_valid: 0.6252
epoch: 9, loss_train: 0.9047, accuracy_train: 0.6528, loss_valid: 0.9669, accuracy_valid: 0.6222
epoch: 10, loss_train: 0.9071, accuracy_train: 0.6501, loss_valid: 0.9680, accuracy_valid: 0.6222
