In [None]:
# Mount Google Drive at /content/drive; later cells read their input files from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**knock80**

In [None]:
## knock50: load the corpus and split it into train / valid / test
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the tab-separated file; it has no header row, so the column names are given here.
column_names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
data = pd.read_csv('drive/MyDrive/chapter09/newsCorpora.csv', sep='\t', header=None, names=column_names)

# Keep only the articles of the selected publishers, and only the title / category columns.
publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
from_selected_publisher = data['PUBLISHER'].isin(publishers)
data = data.loc[from_selected_publisher, ['TITLE', 'CATEGORY']]

# Split 80 / 10 / 10, stratified by category.
# shuffle=True randomises the rows before splitting; random_state makes the split repeatable.
split_options = dict(shuffle=True, random_state=123)
train, valid_test = train_test_split(data, test_size=0.2, stratify=data['CATEGORY'], **split_options)
valid, test = train_test_split(valid_test, test_size=0.5, stratify=valid_test['CATEGORY'], **split_options)

# Renumber the rows of every split from 0 so that they can be addressed by position.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
from collections import defaultdict
import string

# Count how often each word occurs in the training titles.
counts = defaultdict(int)
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))  # replace every punctuation character with a space
for text in train['TITLE']:
  for word in text.translate(table).split():
    counts[word] += 1
d = sorted(counts.items(), key=lambda x: x[1], reverse=True)  # (word, frequency) pairs, most frequent first

# Build the word -> ID dictionary.
# ID 0 is reserved for unknown words and is stored explicitly under UNK_TOKEN. The later cells use
# len(set(word2id.values())) as the padding ID; counting the unknown ID here keeps that padding ID
# different from every word ID (without this entry the last word would share the padding ID).
# Because UNK_TOKEN is inserted first, the position of each key in the dict equals its ID.
# The tokenizer turns '<' and '>' into spaces, so no real word can ever equal UNK_TOKEN.
UNK_TOKEN = '<unk>'
MIN_FREQ = 2  # only words that occur at least twice get their own ID
word2id = {UNK_TOKEN: 0}
word2id.update({word: i + 1 for i, (word, fre) in enumerate(d) if fre >= MIN_FREQ})

print(f'ID数: {len(set(word2id.values()))}\n')
# Show the 10 most frequent words together with their IDs.
top_words = [word for word in word2id if word != UNK_TOKEN][:10]
for key in top_words:
  print(f'{key}: {word2id[key]}')

ID数: 6666

to: 1
s: 2
in: 3
on: 4
UPDATE: 5
as: 6
US: 7
for: 8
of: 9
The: 10


In [None]:
# Convert a sentence into the IDs of its words, in order of appearance
def tokenizer(text, word2id=word2id, unk=0):
  """Return the list of word IDs for `text`.

  Punctuation is replaced by spaces and the text is split on whitespace.

  Args:
    text: sentence to convert.
    word2id: mapping from word to ID (bound to the module-level dictionary when the function is defined).
    unk: ID returned for words that are not in `word2id`.
  """
  punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
  ids = []
  for word in text.translate(punctuation_to_space).split():
    ids.append(word2id.get(word, unk))  # unknown words fall back to `unk`
  return ids

# Sanity check on the second training title
text = train['TITLE'].iloc[1]
print(f'テキスト: {text}')
print(f'ID列: {tokenizer(text)}')

テキスト: FOREX-Dollar rises on US rate speculation after Yellen comments
ID列: [55, 59, 161, 4, 7, 234, 3530, 26, 97, 429]


**knock81**

In [None]:
#RNNモデルを構築
import torch
from torch import nn
class RNN(nn.Module):
  """Text classifier: embedding -> single-layer tanh RNN -> linear layer.

  The class scores are computed from the RNN output at the last real (non-padding)
  token of every sequence, so a sequence gives the same result whether or not it was
  padded to the length of a longer sequence in the same batch.

  Args:
    vocab_size: number of rows of the embedding table.
    emb_size: dimension of the word embeddings.
    padding_idx: ID used for padding (may be None when inputs are never padded).
    output_size: number of classes.
    hidden_size: dimension of the RNN hidden state.
  """
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.padding_idx = padding_idx
    self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)  # word embeddings
    self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='tanh', batch_first=True)  # tanh: hyperbolic tangent
    self.fc = nn.Linear(hidden_size, output_size)  # hidden state -> class scores

  def forward(self, x):
    """Return class scores of shape (batch_size, output_size) for ID tensor x of shape (batch_size, seq_len)."""
    self.batch_size = x.size()[0]
    hidden = self.init_hidden(x.device)
    emb = self.emb(x)  # (batch_size, seq_len, emb_size)
    lengths = self._sequence_lengths(x)
    # Packing makes the RNN stop at the end of every sequence instead of running over the padding.
    packed = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
    packed_out, hidden = self.rnn(packed, hidden)
    out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # (batch_size, max(lengths), hidden_size)
    rows = torch.arange(out.size(0), device=out.device)
    last = (lengths - 1).to(out.device)
    out = self.fc(out[rows, last, :])  # output at the last real token -> (batch_size, output_size)
    return out

  def _sequence_lengths(self, x):
    """Return, as a CPU int64 tensor, each row's length up to its last non-padding token (at least 1)."""
    if self.padding_idx is None:
      return torch.full((x.size(0),), x.size(1), dtype=torch.long)
    positions = torch.arange(1, x.size(1) + 1, device=x.device)  # 1-based positions
    lengths = ((x != self.padding_idx) * positions).max(dim=1).values
    # A row made only of padding would give 0, which packing rejects; treat it as length 1.
    return lengths.clamp(min=1).cpu()

  def init_hidden(self, device):
    """Return the all-zero initial hidden state of shape (1, batch_size, hidden_size)."""
    hidden = torch.zeros(1, self.batch_size, self.hidden_size, device=device)
    return hidden

In [None]:
from torch.utils.data import Dataset

class CreateDataset(Dataset):
  """Dataset that tokenizes a text on access and pairs it with its label.

  Args:
    X: indexable collection of texts.
    y: indexable collection of integer labels, aligned with X.
    tokenizer: callable that maps a text to a list of integer IDs.
  """
  def __init__(self, X, y, tokenizer):
    self.X, self.y, self.tokenizer = X, y, tokenizer

  def __len__(self):
    # Value returned by len(dataset): the number of labels.
    return len(self.y)

  def __getitem__(self, index):
    # Value returned by dataset[index]: the token IDs and the label, both as int64 tensors.
    token_ids = self.tokenizer(self.X[index])
    return dict(
      inputs=torch.tensor(token_ids, dtype=torch.int64),
      labels=torch.tensor(self.y[index], dtype=torch.int64),
    )

# Encode the category letters as integer class labels, one array per split.
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}
y_train, y_valid, y_test = (
  frame['CATEGORY'].map(lambda x: category_dict[x]).values
  for frame in (train, valid, test)
)

# Wrap each split in a Dataset that tokenizes titles on access.
dataset_train = CreateDataset(train['TITLE'], y_train, tokenizer)
dataset_valid = CreateDataset(valid['TITLE'], y_valid, tokenizer)
dataset_test = CreateDataset(test['TITLE'], y_test, tokenizer)

# Show the dataset size and the fields of one example.
print(f'len(Dataset)の出力: {len(dataset_train)}')
print('Dataset[index]の出力:')
for var in dataset_train[1]:
  field_value = dataset_train[1][var]
  print(f'  {var}: {field_value}')

len(Dataset)の出力: 10672
Dataset[index]の出力:
  inputs: tensor([  55,   59,  161,    4,    7,  234, 3530,   26,   97,  429])
  labels: 0


In [None]:
# Prediction with a freshly initialised (untrained) model
# Hyperparameters
PADDING_IDX = len(set(word2id.values()))  # padding ID = number of distinct IDs in the dictionary
VOCAB_SIZE = PADDING_IDX + 1  # dictionary IDs plus the padding ID
EMB_SIZE = 300
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50

# RNN model
model = RNN(
  vocab_size=VOCAB_SIZE,
  emb_size=EMB_SIZE,
  padding_idx=PADDING_IDX,
  output_size=OUTPUT_SIZE,
  hidden_size=HIDDEN_SIZE,
)

# Class probabilities for the first 10 training examples
for i in range(10):
  X = dataset_train[i]['inputs']
  logits = model(X.unsqueeze(0))  # add the batch dimension
  print(logits.softmax(dim=-1))

tensor([[0.2504, 0.3443, 0.2545, 0.1507]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2128, 0.2875, 0.1774, 0.3223]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1532, 0.1883, 0.1415, 0.5170]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2132, 0.2857, 0.1356, 0.3656]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2459, 0.2095, 0.1260, 0.4186]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2274, 0.1552, 0.2042, 0.4132]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2923, 0.2029, 0.2421, 0.2628]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2383, 0.1002, 0.2612, 0.4003]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1945, 0.3168, 0.1103, 0.3784]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2416, 0.4049, 0.1646, 0.1889]], grad_fn=<SoftmaxBackward0>)


**knock82**

In [None]:
from torch.utils.data import DataLoader
from torch import optim

## Compute the mean loss and the accuracy of a model over a dataset
def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
  """Evaluate `model` on `dataset` one example at a time, without tracking gradients.

  Args:
    model: callable that maps an 'inputs' batch to class scores.
    dataset: dataset whose items are dicts with 'inputs' and 'labels' tensors.
    device: accepted but not used by this version.
    criterion: loss function; when None the loss is not computed and stays 0.

  Returns:
    (summed loss divided by len(dataset), number of correct predictions divided by examples seen).
  """
  loss_sum, n_seen, n_correct = 0.0, 0, 0
  with torch.no_grad():
    for batch in DataLoader(dataset, batch_size=1, shuffle=False):
      inputs, labels = batch['inputs'], batch['labels']

      outputs = model(inputs)  # forward pass

      if criterion is not None:  # accumulate the loss only when a loss function is given
        loss_sum += criterion(outputs, labels).item()

      # accuracy bookkeeping: the predicted class is the one with the highest score
      n_seen += inputs.size(0)
      n_correct += (outputs.argmax(dim=-1) == labels).sum().item()

  return loss_sum / len(dataset), n_correct / n_seen

In [None]:
# Train the model and return the per-epoch loss / accuracy history
def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=None, device=None):
  """Train `model` and evaluate it on both datasets after every epoch.

  After each epoch a checkpoint is written to 'checkpoint<epoch>.pt' in the working
  directory and one progress line is printed.

  Args:
    dataset_train: training dataset (items are dicts with 'inputs' and 'labels').
    dataset_valid: validation dataset, used for evaluation only.
    batch_size: mini-batch size of the training loader.
    model: model to train (updated in place).
    criterion: loss function.
    optimizer: optimizer built on model.parameters().
    num_epochs: number of epochs; also the period given to the cosine learning-rate schedule.
    collate_fn: optional collate function for the training loader.
    device: only forwarded to calculate_loss_and_accuracy; this version does not
      move the model or the batches.

  Returns:
    dict with keys 'train' and 'valid'; each value is a list holding one
    [loss, accuracy] pair per completed epoch.
  """
  # Training data loader (evaluation builds its own loaders inside calculate_loss_and_accuracy)
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

  # Learning-rate scheduler
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs, eta_min=1e-5, last_epoch=-1)

  log_train = []
  log_valid = []
  for epoch in range(num_epochs):

    model.train()  # training mode
    for data in dataloader_train:
      optimizer.zero_grad()  # reset the gradients

      # forward pass + backpropagation + weight update
      inputs = data['inputs']
      labels = data['labels']
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    model.eval()  # evaluation mode

    # loss and accuracy on both datasets
    loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
    loss_valid, acc_valid = calculate_loss_and_accuracy(model, dataset_valid, device, criterion=criterion)
    log_train.append([loss_train, acc_train])
    log_valid.append([loss_valid, acc_valid])

    # save a checkpoint
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}')

    # Hard cap: training stops after the 12th epoch even when num_epochs is larger.
    if epoch > 10:
      break

    scheduler.step()  # advance the scheduler by one epoch

  return {'train': log_train, 'valid': log_valid}


In [None]:
# Hyperparameters
PADDING_IDX = len(set(word2id.values()))  # padding ID = number of distinct IDs in the dictionary
VOCAB_SIZE = PADDING_IDX + 1  # dictionary IDs plus the padding ID
EMB_SIZE = 300
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
LEARNING_RATE = 1e-3
BATCH_SIZE = 1
NUM_EPOCHS = 10

# Model, loss function and optimizer
model = RNN(
  vocab_size=VOCAB_SIZE,
  emb_size=EMB_SIZE,
  padding_idx=PADDING_IDX,
  output_size=OUTPUT_SIZE,
  hidden_size=HIDDEN_SIZE,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# Train the model
train_model(
  dataset_train=dataset_train,
  dataset_valid=dataset_valid,
  batch_size=BATCH_SIZE,
  model=model,
  criterion=criterion,
  optimizer=optimizer,
  num_epochs=NUM_EPOCHS,
)

epoch: 1, loss_train: 1.1079, accuracy_train: 0.5134, loss_valid: 1.1316, accuracy_valid: 0.4955
epoch: 2, loss_train: 1.0210, accuracy_train: 0.5772, loss_valid: 1.0735, accuracy_valid: 0.5397
epoch: 3, loss_train: 0.8852, accuracy_train: 0.6637, loss_valid: 0.9698, accuracy_valid: 0.6327
epoch: 4, loss_train: 0.7280, accuracy_train: 0.7429, loss_valid: 0.8479, accuracy_valid: 0.6964
epoch: 5, loss_train: 0.6422, accuracy_train: 0.7724, loss_valid: 0.7880, accuracy_valid: 0.7264
epoch: 6, loss_train: 0.5729, accuracy_train: 0.7943, loss_valid: 0.7443, accuracy_valid: 0.7376
epoch: 7, loss_train: 0.5203, accuracy_train: 0.8107, loss_valid: 0.7081, accuracy_valid: 0.7549
epoch: 8, loss_train: 0.4900, accuracy_train: 0.8216, loss_valid: 0.6922, accuracy_valid: 0.7511
epoch: 9, loss_train: 0.4744, accuracy_train: 0.8269, loss_valid: 0.6934, accuracy_valid: 0.7474
epoch: 10, loss_train: 0.4703, accuracy_train: 0.8287, loss_valid: 0.6946, accuracy_valid: 0.7496


**knock83**

In [None]:
## Collate function that pads every sequence of a mini-batch to the longest one
class Padsequence():
  """Callable for DataLoader(collate_fn=...) that builds a padded mini-batch.

  Args:
    padding_idx: ID written into the padded positions.
  """
  def __init__(self, padding_idx):
    self.padding_idx = padding_idx

  def __call__(self, batch):
    """Return {'inputs': padded (batch, max_len) tensor, 'labels': LongTensor}, longest sequence first."""
    # Order the samples from the longest to the shortest input.
    by_length = list(batch)
    by_length.sort(key=lambda sample: sample['inputs'].size(0), reverse=True)

    sequences = []
    label_values = []
    for sample in by_length:
      sequences.append(sample['inputs'])
      label_values.append(sample['labels'])

    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=self.padding_idx)
    return {'inputs': padded, 'labels': torch.LongTensor(label_values)}

In [None]:
## Compute the mean loss and the accuracy of a model over a dataset (device-aware version)
def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
  """Evaluate `model` on `dataset` one example at a time, without tracking gradients.

  Args:
    model: callable that maps an 'inputs' batch to class scores.
    dataset: dataset whose items are dicts with 'inputs' and 'labels' tensors.
    device: passed to Tensor.to() for every batch before the forward pass.
    criterion: loss function; when None the loss is not computed and stays 0.

  Returns:
    (summed loss divided by len(dataset), number of correct predictions divided by examples seen).
  """
  has_criterion = criterion is not None
  running_loss = 0.0
  hits = 0
  seen = 0
  loader = DataLoader(dataset, batch_size=1, shuffle=False)
  with torch.no_grad():
    for sample in loader:
      inputs = sample['inputs'].to(device)
      labels = sample['labels'].to(device)

      logits = model(inputs)  # forward pass

      if has_criterion:  # accumulate the loss only when a loss function is given
        running_loss += criterion(logits, labels).item()

      # accuracy bookkeeping: the predicted class is the one with the highest score
      predicted = logits.argmax(dim=-1)
      hits += int((predicted == labels).sum())
      seen += inputs.shape[0]

  return running_loss / len(dataset), hits / seen

In [None]:
# Train the model on `device` and return the per-epoch loss / accuracy history
def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=None, device=None):
  """Train `model` on `device` and evaluate it on both datasets after every epoch.

  After each epoch a checkpoint is written to 'checkpoint<epoch>.pt' in the working
  directory and one progress line is printed.

  Args:
    dataset_train: training dataset (items are dicts with 'inputs' and 'labels').
    dataset_valid: validation dataset, used for evaluation only.
    batch_size: mini-batch size of the training loader.
    model: model to train (moved to `device` and updated in place).
    criterion: loss function.
    optimizer: optimizer built on model.parameters().
    num_epochs: number of epochs; also the period given to the cosine learning-rate schedule.
    collate_fn: optional collate function for the training loader (e.g. a padding collator).
    device: device the model and every batch are moved to.

  Returns:
    dict with keys 'train' and 'valid'; each value is a list holding one
    [loss, accuracy] pair per completed epoch.
  """
  model.to(device)

  # Training data loader (evaluation builds its own loaders inside calculate_loss_and_accuracy).
  # The batches are reshuffled every epoch, as in the CPU version of this function.
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

  # Learning-rate scheduler
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs, eta_min=1e-5, last_epoch=-1)

  log_train = []
  log_valid = []
  for epoch in range(num_epochs):
    model.train()  # training mode
    for data in dataloader_train:
      optimizer.zero_grad()  # reset the gradients

      # forward pass + backpropagation + weight update
      inputs = data['inputs'].to(device)
      labels = data['labels'].to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

    model.eval()  # evaluation mode

    # loss and accuracy on both datasets
    loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
    loss_valid, acc_valid = calculate_loss_and_accuracy(model, dataset_valid, device, criterion=criterion)
    log_train.append([loss_train, acc_train])
    log_valid.append([loss_valid, acc_valid])

    # save a checkpoint
    torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')
    print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}')

    # Hard cap: training stops after the 12th epoch even when num_epochs is larger.
    if epoch > 10:
      break

    scheduler.step()  # advance the scheduler by one epoch

  return {'train': log_train, 'valid': log_valid}


In [None]:
# Hyperparameters
VOCAB_SIZE = len(set(word2id.values())) + 1  # dictionary IDs plus the padding ID
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
LEARNING_RATE = 5e-2
BATCH_SIZE = 32
NUM_EPOCHS = 10

model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE)  # model
criterion = nn.CrossEntropyLoss()  # loss function
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
# Use the GPU when one is available; fall back to the CPU instead of failing on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model with padded mini-batches
train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

epoch: 1, loss_train: 1.2721, accuracy_train: 0.3936, loss_valid: 1.2799, accuracy_valid: 0.4003
epoch: 2, loss_train: 1.1861, accuracy_train: 0.4897, loss_valid: 1.1998, accuracy_valid: 0.4775
epoch: 3, loss_train: 1.1241, accuracy_train: 0.5683, loss_valid: 1.1848, accuracy_valid: 0.5427
epoch: 4, loss_train: 1.1464, accuracy_train: 0.5397, loss_valid: 1.2052, accuracy_valid: 0.5187
epoch: 5, loss_train: 1.1896, accuracy_train: 0.4621, loss_valid: 1.2018, accuracy_valid: 0.4625
epoch: 6, loss_train: 1.1761, accuracy_train: 0.4925, loss_valid: 1.2019, accuracy_valid: 0.4820
epoch: 7, loss_train: 1.1471, accuracy_train: 0.5112, loss_valid: 1.1795, accuracy_valid: 0.4993
epoch: 8, loss_train: 1.2044, accuracy_train: 0.4640, loss_valid: 1.2183, accuracy_valid: 0.4595
epoch: 9, loss_train: 1.2245, accuracy_train: 0.4603, loss_valid: 1.2389, accuracy_valid: 0.4513
epoch: 10, loss_train: 1.2272, accuracy_train: 0.4608, loss_valid: 1.2421, accuracy_valid: 0.4505


**knock84**

In [None]:
import numpy as np
from gensim.models import KeyedVectors

# Load the pretrained word2vec vectors.
# NOTE: the name `model` is reassigned to a network by the later model cells.
model = KeyedVectors.load_word2vec_format('drive/MyDrive/chapter09/GoogleNews-vectors-negative300.bin.gz', binary=True)

# Build the embedding matrix: row k holds the vector of the word whose ID is k.
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300  # dimension of the word vectors
# The model cells pass len(set(word2id.values())) as PADDING_IDX, i.e. the last row.
# That row is left at zero so that padded positions embed to the zero vector.
padding_row = VOCAB_SIZE - 1
weights = np.zeros((VOCAB_SIZE, EMB_SIZE))  # rows that are never assigned stay zero
words_in_pretrained = 0
for word, idx in word2id.items():
  if idx == padding_row:
    continue
  try:
    weights[idx] = model[word]  # index by the word's ID, not by its position in the dict
    words_in_pretrained += 1  # number of words found in the pretrained vectors
  except KeyError:
    weights[idx] = np.random.normal(scale=0.4, size=(EMB_SIZE,))  # random row for words without a pretrained vector
weights = torch.from_numpy(weights.astype((np.float32)))  # numpy array -> float32 torch.Tensor

print(f'学習済みベクトル利用単語数: {words_in_pretrained} / {VOCAB_SIZE}')
print(weights.size())

学習済みベクトル利用単語数: 6514 / 6667
torch.Size([6667, 300])


In [None]:
class RNN(nn.Module):
  """Text classifier: embedding -> (multi-layer, optionally bidirectional) tanh RNN -> linear layer.

  The class scores are computed from the RNN output at the last real (non-padding)
  token of every sequence, so a sequence gives the same result whether or not it was
  padded to the length of a longer sequence in the same batch.

  Args:
    vocab_size: number of rows of the embedding table (ignored when emb_weights is given).
    emb_size: dimension of the word embeddings (ignored when emb_weights is given;
      the width of emb_weights is used instead).
    padding_idx: ID used for padding (may be None when inputs are never padded).
    output_size: number of classes.
    hidden_size: dimension of the RNN hidden state.
    num_layers: number of stacked RNN layers.
    emb_weights: optional (vocab, dim) float tensor of pretrained embeddings;
      nn.Embedding.from_pretrained keeps them frozen by default.
    bidirectional: when True the RNN runs in both directions.
  """
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=None, bidirectional=False):
    super().__init__()
    self.hidden_size = hidden_size  # dimension of the hidden state
    self.num_layers = num_layers
    self.num_directions = 2 if bidirectional else 1  # unidirectional: 1, bidirectional: 2
    self.padding_idx = padding_idx
    if emb_weights is not None:  # initialise the embedding layer from the given weights
      self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    else:
      self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    # Size the RNN input from the embedding layer itself so it always matches the table's width.
    self.rnn = nn.RNN(self.emb.embedding_dim, hidden_size, num_layers, nonlinearity='tanh', bidirectional=bidirectional, batch_first=True)
    self.fc = nn.Linear(hidden_size * self.num_directions, output_size)  # RNN output -> class scores

  def forward(self, x):
    """Return class scores of shape (batch_size, output_size) for ID tensor x of shape (batch_size, seq_len)."""
    self.batch_size = x.size()[0]
    hidden = self.init_hidden(x.device)  # all-zero h0
    emb = self.emb(x)  # (batch_size, seq_len, embedding_dim)
    lengths = self._sequence_lengths(x)
    # Packing makes the RNN stop at the end of every sequence (and makes the backward
    # direction start there) instead of running over the padding.
    packed = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
    packed_out, hidden = self.rnn(packed, hidden)
    out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
    # out.size() = (batch_size, max(lengths), hidden_size * num_directions)
    rows = torch.arange(out.size(0), device=out.device)
    last = (lengths - 1).to(out.device)
    out = self.fc(out[rows, last, :])  # output at the last real token
    # out.size() = (batch_size, output_size)
    return out

  def _sequence_lengths(self, x):
    """Return, as a CPU int64 tensor, each row's length up to its last non-padding token (at least 1)."""
    if self.padding_idx is None:
      return torch.full((x.size(0),), x.size(1), dtype=torch.long)
    positions = torch.arange(1, x.size(1) + 1, device=x.device)  # 1-based positions
    lengths = ((x != self.padding_idx) * positions).max(dim=1).values
    # A row made only of padding would give 0, which packing rejects; treat it as length 1.
    return lengths.clamp(min=1).cpu()

  def init_hidden(self, device):
    """Return the all-zero initial hidden state of shape (num_layers * num_directions, batch_size, hidden_size)."""
    hidden = torch.zeros(self.num_layers * self.num_directions, self.batch_size, self.hidden_size, device=device)
    return hidden

In [None]:
# Hyperparameters
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
NUM_LAYERS = 1
LEARNING_RATE = 5e-2
BATCH_SIZE = 32
NUM_EPOCHS = 10

# Model initialised with the pretrained word vectors, loss function and optimizer
model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, emb_weights=weights)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
# Use the GPU when one is available; fall back to the CPU instead of failing on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model
train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

epoch: 1, loss_train: 1.1695, accuracy_train: 0.4620, loss_valid: 1.1834, accuracy_valid: 0.4708
epoch: 2, loss_train: 1.1678, accuracy_train: 0.4447, loss_valid: 1.1614, accuracy_valid: 0.4460
epoch: 3, loss_train: 1.1000, accuracy_train: 0.5742, loss_valid: 1.1463, accuracy_valid: 0.5412
epoch: 4, loss_train: 1.0367, accuracy_train: 0.6195, loss_valid: 1.0948, accuracy_valid: 0.5787
epoch: 5, loss_train: 1.0422, accuracy_train: 0.6129, loss_valid: 1.1120, accuracy_valid: 0.5667
epoch: 6, loss_train: 1.0449, accuracy_train: 0.6012, loss_valid: 1.1170, accuracy_valid: 0.5600
epoch: 7, loss_train: 0.9810, accuracy_train: 0.6391, loss_valid: 1.0552, accuracy_valid: 0.5967
epoch: 8, loss_train: 0.9410, accuracy_train: 0.6585, loss_valid: 1.0109, accuracy_valid: 0.6229
epoch: 9, loss_train: 0.9351, accuracy_train: 0.6594, loss_valid: 1.0050, accuracy_valid: 0.6222
epoch: 10, loss_train: 0.9392, accuracy_train: 0.6575, loss_valid: 1.0101, accuracy_valid: 0.6192


**knock85**

In [None]:
# Hyperparameters
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50
NUM_LAYERS = 2
LEARNING_RATE = 5e-2
BATCH_SIZE = 32
NUM_EPOCHS = 10

# Two-layer bidirectional model initialised with the pretrained word vectors
model = RNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, emb_weights=weights, bidirectional=True)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
# Use the GPU when one is available; fall back to the CPU instead of failing on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model
train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

epoch: 1, loss_train: 1.1713, accuracy_train: 0.4545, loss_valid: 1.1871, accuracy_valid: 0.4513
epoch: 2, loss_train: 1.1170, accuracy_train: 0.5411, loss_valid: 1.1579, accuracy_valid: 0.5052
epoch: 3, loss_train: 1.0565, accuracy_train: 0.5986, loss_valid: 1.1092, accuracy_valid: 0.5562
epoch: 4, loss_train: 1.0738, accuracy_train: 0.5926, loss_valid: 1.1356, accuracy_valid: 0.5502
epoch: 5, loss_train: 1.0512, accuracy_train: 0.6048, loss_valid: 1.1116, accuracy_valid: 0.5637
epoch: 6, loss_train: 0.9888, accuracy_train: 0.6377, loss_valid: 1.0409, accuracy_valid: 0.6049
epoch: 7, loss_train: 0.9656, accuracy_train: 0.6466, loss_valid: 1.0182, accuracy_valid: 0.6214
epoch: 8, loss_train: 0.9566, accuracy_train: 0.6483, loss_valid: 1.0159, accuracy_valid: 0.6177
epoch: 9, loss_train: 0.9585, accuracy_train: 0.6470, loss_valid: 1.0256, accuracy_valid: 0.6147
epoch: 10, loss_train: 0.9631, accuracy_train: 0.6448, loss_valid: 1.0331, accuracy_valid: 0.6132


**knock86**

In [None]:
#テキスト分類モデル
from torch.nn import functional as F

class CNN(nn.Module):
  """Text classifier: embedding -> one convolution over the token axis -> ReLU -> max pooling over time -> dropout -> linear layer.

  Args:
    vocab_size: number of rows of the embedding table (ignored when emb_weights is given).
    emb_size: dimension of the word embeddings (ignored when emb_weights is given;
      the width of emb_weights is used instead).
    padding_idx: ID used for padding.
    output_size: number of classes.
    out_channels: number of convolution filters.
    kernel_heights: number of consecutive tokens covered by one filter.
    stride: stride of the convolution.
    padding: zero padding added on both ends of the token axis.
    emb_weights: optional (vocab, dim) float tensor of pretrained embeddings;
      nn.Embedding.from_pretrained keeps them frozen by default.
  """
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=None):
    super().__init__()
    if emb_weights is not None:  # initialise the embedding layer from the given weights
      self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    else:
      self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    # The filter spans the full embedding width, so it slides along the token axis only.
    # The width is read from the embedding layer so it always matches the table actually used.
    emb_dim = self.emb.embedding_dim
    self.conv = nn.Conv2d(1, out_channels, (kernel_heights, emb_dim), stride, (padding, 0))  # one input channel
    self.drop = nn.Dropout(0.3)  # regularisation against overfitting
    self.fc = nn.Linear(out_channels, output_size)  # pooled features -> class scores

  def forward(self, x):
    """Return class scores of shape (batch_size, output_size) for ID tensor x of shape (batch_size, seq_len)."""
    # x.size() = (batch_size, seq_len)
    emb = self.emb(x).unsqueeze(1)
    # emb.size() = (batch_size, 1, seq_len, embedding_dim)
    conv = self.conv(emb)
    # conv.size() = (batch_size, out_channels, conv_len, 1)
    act = F.relu(conv.squeeze(3))  # drop the width-1 last dimension, then apply ReLU
    # act.size() = (batch_size, out_channels, conv_len)
    max_pool = F.max_pool1d(act, act.size()[2])  # maximum over the whole token axis
    # max_pool.size() = (batch_size, out_channels, 1)
    out = self.fc(self.drop(max_pool.squeeze(2)))
    # out.size() = (batch_size, output_size)
    return out

In [None]:
# Hyperparameters
PADDING_IDX = len(set(word2id.values()))  # padding ID = number of distinct IDs in the dictionary
VOCAB_SIZE = PADDING_IDX + 1  # dictionary IDs plus the padding ID
EMB_SIZE = 300
OUTPUT_SIZE = 4
OUT_CHANNELS = 100
KERNEL_HEIGHTS = 3  # number of tokens covered by one convolution filter
STRIDE = 1
PADDING = 1

# Model definition
model = CNN(
  vocab_size=VOCAB_SIZE,
  emb_size=EMB_SIZE,
  padding_idx=PADDING_IDX,
  output_size=OUTPUT_SIZE,
  out_channels=OUT_CHANNELS,
  kernel_heights=KERNEL_HEIGHTS,
  stride=STRIDE,
  padding=PADDING,
  emb_weights=weights,
)

# Class probabilities for the first 10 training examples
for i in range(10):
  X = dataset_train[i]['inputs']
  logits = model(X.unsqueeze(0))  # add the batch dimension
  print(logits.softmax(dim=-1))

tensor([[0.2258, 0.2948, 0.2645, 0.2149]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2236, 0.2629, 0.2687, 0.2448]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2170, 0.2726, 0.2828, 0.2276]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2178, 0.3150, 0.2420, 0.2252]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2385, 0.2681, 0.2478, 0.2456]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2145, 0.3273, 0.2680, 0.1902]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2311, 0.2439, 0.3267, 0.1983]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2513, 0.2635, 0.2694, 0.2159]], grad_fn=<SoftmaxBackward0>)
tensor([[0.2373, 0.2636, 0.2647, 0.2344]], grad_fn=<SoftmaxBackward0>)
tensor([[0.1971, 0.2009, 0.3086, 0.2935]], grad_fn=<SoftmaxBackward0>)


**knock87**

In [None]:
# Hyperparameters
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
OUT_CHANNELS = 100
KERNEL_HEIGHTS = 3
STRIDE = 1
PADDING = 1
LEARNING_RATE = 5e-2
BATCH_SIZE = 64
NUM_EPOCHS = 10

# Model definition
model = CNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING, emb_weights=weights)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
# Use the GPU when one is available; fall back to the CPU instead of failing on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Train the model
log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

epoch: 1, loss_train: 1.0855, accuracy_train: 0.5352, loss_valid: 1.0886, accuracy_valid: 0.5420
epoch: 2, loss_train: 1.0082, accuracy_train: 0.6297, loss_valid: 1.0237, accuracy_valid: 0.6079
epoch: 3, loss_train: 0.9292, accuracy_train: 0.6719, loss_valid: 0.9616, accuracy_valid: 0.6484
epoch: 4, loss_train: 0.8668, accuracy_train: 0.6951, loss_valid: 0.9146, accuracy_valid: 0.6717
epoch: 5, loss_train: 0.8222, accuracy_train: 0.7127, loss_valid: 0.8802, accuracy_valid: 0.6852
epoch: 6, loss_train: 0.7891, accuracy_train: 0.7237, loss_valid: 0.8601, accuracy_valid: 0.6934
epoch: 7, loss_train: 0.7674, accuracy_train: 0.7319, loss_valid: 0.8448, accuracy_valid: 0.7039
epoch: 8, loss_train: 0.7541, accuracy_train: 0.7351, loss_valid: 0.8367, accuracy_valid: 0.7016
epoch: 9, loss_train: 0.7479, accuracy_train: 0.7369, loss_valid: 0.8326, accuracy_valid: 0.7024
epoch: 10, loss_train: 0.7463, accuracy_train: 0.7369, loss_valid: 0.8316, accuracy_valid: 0.7039


**knock88**

In [None]:
#テキスト分類モデル　複数の畳み込み層
from torch.nn import functional as F

class textCNN(nn.Module):
  """Text classifier with several parallel convolutions of different filter heights.

  Every convolution is followed by ReLU and max pooling over time; the pooled
  features of all convolutions are concatenated, passed through dropout and a linear layer.

  Args:
    vocab_size: number of rows of the embedding table (ignored when emb_weights is given).
    emb_size: dimension of the word embeddings (ignored when emb_weights is given;
      the width of emb_weights is used instead).
    padding_idx: ID used for padding.
    output_size: number of classes.
    out_channels: number of filters of each convolution.
    conv_params: sequence of (kernel_height, padding) pairs, one per convolution.
    drop_rate: dropout probability applied to the concatenated features.
    emb_weights: optional (vocab, dim) float tensor of pretrained embeddings;
      nn.Embedding.from_pretrained keeps them frozen by default.
  """
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, conv_params, drop_rate, emb_weights=None):
    super().__init__()
    if emb_weights is not None:  # initialise the embedding layer from the given weights
      self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    else:
      self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
    # Every filter spans the full embedding width; read it from the embedding layer
    # so it always matches the table actually used.
    emb_dim = self.emb.embedding_dim
    self.convs = nn.ModuleList([
      nn.Conv2d(1, out_channels, (kernel_height, emb_dim), padding=(padding, 0))
      for kernel_height, padding in conv_params
    ])
    self.drop = nn.Dropout(drop_rate)  # regularisation against overfitting
    self.fc = nn.Linear(len(conv_params) * out_channels, output_size)  # concatenated features -> class scores

  def forward(self, x):
    """Return class scores of shape (batch_size, output_size) for ID tensor x of shape (batch_size, seq_len)."""
    emb = self.emb(x).unsqueeze(1)  # (batch_size, 1, seq_len, embedding_dim)
    # One feature map per convolution: (batch_size, out_channels, conv_len)
    feature_maps = [F.relu(conv(emb)).squeeze(3) for conv in self.convs]
    # Maximum over the whole token axis: (batch_size, out_channels, 1) each
    max_pool = [F.max_pool1d(fmap, fmap.size(2)) for fmap in feature_maps]
    max_pool_cat = torch.cat(max_pool, 1)  # (batch_size, len(convs) * out_channels, 1)
    out = self.fc(self.drop(max_pool_cat.squeeze(2)))
    return out

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/390.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optu

In [None]:
import optuna

def objective(trial):
  """Optuna objective: train a textCNN with sampled hyperparameters and return its validation loss.

  Args:
    trial: optuna trial used to sample the hyperparameters.

  Returns:
    Loss on dataset_valid after training (the quantity the study minimises).
  """
  # Hyperparameters to tune
  emb_size = trial.suggest_int('emb_size', 100, 400, step=100)
  out_channels = trial.suggest_int('out_channels', 50, 200, step=50)
  drop_rate = trial.suggest_float('drop_rate', 0.0, 0.5, step=0.1)
  learning_rate = trial.suggest_float('learning_rate', 5e-4, 5e-2, log=True)
  momentum = trial.suggest_float('momentum', 0.5, 0.9, step=0.1)
  batch_size = trial.suggest_int('batch_size', 16, 128, step=16)

  # Fixed hyperparameters
  VOCAB_SIZE = len(set(word2id.values())) + 1
  PADDING_IDX = len(set(word2id.values()))
  OUTPUT_SIZE = 4
  CONV_PARAMS = [[2, 0], [3, 1], [4, 2]]
  NUM_EPOCHS = 30

  # Model definition.
  # The pretrained vectors have a fixed width, so they can only be used when the sampled
  # embedding size matches it; otherwise the embeddings are trained from scratch.
  pretrained = weights if weights.size(1) == emb_size else None
  model = textCNN(VOCAB_SIZE, emb_size, PADDING_IDX, OUTPUT_SIZE, out_channels, CONV_PARAMS, drop_rate, emb_weights=pretrained)

  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)
  # torch.cuda.set_device() returns None, so build an explicit device object instead.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  log = train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)
  loss_valid, _ = calculate_loss_and_accuracy(model, dataset_valid, device, criterion=criterion)

  return loss_valid

In [None]:
# Run the hyperparameter search (stops starting new trials after 7200 seconds)
study = optuna.create_study()
study.optimize(objective, timeout=7200)

# Report the best trial
print('Best trial:')
trial = study.best_trial
print(f'  Value: {trial.value:.3f}')
print('  Params: ')
for key, value in trial.params.items():
  print(f'    {key}: {value}')

In [None]:
# パラメータの設定
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = int(trial.params['emb_size'])
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
OUT_CHANNELS = int(trial.params['out_channels'])
CONV_PARAMS = [[2, 0], [3, 1], [4, 2]]
DROP_RATE = trial.params['drop_rate']
LEARNING_RATE = trial.params['learning_rate']
BATCH_SIZE = int(trial.params['batch_size'])
NUM_EPOCHS = 30

#モデルの定義
model = textCNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, CONV_PARAMS, DROP_RATE, emb_weights=weights)
print(model)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)
device = torch.cuda.set_device(0)


log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

**knock89**

In [None]:
#...