# 第9章: RNN, CNN

## 80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

In [1]:
!ls ../chapter06/work/

test.feature.npy  train.feature.npy  valid.feature.npy
test.txt	  train.txt	     valid.txt


In [2]:
!head ../chapter06/work/train.txt

Greek 10-year yields rise day after five-year sale	b
Tori Spelling - Tori Spelling in 'crisis mode'?	e
German Stocks Rise After Two-Week Rally as Sky Deutschland Gains	b
Sinéad O'Connor Gets New Look For 'I'm Not Bossy, I'm The Boss'	e
Tom Cruise - Tom Cruise Surprises Fans At Movie Theatre	e
BLOGS OF THE DAY: Brad Pitt returns to World War II	e
Microsoft CEO Said to Unveil Office for IPad on March 27	t
Even Slightly High Blood Pressure Could Raise Stroke Risk	m
US Yield Over Japan Double Year-Ago Level as BOJ Holds Policy	b
Keri Russell - Keri Russell: Andy Serkis 'unbelievable'	e


In [1]:
from logzero import logger
from collections import Counter
from collections import defaultdict
from itertools import islice
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/y_kishinami/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Build the vocabulary: assign ID numbers to the words that occur in the training data
def make_vocab(train_path, vocab_path, min_thr):
    """Write a frequency-ranked vocabulary file for the training data.

    Every line of ``train_path`` must be ``title<TAB>category``.  Titles are
    tokenized with ``nltk.word_tokenize`` and the tokens are counted.  Tokens
    are ranked by descending frequency (rank 1 = most frequent, ties keep
    first-seen order) and every token whose frequency is at least ``min_thr``
    is written to ``vocab_path`` as ``token<TAB>rank``.

    Args:
        train_path: Path of the tab-separated training file.
        vocab_path: Path of the vocabulary file to create (overwritten).
        min_thr: Minimum frequency a token needs in order to be written.

    Raises:
        ValueError: If a line does not hold exactly two tab-separated fields.
    """
    with open(train_path) as file, open(vocab_path, 'w') as vocab_file:
        c = Counter()

        for line in file:
            title, _category = line.strip().split('\t')
            c.update(nltk.word_tokenize(title))

        # most_common() is sorted by descending count, so the first entry
        # below the threshold means every remaining entry is below it too.
        for i, (key, f) in enumerate(c.most_common(), start=1):
            if f < min_thr:
                break
            print(key, i, sep='\t', file=vocab_file)
                
# Load the vocabulary file
def load_vocab(vocab_path):
    """Read a ``word<TAB>id`` vocabulary file into a word -> ID mapping.

    The result is a ``defaultdict(int)``, so indexing a missing word gives 0.
    """
    mapping = defaultdict(int)
    with open(vocab_path) as handle:
        entries = (row.strip().split('\t') for row in handle)
        for token, token_id in entries:
            mapping[token] = int(token_id)
    return mapping

In [3]:
# Absolute paths of the training data (chapter06/work) and of the vocabulary file written for this chapter
train_path = '/Users/y_kishinami/Documents/100knock-2020/y-kishinami/chapter06/work/train.txt'
vocab_path = '/Users/y_kishinami/Documents/100knock-2020/y-kishinami/chapter09/work/vocab.txt'

In [183]:
# Build the vocabulary (only needs to be run once); the threshold 2 keeps words seen at least twice
make_vocab(train_path, vocab_path, 2)

In [5]:
!head work/vocab.txt

to	1
...	2
,	3
's	4
'	5
in	6
:	7
on	8
as	9
-	10


In [4]:
# Load the vocabulary (word -> ID lookup dictionary)
word2id = load_vocab(vocab_path)
print('vocab size: {}'.format(len(word2id) + 1))  # +1 counts ID 0, which is used for unknown words

vocab size: 9817


In [5]:
# Convert a list of words into a sequence of ID numbers
def words2id(words, word2id):
    """Map each word to its vocabulary ID, using 0 for words not in ``word2id``.

    Args:
        words: Iterable of token strings.
        word2id: Mapping from token to integer ID.

    Returns:
        A 1-D ``torch.int64`` tensor holding one ID per word.
    """
    # .get() never inserts into a defaultdict, and the explicit dtype keeps
    # an empty word list integer-typed (it would otherwise become float32).
    return torch.tensor([word2id.get(word, 0) for word in words], dtype=torch.int64)

# Example: tokenize one title and print both the tokens and their ID sequence
sentence = "Tracy Morgan - Tracy Morgan is 'doing better' after crash"
tokens = nltk.word_tokenize(sentence)
print(tokens)
print(words2id(tokens, word2id))

['Tracy', 'Morgan', '-', 'Tracy', 'Morgan', 'is', "'doing", 'better', "'", 'after', 'crash']
tensor([1557,  379,   10, 1557,  379,   47, 8988, 1906,    5,   30, 1446])


## 81. RNNによる予測
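ID番号で表現された単語列$\boldsymbol{x} = (x_1, x_2, \dots, x_T)$がある．ただし，$T$は単語列の長さ，$x_t \in \mathbb{R}^{V}$は単語のID番号のone-hot表記である（$V$は単語の総数である）．再帰型ニューラルネットワーク（RNN: Recurrent Neural Network）を用い，単語列$\boldsymbol{x}$からカテゴリ$y$を予測するモデルとして，次式を実装せよ．

$$
\overrightarrow{h}_0 = 0, \quad
\overrightarrow{h}_t = \overrightarrow{\mathrm{RNN}}(\mathrm{emb}(x_t), \overrightarrow{h}_{t-1}), \quad
y = \mathrm{softmax}(W^{(yh)} \overrightarrow{h}_T + b^{(y)})
$$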

- https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

In [6]:
# Turn the words of one title into a (number of tokens) x (vocabulary size) matrix of one-hot rows
def title2id(title, vocab):
    """Encode a title string as one one-hot row per token.

    The title is tokenized with ``nltk.word_tokenize`` and converted to IDs by
    ``words2id``; the matrix has ``len(vocab) + 1`` columns so that ID 0 has a
    column of its own.
    """
    token_ids = words2id(nltk.word_tokenize(title), vocab)
    one_hot = torch.zeros(len(token_ids), len(vocab) + 1)  # one extra column for ID 0
    for row, token_id in zip(one_hot, token_ids):
        row[token_id] = 1
    return one_hot

# Category letter -> category ID
def cat2id(y):
    """Return the class index of category letter ``y`` as a 0-d int64 tensor.

    A letter other than 'b', 't', 'e' or 'm' raises ``KeyError``.
    """
    label_ids = {
        'b': 0,
        't': 1,
        'e': 2,
        'm': 3,
    }
    index = label_ids[y]
    return torch.tensor(index, dtype=torch.int64)

# Generator producing the ID sequence of each title together with its category ID
def gen_sample(path):
    """Lazily yield one ``(token-ID tensor, category-ID tensor)`` pair per line of ``path``.

    Titles are converted with the module-level ``word2id`` vocabulary.
    """
    with open(path) as source:
        for record in source:
            fields = record.strip().split('\t')
            title, category = fields
            token_ids = words2id(nltk.word_tokenize(title), word2id)
            yield token_ids, cat2id(category)

In [115]:
# RNN classifier
class RNN(nn.Module):
    """Single-layer tanh RNN over word embeddings followed by a linear output layer."""

    def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
        """Create the embedding, recurrent and output layers.

        Args:
            vocab_size: Number of rows of the embedding table.
            emb_size: Dimension of the word embeddings.
            padding_idx: Embedding index used for padding positions.
            output_size: Number of output classes.
            hidden_size: Dimension of the RNN hidden state.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # ID 0 already stands for unknown words, so padding is given an index of its own.
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='tanh', batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size, bias=True)

    def forward(self, x, x_len):
        """Return the class logits, shape ``(batch_size, output_size)``.

        Args:
            x: Integer tensor of (padded) word IDs, shape ``(batch_size, seq_len)``.
            x_len: True length of every sequence in ``x`` (list or CPU tensor), in any order.
        """
        self.batch_size = x.size()[0]
        emb = self.emb(x)  # (batch_size, seq_len, emb_size)
        # Allocate the initial state next to the embeddings so the module also works off-CPU.
        hidden = torch.zeros(1, self.batch_size, self.hidden_size, dtype=emb.dtype, device=emb.device)
        # enforce_sorted=False lifts the requirement that the batch is ordered longest-first.
        packed = nn.utils.rnn.pack_padded_sequence(emb, x_len, batch_first=True, enforce_sorted=False)
        _, h_n = self.rnn(packed, hidden)  # h_n: (1, batch_size, hidden_size)
        logit = self.fc(h_n[-1])  # (batch_size, output_size)
        return logit

In [10]:
vocab_size = len(word2id) + 1
emb_size = 300
# NOTE(review): make_vocab numbers words starting from 1, so index len(word2id) looks like it is
# also the ID of the last vocabulary word - confirm that it is really free to be used for padding.
padding_idx = len(word2id)  # the index equal to the vocabulary size is used as the padding index
output_size = 4  # number of categories
hidden_size = 50

rnn = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size)
inputs = torch.tensor([[1,4,6,3,5], [1,2,3,4,0], [1,1,0,0,0]])  # arbitrary ID sequences used as example input

print('batch size: ', inputs.size()[0])  # batch_size
print('(batch_size, output_size) : ', rnn(inputs, [5, 4, 3]).size())  # [batch_size, output_size]

# Look at the (untrained) predictions for the first 10 training examples
for x,y in islice(gen_sample(train_path), 10):
    x_lens = [len(x)]
    print('sequence length:', x_lens)
    print(F.softmax(rnn(x.unsqueeze(0), x_lens), dim=-1))

batch size:  3
(batch_size, output_size) :  torch.Size([3, 4])
sequence length: [8]
tensor([[0.2681, 0.1576, 0.1917, 0.3826]], grad_fn=<SoftmaxBackward>)
sequence length: [10]
tensor([[0.3945, 0.3003, 0.1291, 0.1760]], grad_fn=<SoftmaxBackward>)
sequence length: [10]
tensor([[0.2202, 0.1835, 0.3774, 0.2189]], grad_fn=<SoftmaxBackward>)
sequence length: [17]
tensor([[0.3192, 0.2363, 0.2576, 0.1870]], grad_fn=<SoftmaxBackward>)
sequence length: [10]
tensor([[0.2219, 0.3218, 0.2654, 0.1909]], grad_fn=<SoftmaxBackward>)
sequence length: [12]
tensor([[0.1577, 0.2620, 0.4155, 0.1648]], grad_fn=<SoftmaxBackward>)
sequence length: [11]
tensor([[0.2623, 0.2180, 0.2142, 0.3055]], grad_fn=<SoftmaxBackward>)
sequence length: [9]
tensor([[0.2295, 0.2533, 0.4168, 0.1005]], grad_fn=<SoftmaxBackward>)
sequence length: [11]
tensor([[0.2379, 0.2031, 0.4576, 0.1014]], grad_fn=<SoftmaxBackward>)
sequence length: [10]
tensor([[0.2685, 0.2758, 0.2877, 0.1681]], grad_fn=<SoftmaxBackward>)


## 82. 確率的勾配降下法による学習
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

In [7]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [8]:
# Read a tab-separated text data file
def load_file(path):
    """Return ``(x, y)``: the tokenized titles and the category labels, one entry per line of ``path``."""
    titles, labels = [], []
    with open(path) as source:
        for row in source:
            title, category = row.strip().split('\t')
            titles.append(nltk.word_tokenize(title))
            labels.append(category)
    return titles, labels
    
    
# Dataset class
class CreateDataset(Dataset):
    """Map-style dataset pairing tokenized titles with their category labels."""

    def __init__(self, x, y, vocab):
        self.x, self.y, self.vocab = x, y, vocab

    def __len__(self):
        # One sample per label.
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(token-ID tensor, category-ID tensor)`` pair of sample ``index``."""
        token_ids = words2id(self.x[index], self.vocab)
        label = cat2id(self.y[index])
        return token_ids, label

In [13]:
# Read the text data as separate lists of titles and categories
train_path = '/Users/y_kishinami/Documents/100knock-2020/y-kishinami/chapter06/work/train.txt'
valid_path = '/Users/y_kishinami/Documents/100knock-2020/y-kishinami/chapter06/work/valid.txt'

x_train, y_train = load_file(train_path)
x_valid, y_valid = load_file(valid_path)

# Create the CreateDataset instances
train_dataset = CreateDataset(x_train, y_train, word2id)
valid_dataset = CreateDataset(x_valid, y_valid, word2id)
print(train_dataset[0])
print('train data: {} samples'.format(len(train_dataset)))

(tensor([2243, 3414,  388,  112,  381,   30, 4102, 1166]), tensor(0))
train data: 10684 samples


In [33]:
# Compute the loss and the accuracy
def calc_loss_and_acc(model, dataset, criterion):
    """Evaluate ``model`` on ``dataset`` one sample at a time.

    Samples are fed in dataset order with batch size 1, so no padding is
    needed.  The model is put into eval mode for the duration of the pass and
    its previous train/eval mode is restored afterwards.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        dataset: Dataset yielding ``(token-ID tensor, label tensor)`` pairs.
        criterion: Loss function called as ``criterion(outputs, labels)``.

    Returns:
        ``(mean loss per sample, accuracy)``.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)  # no shuffling
    total_loss, total, correct = 0., 0, 0
    was_training = model.training
    model.eval()  # evaluation must not apply train-only behaviour such as dropout
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                outputs = model(inputs, [inputs[0].shape[0]])
                total_loss += criterion(outputs, labels).item()

                pred = torch.argmax(outputs, dim=-1)
                total += len(inputs)
                correct += (pred == labels).sum().item()
    finally:
        model.train(was_training)

    return total_loss / len(dataloader), correct / total


# train
def train(model, train_dataset, valid_dataset, batch_size, criterion, optimizer, epoch, collate_fn=None):
    """Train ``model`` and report train/validation loss and accuracy after every epoch.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        train_dataset: Dataset used for the parameter updates.
        valid_dataset: Dataset used only for evaluation.
        batch_size: Mini-batch size of the training loader.
        criterion: Loss function.
        optimizer: Optimizer holding the parameters of ``model``.
        epoch: Number of passes over the training data.
        collate_fn: Optional batch collation.  It may yield ``(inputs, labels)``
            or ``(inputs, labels, lengths)``; without lengths every row of
            ``inputs`` is taken to be unpadded.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    logger.info('dataset loaded. ')
    logger.info('train data: {} samples'.format(len(train_dataset)))
    logger.info('valid data: {} samples'.format(len(valid_dataset)))
    logger.info('training start.')

    for epoch_idx in range(epoch):
        for batch in train_dataloader:
            if len(batch) == 3:
                inputs, labels, lengths = batch
            else:
                inputs, labels = batch
                # The default collate only stacks equal-length sequences: one full length per row.
                lengths = [inputs.shape[1]] * inputs.shape[0]
            optimizer.zero_grad()
            outputs = model(inputs, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        train_loss, train_acc = calc_loss_and_acc(model, train_dataset, criterion)
        valid_loss, valid_acc = calc_loss_and_acc(model, valid_dataset, criterion)

        print('epoch: {} done. '.format(epoch_idx + 1))
        print('train loss: {}\ttrain acc: {}'.format(train_loss, train_acc))
        print('valid loss: {}\tvalid acc: {}'.format(valid_loss, valid_acc))
    logger.info('training done.')

In [193]:
# Set the hyperparameters
vocab_size = len(word2id) + 1
emb_size = 300
# NOTE(review): make_vocab numbers words starting from 1, so vocab_size - 1 looks like it is also
# the ID of the last vocabulary word - confirm that it is really free to be used for padding.
padding_idx = vocab_size - 1
output_size = 4
hidden_size = 50
lr = 0.005
epoch = 5
batch_size = 1

# Initialize the model
rnn = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(rnn.parameters(), lr=lr)

# train
train(rnn, train_dataset, valid_dataset, batch_size, criterion, optimizer, epoch)

[I 210118 16:02:00 <ipython-input-192-a966ca17b27e>:21] dataset loaded. 
[I 210118 16:02:00 <ipython-input-192-a966ca17b27e>:22] train data: 10684 samples
[I 210118 16:02:00 <ipython-input-192-a966ca17b27e>:23] valid data: 1336 samples
[I 210118 16:02:00 <ipython-input-192-a966ca17b27e>:24] training start.


epoch: 1 done. 
train loss: 0.8085591446159206	train acc: 0.7227630101085736
valid loss: 0.8557747856875558	valid acc: 0.7013473053892215
epoch: 2 done. 
train loss: 0.7315529478012676	train acc: 0.7323099962560838
valid loss: 0.8814415743882618	valid acc: 0.6833832335329342
epoch: 3 done. 
train loss: 0.5407061495804076	train acc: 0.7982029202545863
valid loss: 0.7041170801409704	valid acc: 0.749251497005988
epoch: 4 done. 
train loss: 0.5042316751375562	train acc: 0.8137401722201423
valid loss: 0.7490548531662166	valid acc: 0.7327844311377245


[I 210118 16:10:48 <ipython-input-192-a966ca17b27e>:39] training done.


epoch: 5 done. 
train loss: 0.4308029864901095	train acc: 0.8400411830774991
valid loss: 0.7236936965320834	valid acc: 0.7432634730538922


## 83. ミニバッチ化・GPU上での学習
問題82のコードを改変し，B事例ごとに損失・勾配を計算して学習を行えるようにせよ（Bの値は適当に選べ）．また，GPU上で学習を実行せよ．

- GPUの学習に使用したコードはsrcディレクトリにあります。（q83.py）
- batch_size=8, epoch=30, lr=0.005で実行。validでのaccが最大のチェックポイントのみ保存
    - 時間 7分33秒
    - train loss: 0.34684245115420537	train acc: 0.8740172220142269
    - valid loss: 0.6549646953585443	valid acc: 0.7776946107784432
    - save checkpoint epoch : 28 acc : 0.7776946107784432

In [14]:
# Sequence lengths need care when batching:
# the inputs differ in length, so they are padded to a common length within each batch.
# This is done with a custom collate_fn (which controls how a batch is assembled).

class Padsequence():
    """collate_fn that pads every mini-batch to the length of its longest sequence."""

    def __init__(self, padding_idx):
        self.padding_idx = padding_idx

    def __call__(self, batch):
        """Return ``(padded sequences, labels, lengths)``, ordered longest sequence first."""
        # Longest-first is the order pack_padded_sequence expects by default.
        ordered = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True)
        sequences = [sample[0] for sample in ordered]
        lens = torch.LongTensor([seq.shape[0] for seq in sequences])
        labels = torch.LongTensor([sample[1] for sample in ordered])
        padded = torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=self.padding_idx,
        )
        return padded, labels, lens

In [161]:
# What does the data coming out of the plain DataLoader (no custom collate_fn) look like?
for x,y in islice(DataLoader(train_dataset, batch_size=1, collate_fn=None), 5):
    print(x, y)

tensor([[2243, 3414,  388,  112,  381,   30, 4102, 1166]]) tensor([0])
tensor([[ 960, 1092,   10,  960, 1092,    6,    0, 6637,    5,   34]]) tensor([2])
tensor([[ 327,   97,  174,   41, 5105,  645,    9, 4103, 6638,  359]]) tensor([0])
tensor([[   0, 6639,  428,   33,  610,   31,    5,   88,  681,  103, 2012,    3,
           88,  681,   14, 4104,    5]]) tensor([2])
tensor([[1167, 1808,   10, 1167, 1808, 5106,  403,   49,  137, 3415]]) tensor([2])


In [162]:
# How to obtain the sequence length of each sample in a batch
# (islice() takes positional arguments only, so the DataLoader is passed directly)
for x in islice(DataLoader(train_dataset, batch_size=1, collate_fn=None), 10):
    print('sequence length : {}'.format(x[0][0].shape[0]))

sequence len : 8
sequence len : 10
sequence len : 10
sequence len : 17
sequence len : 10
sequence len : 12
sequence len : 11
sequence len : 9
sequence len : 11
sequence len : 10


In [176]:
# Result of padding (9999 is the padding value handed to Padsequence)
for d in islice(DataLoader(train_dataset, batch_size=4, collate_fn=Padsequence(9999)), 1):
    print(d[0])

tensor([[   0, 6639,  428,   33,  610,   31,    5,   88,  681,  103, 2012,    3,
           88,  681,   14, 4104,    5],
        [ 960, 1092,   10,  960, 1092,    6,    0, 6637,    5,   34, 9999, 9999,
         9999, 9999, 9999, 9999, 9999],
        [ 327,   97,  174,   41, 5105,  645,    9, 4103, 6638,  359, 9999, 9999,
         9999, 9999, 9999, 9999, 9999],
        [2243, 3414,  388,  112,  381,   30, 4102, 1166, 9999, 9999, 9999, 9999,
         9999, 9999, 9999, 9999, 9999]])


In [175]:
# Result of padding: all three elements returned by Padsequence
for d in islice(DataLoader(train_dataset, batch_size=4, collate_fn=Padsequence(9999)), 1):
    print(d[0])  # word-ID sequences of the batch
    print(d[1])  # gold labels of the batch
    print(d[2])  # original number of tokens of each sequence in the batch

tensor([[   0, 6639,  428,   33,  610,   31,    5,   88,  681,  103, 2012,    3,
           88,  681,   14, 4104,    5],
        [ 960, 1092,   10,  960, 1092,    6,    0, 6637,    5,   34, 9999, 9999,
         9999, 9999, 9999, 9999, 9999],
        [ 327,   97,  174,   41, 5105,  645,    9, 4103, 6638,  359, 9999, 9999,
         9999, 9999, 9999, 9999, 9999],
        [2243, 3414,  388,  112,  381,   30, 4102, 1166, 9999, 9999, 9999, 9999,
         9999, 9999, 9999, 9999, 9999]])
tensor([2, 2, 0, 0])
tensor([17, 10, 10,  8])


In [173]:
# The output below was produced with batch_first=False in the padding step.
# The batch then ends up on the second axis; batch_first=True brings the batch size to the first axis.
for d in islice(DataLoader(train_dataset, batch_size=4, collate_fn=Padsequence(9999)), 1):
    print(d)

(tensor([[   0,  960,  327, 2243],
        [6639, 1092,   97, 3414],
        [ 428,   10,  174,  388],
        [  33,  960,   41,  112],
        [ 610, 1092, 5105,  381],
        [  31,    6,  645,   30],
        [   5,    0,    9, 4102],
        [  88, 6637, 4103, 1166],
        [ 681,    5, 6638, 9999],
        [ 103,   34,  359, 9999],
        [2012, 9999, 9999, 9999],
        [   3, 9999, 9999, 9999],
        [  88, 9999, 9999, 9999],
        [ 681, 9999, 9999, 9999],
        [  14, 9999, 9999, 9999],
        [4104, 9999, 9999, 9999],
        [   5, 9999, 9999, 9999]]), tensor([2, 2, 0, 0]), tensor([17, 10, 10,  8]))


## 84. 単語ベクトルの導入
事前学習済みの単語ベクトル（例えば，Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル）で単語埋め込みemb(x)を初期化し，学習せよ．

- GPUの学習に使用したコードはsrcディレクトリにあります。（q84.py）
- batch_size=8, epoch=30, lr=0.005で実行。validでのaccが最大のチェックポイントのみ保存
    - train loss: 0.2578477888207111	train acc: 0.9141707225758143
    - valid loss: 0.3482527509035437	valid acc: 0.8802395209580839
    - save checkpoint epoch : 29 acc : 0.8802395209580839

In [204]:
import gensim
# Load the pretrained Google News word2vec vectors (binary word2vec format)
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
# NOTE(review): `.vocab` is the gensim 3.x attribute; it presumably needs `key_to_index` on gensim >= 4 - confirm the installed version
print(len(model.vocab))

In [261]:
# (vocab_size, 300) weight matrix; float32 matches the default dtype of the other layers
pretrained_emb_w = np.zeros((vocab_size, 300), dtype=np.float32)
# Walk over the vocabulary: k is the word, v its ID (= row of the weight matrix)
for k, v in word2id.items():
    if v == 0:
        continue  # row 0 (unknown words) stays all-zero
    try:
        # Copy the 300-dimensional pretrained vector of the word into its row
        pretrained_emb_w[v] = model[k]
    except KeyError:
        # Words missing from the pretrained model keep an all-zero row
        continue
print(pretrained_emb_w.shape)

# The result is a vocab_size x emb_size(300) numpy matrix,
# which is installed as the weight of the embedding layer (last row = padding index)
emb = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_emb_w), padding_idx=vocab_size - 1)

(9817, 300)


In [130]:
%%file 'src/q84.py'
import argparse
from os import path
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import defaultdict
import torch.nn as nn
import nltk
import gensim
import numpy as np
from logzero import logger
nltk.download('punkt')

# Use the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


class CreateDataset(Dataset):
    """Map-style dataset pairing tokenized titles with their category labels."""

    def __init__(self, x, y, vocab):
        self.x, self.y, self.vocab = x, y, vocab

    def __len__(self):
        # One sample per label.
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(token-ID tensor, category-ID tensor)`` pair of sample ``index``."""
        token_ids = words2id(self.x[index], self.vocab)
        label = cat2id(self.y[index])
        return token_ids, label


class RNN(nn.Module):
    """Single-layer tanh RNN classifier whose embeddings are initialized from pretrained vectors."""

    def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, vec_path, word2id):
        """Create the layers and move them to the module-level ``device``.

        Args:
            vocab_size: Number of rows of the embedding table.
            emb_size: Input size of the RNN (must equal the pretrained vector size).
            padding_idx: Embedding index used for padding positions.
            output_size: Number of output classes.
            hidden_size: Dimension of the RNN hidden state.
            vec_path: Path of the pretrained word2vec file.
            word2id: Mapping from word to embedding row.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # ID 0 already stands for unknown words, so padding is given an index of its own.
        # NOTE(review): from_pretrained() freezes the weights by default (freeze=True);
        # pass freeze=False if the embeddings are meant to be fine-tuned.
        self.emb = nn.Embedding.from_pretrained(load_pretrained_vector(vec_path, word2id, vocab_size), padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='tanh', batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size, bias=True)
        # Must come after the sub-modules are registered: on a still-empty module .to() moves nothing.
        self.to(device)

    def forward(self, x, x_len):
        """Return the class logits, shape ``(batch_size, output_size)``.

        The computation runs on the device of the parameters; the logits are
        handed back on the device ``x`` arrived on, so a caller that keeps its
        labels next to ``x`` can compute the loss without a device mismatch.

        Args:
            x: Integer tensor of padded word IDs, shape ``(batch_size, seq_len)``.
            x_len: True length of every sequence in ``x`` (list or tensor), in any order.
        """
        caller_device = x.device
        x = x.to(self.emb.weight.device)
        self.batch_size = x.size()[0]
        emb = self.emb(x)  # (batch_size, seq_len, emb_size)
        hidden = torch.zeros(1, self.batch_size, self.hidden_size, dtype=emb.dtype, device=emb.device)
        # pack_padded_sequence wants the lengths as a CPU int64 tensor.
        lengths = torch.as_tensor(x_len, dtype=torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
        _, h_n = self.rnn(packed, hidden)  # h_n: (1, batch_size, hidden_size)
        logit = self.fc(h_n[-1])  # (batch_size, output_size)
        return logit.to(caller_device)


class Padsequence():
    """collate_fn that pads every mini-batch to the length of its longest sequence."""

    def __init__(self, padding_idx):
        self.padding_idx = padding_idx

    def __call__(self, batch):
        """Return ``(padded sequences, labels, lengths)``, ordered longest sequence first."""
        ordered = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True)
        sequences = [sample[0] for sample in ordered]
        lens = torch.LongTensor([seq.shape[0] for seq in sequences])
        labels = torch.LongTensor([sample[1] for sample in ordered])
        padded = torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=self.padding_idx,
        )
        return padded, labels, lens


# Read a tab-separated text data file
def load_file(path):
    """Return ``(x, y)``: the tokenized titles and the category labels, one entry per line of ``path``."""
    titles, labels = [], []
    with open(path) as source:
        for row in source:
            title, category = row.strip().split('\t')
            titles.append(nltk.word_tokenize(title))
            labels.append(category)
    return titles, labels


# Convert a list of words into a sequence of ID numbers
def words2id(words, word2id):
    """Map each word to its vocabulary ID, using 0 for words not in ``word2id``.

    Args:
        words: Iterable of token strings.
        word2id: Mapping from token to integer ID.

    Returns:
        A 1-D ``torch.int64`` tensor holding one ID per word.
    """
    # .get() never inserts into a defaultdict, and the explicit dtype keeps
    # an empty word list integer-typed (it would otherwise become float32).
    return torch.tensor([word2id.get(word, 0) for word in words], dtype=torch.int64)


# Convert a category name into a category ID
def cat2id(y):
    """Return the class index of category letter ``y`` as a 0-d int64 tensor.

    A letter other than 'b', 't', 'e' or 'm' raises ``KeyError``.
    """
    label_ids = {
        'b': 0,
        't': 1,
        'e': 2,
        'm': 3,
    }
    index = label_ids[y]
    return torch.tensor(index, dtype=torch.int64)


# Load the vocabulary file
def load_vocab(vocab_path):
    """Read a ``word<TAB>id`` vocabulary file into a word -> ID mapping.

    The result is a ``defaultdict(int)``, so indexing a missing word gives 0.
    """
    mapping = defaultdict(int)
    with open(vocab_path) as handle:
        entries = (row.strip().split('\t') for row in handle)
        for token, token_id in entries:
            mapping[token] = int(token_id)
    return mapping


def load_pretrained_vector(vec_path, word2id, vocab_size):
    """Build the initial embedding matrix from a pretrained word2vec file.

    Args:
        vec_path: Path of a word2vec file in binary format.
        word2id: Mapping from word to embedding row.
        vocab_size: Number of rows of the returned matrix.

    Returns:
        A float32 tensor of shape ``(vocab_size, vector size of the pretrained model)``.
        Row 0 (unknown words) and the rows of words that have no pretrained
        vector are all zeros.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=True)
    # gensim >= 4 exposes the vocabulary as key_to_index, older releases as vocab.
    known_words = getattr(model, 'key_to_index', None)
    if known_words is None:
        known_words = model.vocab
    logger.info('pretrained word2vec vocab size: {}'.format(len(known_words)))

    pretrained_vec_w = np.zeros((vocab_size, model.vector_size), dtype=np.float32)
    for key, value in word2id.items():
        if value == 0:  # the unknown-word row stays zero
            continue
        try:
            vector = model[key]
        except KeyError:  # words missing from the pretrained model stay zero as well
            continue
        pretrained_vec_w[value] = vector
    return torch.from_numpy(pretrained_vec_w)



# Compute the loss and the accuracy
def calc_loss_and_acc(model, dataset, criterion):
    """Evaluate ``model`` on ``dataset`` one sample at a time.

    Samples are fed in dataset order with batch size 1, so no padding is
    needed.  The model is put into eval mode for the duration of the pass and
    its previous train/eval mode is restored afterwards.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        dataset: Dataset yielding ``(token-ID tensor, label tensor)`` pairs.
        criterion: Loss function called as ``criterion(outputs, labels)``.

    Returns:
        ``(mean loss per sample, accuracy)``.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    total_loss, total, correct = 0., 0, 0
    was_training = model.training
    model.eval()  # evaluation must not apply train-only behaviour such as dropout
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                outputs = model(inputs, [inputs[0].shape[0]])
                total_loss += criterion(outputs, labels).item()
                pred = torch.argmax(outputs, dim=-1)
                total += len(inputs)
                correct += (pred == labels).sum().item()
    finally:
        model.train(was_training)

    return total_loss / len(dataloader), correct / total


# Training
def train(model, train_dataset, valid_dataset, batch_size, criterion, optimizer, epoch, device, model_path, collate_fn=None):
    """Train ``model`` and save the checkpoint with the best validation accuracy.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        train_dataset: Dataset used for the parameter updates.
        valid_dataset: Dataset used only for evaluation.
        batch_size: Mini-batch size of the training loader.
        criterion: Loss function.
        optimizer: Optimizer holding the parameters of ``model``.
        epoch: Number of passes over the training data.
        device: Device used for the batches when ``model`` has no parameters;
            otherwise batches are placed on the device the parameters live on,
            because feeding tensors from another device would fail.
        model_path: File the best checkpoint is written to.
        collate_fn: Optional batch collation.  It may yield ``(inputs, labels)``
            or ``(inputs, labels, lengths)``; without lengths every row of
            ``inputs`` is taken to be unpadded.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    best_acc = 0.
    first_param = next(model.parameters(), None)
    batch_device = device if first_param is None else first_param.device

    for epoch_idx in range(epoch):
        for data in train_dataloader:
            if len(data) == 3:
                inputs, labels, inputs_len = data
            else:
                inputs, labels = data
                inputs_len = [inputs.shape[1]] * inputs.shape[0]
            optimizer.zero_grad()
            # Tensor.to() returns a new tensor, so the result has to be rebound.
            # The lengths are left alone: pack_padded_sequence expects them on the CPU.
            inputs = inputs.to(batch_device)
            labels = labels.to(batch_device)
            outputs = model(inputs, inputs_len)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        train_loss, train_acc = calc_loss_and_acc(model, train_dataset, criterion)
        valid_loss, valid_acc = calc_loss_and_acc(model, valid_dataset, criterion)

        print('epoch: {} done. '.format(epoch_idx + 1))
        print('train loss: {}\ttrain acc: {}'.format(train_loss, train_acc))
        print('valid loss: {}\tvalid acc: {}'.format(valid_loss, valid_acc))

        # Save a checkpoint whenever the validation accuracy reaches (or ties) its best value so far
        if best_acc <= valid_acc:
            best_acc = valid_acc
            torch.save({
                'epoch':epoch_idx+1,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict':optimizer.state_dict()
            }, model_path)
            print("save checkpoint epoch : {} acc : {}".format(epoch_idx+1, valid_acc))



# Command-line arguments
def create_parser():
    """Build the argument parser holding the file paths and the hyperparameters.

    Path options are normalized with ``os.path.abspath``.
    """
    parser = argparse.ArgumentParser(description='hogehoge')
    parser.add_argument('--vocab_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/vocab.txt', type=path.abspath, help='Path to vocabulary file')
    parser.add_argument('--train_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/train.txt', type=path.abspath, help='Path to train data file')
    parser.add_argument('--valid_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/valid.txt', type=path.abspath, help='Path to valid data file')
    parser.add_argument('--model_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/q84_checkpoint_best.pt', type=path.abspath, help='Path to save checkpoint best model')
    parser.add_argument('--vec_path', default='/work01/y_kishinami/100knock-2020/chapter09/GoogleNews-vectors-negative300.bin', type=path.abspath, help='Path to pretrained word2vec')
    parser.add_argument('--emb_size', default=300, type=int, help='dimension of embedding layer')
    parser.add_argument('--output_size', default=4, type=int, help='dimension of output layer')
    parser.add_argument('--hidden_size', default=50, type=int, help='dimension of hidden layer')
    parser.add_argument('--batch_size', default=1, type=int, help='batch size')
    parser.add_argument('--lr', default=0.005, type=float, help='learning rate')
    parser.add_argument('--epoch', default=10, type=int, help='the number of epoch')

    return parser


def main():
    """Parse the command-line options, build the datasets and the model, and run the training."""
    # command-line options
    args = create_parser().parse_args()
    logger.info(args)

    # load the vocabulary (word -> ID lookup dictionary)
    word2id = load_vocab(args.vocab_path)
    vocab_size = len(word2id) + 1
    logger.info('vocabulary loaded. vocab size: {}'.format(vocab_size))
    padding_idx = vocab_size - 1  # last row of the embedding table

    # load the datasets
    logger.info('dataset loading ...')
    x_train, y_train = load_file(args.train_path)
    x_valid, y_valid = load_file(args.valid_path)
    train_dataset = CreateDataset(x_train, y_train, word2id)
    valid_dataset = CreateDataset(x_valid, y_valid, word2id)
    logger.info('dataset loaded. ')
    logger.info('train data: {} samples'.format(len(train_dataset)))
    logger.info('valid data: {} samples'.format(len(valid_dataset)))

    # set up model, loss and optimizer
    rnn = RNN(
        vocab_size,
        args.emb_size,
        padding_idx,
        args.output_size,
        args.hidden_size,
        args.vec_path,
        word2id,
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(rnn.parameters(), lr=args.lr)

    # training
    logger.info('training start !')
    train(
        rnn,
        train_dataset,
        valid_dataset,
        args.batch_size,
        criterion,
        optimizer,
        args.epoch,
        device,
        args.model_path,
        collate_fn=Padsequence(padding_idx),
    )
    logger.info('training done !')


if __name__ == "__main__":
    main()

Overwriting src/q84.py


## 85. 双方向RNN・多層化
順方向と逆方向のRNNの両方を用いて入力テキストをエンコードし，モデルを学習せよ．

- コードはsrcにおきました（q85.py）
- batch_size=64, epoch=5, lr=0.01, dropout=0.6, adagradで実行
    - train loss: 0.27643798515469065	train acc: 0.9082740546611756
    - valid loss: 0.33827637885619266	valid acc: 0.8877245508982036
    - save checkpoint epoch : 3 acc : 0.8877245508982036

In [131]:
%%file 'src/q85.py'
import argparse
from os import path
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import defaultdict
import torch.nn as nn
import nltk
import gensim
import numpy as np
from logzero import logger
nltk.download('punkt')

# Use the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


class CreateDataset(Dataset):
    """Map-style dataset pairing tokenized titles with their category labels."""

    def __init__(self, x, y, vocab):
        self.x, self.y, self.vocab = x, y, vocab

    def __len__(self):
        # One sample per label.
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(token-ID tensor, category-ID tensor)`` pair of sample ``index``."""
        token_ids = words2id(self.x[index], self.vocab)
        label = cat2id(self.y[index])
        return token_ids, label


class biRNN(nn.Module):
    """Multi-layer bidirectional ReLU RNN classifier over pretrained word embeddings."""

    def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, vec_path, word2id, num_layers, dropout):
        """Create the layers and move them to the module-level ``device``.

        Args:
            vocab_size: Number of rows of the embedding table.
            emb_size: Input size of the RNN (must equal the pretrained vector size).
            padding_idx: Embedding index used for padding positions.
            output_size: Number of output classes.
            hidden_size: Dimension of the hidden state of each direction.
            vec_path: Path of the pretrained word2vec file.
            word2id: Mapping from word to embedding row.
            num_layers: Number of stacked RNN layers.
            dropout: Dropout ratio handed to ``nn.RNN``.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # ID 0 already stands for unknown words, so padding is given an index of its own.
        # NOTE(review): from_pretrained() freezes the weights by default (freeze=True);
        # pass freeze=False if the embeddings are meant to be fine-tuned.
        self.emb = nn.Embedding.from_pretrained(load_pretrained_vector(vec_path, word2id, vocab_size), padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='relu', batch_first=True, bidirectional=True, num_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(2*hidden_size, output_size, bias=True)
        # Must come after the sub-modules are registered: on a still-empty module .to() moves nothing.
        self.to(device)

    def forward(self, x, x_len):
        """Return the class logits, shape ``(batch_size, output_size)``.

        The computation runs on the device of the parameters; the logits are
        handed back on the device ``x`` arrived on, so a caller that keeps its
        labels next to ``x`` can compute the loss without a device mismatch.

        Args:
            x: Integer tensor of padded word IDs, shape ``(batch_size, seq_len)``.
            x_len: True length of every sequence in ``x`` (list or tensor), in any order.
        """
        caller_device = x.device
        x = x.to(self.emb.weight.device)
        self.batch_size = x.size()[0]
        emb = self.emb(x)  # (batch_size, seq_len, emb_size)
        hidden = torch.zeros(2*self.num_layers, self.batch_size, self.hidden_size, dtype=emb.dtype, device=emb.device)
        # pack_padded_sequence wants the lengths as a CPU int64 tensor.
        lengths = torch.as_tensor(x_len, dtype=torch.int64).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
        _, h_n = self.rnn(packed, hidden)  # h_n: (2 * num_layers, batch_size, hidden_size)
        # h_n[-2] is the forward and h_n[-1] the backward final state of the last layer.
        logit = self.fc(torch.cat([h_n[-2], h_n[-1]], dim=1))  # (batch_size, output_size)
        return logit.to(caller_device)


class Padsequence():
    """collate_fn that pads every mini-batch to the length of its longest sequence."""

    def __init__(self, padding_idx):
        self.padding_idx = padding_idx

    def __call__(self, batch):
        """Return ``(padded sequences, labels, lengths)``, ordered longest sequence first."""
        ordered = sorted(batch, key=lambda sample: sample[0].shape[0], reverse=True)
        sequences = [sample[0] for sample in ordered]
        lens = torch.LongTensor([seq.shape[0] for seq in sequences])
        labels = torch.LongTensor([sample[1] for sample in ordered])
        padded = torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=self.padding_idx,
        )
        return padded, labels, lens


# Read a tab-separated text data file
def load_file(path):
    """Return ``(x, y)``: the tokenized titles and the category labels, one entry per line of ``path``."""
    titles, labels = [], []
    with open(path) as source:
        for row in source:
            title, category = row.strip().split('\t')
            titles.append(nltk.word_tokenize(title))
            labels.append(category)
    return titles, labels


# Convert a list of words into a sequence of ID numbers
def words2id(words, word2id):
    """Map each word to its vocabulary ID, using 0 for words not in ``word2id``.

    Args:
        words: Iterable of token strings.
        word2id: Mapping from token to integer ID.

    Returns:
        A 1-D ``torch.int64`` tensor holding one ID per word.
    """
    # .get() never inserts into a defaultdict, and the explicit dtype keeps
    # an empty word list integer-typed (it would otherwise become float32).
    return torch.tensor([word2id.get(word, 0) for word in words], dtype=torch.int64)


# Convert a category name into a category ID
def cat2id(y):
    """Return the class index of category letter ``y`` as a 0-d int64 tensor.

    A letter other than 'b', 't', 'e' or 'm' raises ``KeyError``.
    """
    label_ids = {
        'b': 0,
        't': 1,
        'e': 2,
        'm': 3,
    }
    index = label_ids[y]
    return torch.tensor(index, dtype=torch.int64)


# Load the vocabulary file
def load_vocab(vocab_path):
    """Read a ``word<TAB>id`` vocabulary file into a word -> ID mapping.

    The result is a ``defaultdict(int)``, so indexing a missing word gives 0.
    """
    mapping = defaultdict(int)
    with open(vocab_path) as handle:
        entries = (row.strip().split('\t') for row in handle)
        for token, token_id in entries:
            mapping[token] = int(token_id)
    return mapping


def load_pretrained_vector(vec_path, word2id, vocab_size):
    """Build the initial embedding matrix from a pretrained word2vec file.

    Args:
        vec_path: Path of a word2vec file in binary format.
        word2id: Mapping from word to embedding row.
        vocab_size: Number of rows of the returned matrix.

    Returns:
        A float32 tensor of shape ``(vocab_size, vector size of the pretrained model)``.
        Row 0 (unknown words) and the rows of words that have no pretrained
        vector are all zeros.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(vec_path, binary=True)
    # gensim >= 4 exposes the vocabulary as key_to_index, older releases as vocab.
    known_words = getattr(model, 'key_to_index', None)
    if known_words is None:
        known_words = model.vocab
    logger.info('pretrained word2vec vocab size: {}'.format(len(known_words)))

    pretrained_vec_w = np.zeros((vocab_size, model.vector_size), dtype=np.float32)
    for key, value in word2id.items():
        if value == 0:  # the unknown-word row stays zero
            continue
        try:
            vector = model[key]
        except KeyError:  # words missing from the pretrained model stay zero as well
            continue
        pretrained_vec_w[value] = vector
    return torch.from_numpy(pretrained_vec_w)



# Compute the loss and the accuracy
def calc_loss_and_acc(model, dataset, criterion):
    """Evaluate ``model`` on ``dataset`` one sample at a time.

    Samples are fed in dataset order with batch size 1, so no padding is
    needed.  The model is put into eval mode for the duration of the pass (so
    that dropout is not applied to the evaluation) and its previous
    train/eval mode is restored afterwards.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        dataset: Dataset yielding ``(token-ID tensor, label tensor)`` pairs.
        criterion: Loss function called as ``criterion(outputs, labels)``.

    Returns:
        ``(mean loss per sample, accuracy)``.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    total_loss, total, correct = 0., 0, 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                outputs = model(inputs, [inputs[0].shape[0]])
                total_loss += criterion(outputs, labels).item()
                pred = torch.argmax(outputs, dim=-1)
                total += len(inputs)
                correct += (pred == labels).sum().item()
    finally:
        model.train(was_training)

    return total_loss / len(dataloader), correct / total


# Training
def train(model, train_dataset, valid_dataset, batch_size, criterion, optimizer, epoch, device, model_path, collate_fn=None):
    """Train ``model`` and save the checkpoint with the best validation accuracy.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        train_dataset: Dataset used for the parameter updates.
        valid_dataset: Dataset used only for evaluation.
        batch_size: Mini-batch size of the training loader.
        criterion: Loss function.
        optimizer: Optimizer holding the parameters of ``model``.
        epoch: Number of passes over the training data.
        device: Device used for the batches when ``model`` has no parameters;
            otherwise batches are placed on the device the parameters live on,
            because feeding tensors from another device would fail.
        model_path: File the best checkpoint is written to.
        collate_fn: Optional batch collation.  It may yield ``(inputs, labels)``
            or ``(inputs, labels, lengths)``; without lengths every row of
            ``inputs`` is taken to be unpadded.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    best_acc = 0.
    first_param = next(model.parameters(), None)
    batch_device = device if first_param is None else first_param.device

    for epoch_idx in range(epoch):
        for data in train_dataloader:
            if len(data) == 3:
                inputs, labels, inputs_len = data
            else:
                inputs, labels = data
                inputs_len = [inputs.shape[1]] * inputs.shape[0]
            optimizer.zero_grad()
            # Tensor.to() returns a new tensor, so the result has to be rebound.
            # The lengths are left alone: pack_padded_sequence expects them on the CPU.
            inputs = inputs.to(batch_device)
            labels = labels.to(batch_device)
            outputs = model(inputs, inputs_len)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        train_loss, train_acc = calc_loss_and_acc(model, train_dataset, criterion)
        valid_loss, valid_acc = calc_loss_and_acc(model, valid_dataset, criterion)

        print('epoch: {} done. '.format(epoch_idx + 1))
        print('train loss: {}\ttrain acc: {}'.format(train_loss, train_acc))
        print('valid loss: {}\tvalid acc: {}'.format(valid_loss, valid_acc))

        # Save a checkpoint whenever the validation accuracy reaches (or ties) its best value so far
        if best_acc <= valid_acc:
            best_acc = valid_acc
            torch.save({
                'epoch':epoch_idx+1,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict':optimizer.state_dict()
            }, model_path)
            print("save checkpoint epoch : {} acc : {}".format(epoch_idx+1, valid_acc))



# Command-line arguments
def create_parser():
    """Build the argument parser holding the file paths and the hyperparameters.

    Path options are normalized with ``os.path.abspath``.
    """
    parser = argparse.ArgumentParser(description='hogehoge')
    parser.add_argument('--vocab_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/vocab.txt', type=path.abspath, help='Path to vocabulary file')
    parser.add_argument('--train_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/train.txt', type=path.abspath, help='Path to train data file')
    parser.add_argument('--valid_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/valid.txt', type=path.abspath, help='Path to valid data file')
    parser.add_argument('--model_path', default='/work01/y_kishinami/100knock-2020/chapter09/work/q85_checkpoint_best.pt', type=path.abspath, help='Path to save checkpoint best model')
    parser.add_argument('--vec_path', default='/work01/y_kishinami/100knock-2020/chapter09/GoogleNews-vectors-negative300.bin', type=path.abspath, help='Path to pretrained word2vec')
    parser.add_argument('--emb_size', default=300, type=int, help='dimension of embedding layer')
    parser.add_argument('--output_size', default=4, type=int, help='dimension of output layer')
    parser.add_argument('--hidden_size', default=50, type=int, help='dimension of hidden layer')
    parser.add_argument('--batch_size', default=1, type=int, help='batch size')
    parser.add_argument('--lr', default=0.005, type=float, help='learning rate')
    parser.add_argument('--epoch', default=10, type=int, help='the number of epoch')
    parser.add_argument('--num_layers', default=1, type=int, help='the number of RNN layers')
    # Without type=float a ratio given on the command line would stay a string.
    parser.add_argument('--dropout', default=0.6, type=float, help='dropout ratio')

    return parser


def main():
    """Train the bidirectional RNN classifier configured from the command line."""
    # command-line options
    args = create_parser().parse_args()
    logger.info(args)

    # vocabulary: word -> id lookup table
    word2id = load_vocab(args.vocab_path)
    vocab_size = 1 + len(word2id)
    # the same padding id is handed to the model and to the collate function
    # NOTE(review): if word ids run from 1 to len(word2id), this id is also the
    # id of the last vocabulary word -- confirm against the vocabulary file.
    padding_idx = vocab_size - 1
    logger.info('vocabulary loaded. vocab size: {}'.format(vocab_size))

    # data sets
    logger.info('dataset loading ...')
    train_titles, train_labels = load_file(args.train_path)
    valid_titles, valid_labels = load_file(args.valid_path)
    train_dataset = CreateDataset(train_titles, train_labels, word2id)
    valid_dataset = CreateDataset(valid_titles, valid_labels, word2id)
    logger.info('dataset loaded. ')
    for message, dataset in (('train data: {} samples', train_dataset), ('valid data: {} samples', valid_dataset)):
        logger.info(message.format(len(dataset)))

    # model, loss and optimizer
    model = biRNN(vocab_size, args.emb_size, padding_idx, args.output_size, args.hidden_size, args.vec_path, word2id, args.num_layers, args.dropout)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adagrad(model.parameters(), lr_decay=0.001, lr=args.lr)

    # training
    logger.info('training start !')
    train(model, train_dataset, valid_dataset, args.batch_size, criterion, optimizer, args.epoch, device, args.model_path, collate_fn=Padsequence(padding_idx))
    logger.info('training done !')


# Run the training entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()

Overwriting src/q85.py


## 86. 畳み込みニューラルネットワーク (CNN)
ID番号で表現された単語列 $\boldsymbol{x} = (x_1, x_2, \dots, x_T)$ がある．ただし，$T$ は単語列の長さ，$x_t \in \mathbb{R}^{V}$ は単語のID番号のone-hot表記である（$V$ は単語の総数である）．畳み込みニューラルネットワーク（CNN: Convolutional Neural Network）を用い，単語列 $\boldsymbol{x}$ からカテゴリ $y$ を予測するモデルを実装せよ．

ただし，畳み込みニューラルネットワークの構成は以下の通りとする．

- 単語埋め込みの次元数: $d_w$
- 畳み込みのフィルターのサイズ: 3 トークン
- 畳み込みのストライド: 1 トークン
- 畳み込みのパディング: あり
- 畳み込み演算後の各時刻のベクトルの次元数: $d_h$
- 畳み込み演算後に最大値プーリング（max pooling）を適用し，入力文を $d_h$ 次元の隠れベクトルで表現

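すなわち，時刻 $t$ の特徴ベクトル $p_t \in \mathbb{R}^{d_h}$ は次式で表される．

- $p_t = g(W^{(px)} [\mathrm{emb}(x_{t-1}); \mathrm{emb}(x_t); \mathrm{emb}(x_{t+1})] + b^{(p)})$

ただし，$W^{(px)} \in \mathbb{R}^{d_h \times 3d_w}$，$b^{(p)} \in \mathbb{R}^{d_h}$ はCNNのパラメータ，$g$ は活性化関数（例えばtanhやReLUなど），$[a; b; c]$ はベクトル $a, b, c$ の連結である．なお，行列 $W^{(px)}$ の列数が $3d_w$ になるのは，3個のトークンの単語埋め込みを連結したものに対して，線形変換を行うためである．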

最大値プーリングでは，特徴ベクトルの次元毎に全時刻における最大値を取り，入力文書の特徴ベクトル $c \in \mathbb{R}^{d_h}$ を求める．$c[i]$ でベクトル $c$ の $i$ 番目の次元の値を表すことにすると，最大値プーリングは次式で表される．

- $c[i] = \max_{1 \leq t \leq T} p_t[i]$

最後に，入力文書の特徴ベクトル $c$ に行列 $W^{(yc)} \in \mathbb{R}^{L \times d_h}$ とバイアス項 $b^{(y)} \in \mathbb{R}^{L}$ による線形変換とソフトマックス関数を適用し，カテゴリ $y$ を予測する．

- $y = \mathrm{softmax}(W^{(yc)} c + b^{(y)})$

なお，この問題ではモデルの学習を行わず，ランダムに初期化された重み行列でyを計算するだけでよい．

In [29]:
class CNN(nn.Module):
    """1-D convolutional text classifier.

    Pipeline: embedding -> Conv1d over the time axis -> ReLU ->
    max-over-time pooling -> linear layer producing one logit per category.

    Args:
        hidden_size: Number of convolution output channels (d_h in the task statement).
        embedding_size: Dimension of the word embeddings (d_w).
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
        output_size: Number of categories.
        vocab_size: Number of rows in the embedding table.
        kernel_size: Width of the convolution window in tokens.
    """

    def __init__(self, hidden_size, embedding_size, padding_idx, output_size, vocab_size, kernel_size=3):
        super().__init__()
        self.hidden_size = hidden_size  # d_h in the task statement
        self.embedding_size = embedding_size  # dimension of the word embeddings
        self.emb = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        # Arguments in order: input channels, output channels, kernel (filter) size,
        # stride, and the number of padding positions added at each end of the sequence.
        # The padding follows the kernel width (1 for the default width of 3), so an
        # odd-width kernel keeps the number of time steps and a sequence shorter than
        # the kernel can still be convolved.
        self.cnn = nn.Conv1d(embedding_size, hidden_size, kernel_size, stride=1, padding=kernel_size // 2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map token ids of shape (batch_size, seq_len) to logits of shape (batch_size, output_size)."""
        emb = self.emb(x)
        # emb.size() = (batch_size, seq_len, embedding_size)
        # Conv1d convolves along the last axis, so move seq_len there:
        # emb.transpose(-1, -2).size() = (batch_size, embedding_size, seq_len)
        conv = self.cnn(emb.transpose(-1, -2))
        # conv.size() = (batch_size, hidden_size, number of time steps)
        act = F.relu(conv)
        # Take the maximum over all time steps for every hidden dimension.
        pooled = F.max_pool1d(act, act.size(-1))
        # pooled.size() = (batch_size, hidden_size, 1); drop the trailing singleton axis
        return self.fc(pooled.squeeze(-1))

In [30]:
# Build a randomly initialised CNN and check the output shape on a hand-made batch.
vocab_size = len(word2id) + 1
emb_size = 300
padding_idx = len(word2id)  # use the index equal to the vocabulary size as the padding index
# NOTE(review): if word ids run from 1 to len(word2id), this padding index is also
# the id of the last vocabulary word -- confirm against the vocabulary file.
output_size = 4  # number of categories
hidden_size = 50


cnn = CNN(hidden_size, emb_size, padding_idx, output_size, vocab_size)

x = torch.tensor([[1557, 379, 10, 1557, 379], [47, 8988, 1906, 5, 30]])  # sample with batch_size = 2, seq_len = 5
print(cnn(x).size()) # (batch_size, output_size)

torch.Size([2, 4])


In [129]:
# Run the untrained model on the first 10 samples and print prediction vs. gold label
for x,y in islice(gen_sample(train_path), 10):
    # unsqueeze(0) adds the batch axis the model expects
    pred = F.softmax(cnn(x.unsqueeze(0)), dim=-1)
    print('prediction: {}\tgold: {}'.format(torch.argmax(pred), y))

prediction: 1	gold: 0
prediction: 2	gold: 2
prediction: 2	gold: 0
prediction: 2	gold: 2
prediction: 2	gold: 2
prediction: 2	gold: 2
prediction: 2	gold: 1
prediction: 2	gold: 3
prediction: 2	gold: 0
prediction: 2	gold: 2


- https://programmer.group/pytorch-learning-conv1d-conv2d-and-conv3d.html

## 87. 確率的勾配降下法によるCNNの学習
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題86で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

logはtensorboardで保存するようにした
- 50 epoch, SGD, batch size 128
    - train acc 0.9342
    - valid acc 0.8106

In [31]:
%%file 'src/q87.py'
import argparse
from os import path
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import defaultdict
import torch.nn as nn
import torch.nn.functional as F
import nltk
from logzero import logger
from torch.utils.tensorboard import SummaryWriter
nltk.download('punkt')

# device: the first CUDA GPU when one is available, otherwise the CPU; the choice is printed
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


class CreateDataset(Dataset):
    """Dataset pairing tokenised titles with their category names.

    Conversion is lazy: indexing turns the stored token list into a tensor of
    word ids and the stored category name into a tensor label.
    """

    def __init__(self, x, y, vocab):
        self.x, self.y, self.vocab = x, y, vocab

    def __len__(self):
        # one sample per stored label
        return len(self.y)

    def __getitem__(self, index):
        """Return ``(word-id tensor, category-id tensor)`` for sample ``index``."""
        tokens, category = self.x[index], self.y[index]
        return words2id(tokens, self.vocab), cat2id(category)


class CNN(nn.Module):
    """1-D convolutional text classifier.

    Pipeline: embedding -> Conv1d over the time axis -> ReLU ->
    max-over-time pooling -> linear layer producing one logit per category.

    Args:
        hidden_size: Number of convolution output channels (d_h in the task statement).
        embedding_size: Dimension of the word embeddings (d_w).
        padding_idx: Token id passed to ``nn.Embedding`` as ``padding_idx``.
        output_size: Number of categories.
        vocab_size: Number of rows in the embedding table.
        kernel_size: Width of the convolution window in tokens.
    """

    def __init__(self, hidden_size, embedding_size, padding_idx, output_size, vocab_size, kernel_size=3):
        super().__init__()
        self.hidden_size = hidden_size  # d_h in the task statement
        self.embedding_size = embedding_size  # dimension of the word embeddings
        self.emb = nn.Embedding(vocab_size, embedding_size, padding_idx=padding_idx)
        # Arguments in order: input channels, output channels, kernel (filter) size,
        # stride, and the number of padding positions added at each end of the sequence.
        # The padding follows the kernel width (1 for the default width of 3), so an
        # odd-width kernel keeps the number of time steps and a sequence shorter than
        # the kernel can still be convolved.
        self.cnn = nn.Conv1d(embedding_size, hidden_size, kernel_size, stride=1, padding=kernel_size // 2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map token ids of shape (batch_size, seq_len) to logits of shape (batch_size, output_size)."""
        emb = self.emb(x)
        # emb.size() = (batch_size, seq_len, embedding_size)
        # Conv1d convolves along the last axis, so move seq_len there:
        # emb.transpose(-1, -2).size() = (batch_size, embedding_size, seq_len)
        conv = self.cnn(emb.transpose(-1, -2))
        # conv.size() = (batch_size, hidden_size, number of time steps)
        act = F.relu(conv)
        # Take the maximum over all time steps for every hidden dimension.
        pooled = F.max_pool1d(act, act.size(-1))
        # pooled.size() = (batch_size, hidden_size, 1); drop the trailing singleton axis
        return self.fc(pooled.squeeze(-1))


class Padsequence():
    """Collate callable: pads every mini-batch taken from the DataLoader to its longest sequence."""

    def __init__(self, padding_idx):
        self.padding_idx = padding_idx  # value written into the padded positions

    def __call__(self, batch):
        """Return ``(padded_sequences, labels)`` with the samples ordered longest first."""
        # Longest-first ordering (said to make padding more efficient).
        ordered = list(batch)
        ordered.sort(key=lambda sample: sample[0].shape[0], reverse=True)
        sequences, targets = [], []
        for sample in ordered:
            sequences.append(sample[0])
            targets.append(sample[1])
        padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=self.padding_idx)
        return padded, torch.LongTensor(targets)


# Read a text file.
def load_file(path):
    """Load tokenised titles and category names from a ``title<TAB>category`` file.

    The file is decoded as UTF-8 regardless of the platform default, because
    the titles contain non-ASCII characters. Blank lines are skipped instead
    of failing the two-field unpacking.

    Args:
        path: Path of the tab-separated file.

    Returns:
        ``(x, y)`` where ``x`` is a list of token lists produced by
        ``nltk.word_tokenize`` and ``y`` the matching list of category names.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    x, y = [], []
    with open(path, encoding='utf-8') as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            title, category = line.split('\t')
            x.append(nltk.word_tokenize(title))
            y.append(category)
    return x, y


# Convert a list of words into a sequence of id numbers.
def words2id(words, word2id):
    """Return the ids of ``words`` as a 1-D int64 tensor.

    Words missing from ``word2id`` map to 0. The lookup uses ``get`` so a
    ``defaultdict`` vocabulary is not grown by unknown words. The dtype is
    fixed explicitly because ``torch.tensor([])`` would otherwise produce a
    float tensor for an empty word list.

    Args:
        words: Iterable of tokens.
        word2id: Mapping from token to integer id.
    """
    return torch.tensor([word2id.get(word, 0) for word in words], dtype=torch.long)


# Convert a category name into a category id.
def cat2id(y):
    """Return the id of category ``y`` ('b', 't', 'e' or 'm') as a 0-dim int64 tensor.

    Raises:
        KeyError: If ``y`` is not one of the four category names.
    """
    label_ids = dict(b=0, t=1, e=2, m=3)
    label = label_ids[y]
    return torch.tensor(label, dtype=torch.int64)


# Read the vocabulary file.
def load_vocab(vocab_path):
    """Load the ``word<TAB>id`` vocabulary file into a word -> id mapping.

    The file is decoded as UTF-8 regardless of the platform default, and
    blank lines are skipped instead of failing the two-field unpacking.

    Args:
        vocab_path: Path of the vocabulary file.

    Returns:
        ``defaultdict(int)`` mapping each word to its integer id.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            the id is not an integer.
    """
    word2id = defaultdict(int)
    with open(vocab_path, encoding='utf-8') as vocab_file:
        for line in vocab_file:
            line = line.strip()
            if not line:
                continue
            word, ids = line.split('\t')
            word2id[word] = int(ids)
    return word2id


# Compute the loss and the accuracy on a dataset.
def calc_loss_and_acc(model, dataset, criterion):
    """Return ``(mean loss, accuracy)`` of ``model`` over ``dataset``.

    Samples are fed one at a time (``batch_size=1``), so no padding is needed.
    The model is put into evaluation mode for the duration of the call so that
    train-only layers cannot randomise the reported metrics; the mode it had
    on entry is restored before returning.

    Args:
        model: ``nn.Module`` called as ``model(inputs)``; returns logits.
        dataset: Indexable dataset yielding ``(token_ids, label)`` pairs.
        criterion: Loss callable applied to ``(logits, labels)``.

    Returns:
        Tuple ``(sum of per-sample losses / number of samples, accuracy)``.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    total_loss, total, correct = 0., 0, 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                outputs = model(inputs)
                total_loss += criterion(outputs, labels).item()
                pred = torch.argmax(outputs, dim=-1)
                total += len(inputs)
                correct += (pred == labels).sum().item()
    finally:
        # hand the model back in the mode the caller left it in
        model.train(was_training)

    return total_loss / len(dataloader), correct / total


# Training loop.
def train(model, train_dataset, valid_dataset, batch_size, criterion, optimizer, epoch, device, model_path, writer, collate_fn=None):
    """Train ``model``, log metrics to TensorBoard and checkpoint the best epoch.

    After every epoch the loss/accuracy on both datasets are written to
    ``writer`` and printed, and the model/optimizer state is saved to
    ``model_path`` if the validation accuracy is at least as high as the best
    one seen so far.

    Args:
        model: Module called as ``model(inputs)``.
        train_dataset: Dataset used for the parameter updates.
        valid_dataset: Dataset used for checkpoint selection.
        batch_size: Mini-batch size of the training loader.
        criterion: Loss callable applied to ``(logits, labels)``.
        optimizer: Optimizer already bound to the model's parameters.
        epoch: Number of passes over ``train_dataset``.
        device: Accepted for interface compatibility. This function never
            moves the model, so batches are placed on the device the model's
            parameters already live on instead.
        model_path: Destination file for the best checkpoint.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        collate_fn: Batch collation callable returning ``(inputs, labels)``.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    best_acc = 0.

    # Tensor.to() returns a new tensor; the previous code discarded that result,
    # so the batch was never moved. Follow the model's own device so the two
    # can never disagree.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch_idx in range(epoch):
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            inputs = inputs.to(model_device)
            labels = labels.to(model_device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        train_loss, train_acc = calc_loss_and_acc(model, train_dataset, criterion)
        valid_loss, valid_acc = calc_loss_and_acc(model, valid_dataset, criterion)

        # Added for Q87: record the metrics with TensorBoard
        writer.add_scalar('train_loss', train_loss, epoch_idx + 1)
        writer.add_scalar('train_acc', train_acc, epoch_idx + 1)
        writer.add_scalar('valid_loss', valid_loss, epoch_idx + 1)
        writer.add_scalar('valid_acc', valid_acc, epoch_idx + 1)
        print('epoch: {} done. '.format(epoch_idx + 1))
        print('train loss: {}\ttrain acc: {}'.format(train_loss, train_acc))
        print('valid loss: {}\tvalid acc: {}'.format(valid_loss, valid_acc))

        # Save a checkpoint when validation accuracy matches or beats the best so far
        if best_acc <= valid_acc:
            best_acc = valid_acc
            torch.save({
                'epoch':epoch_idx+1,
                'model_state_dict':model.state_dict(),
                'optimizer_state_dict':optimizer.state_dict()
            }, model_path)
            print("save checkpoint epoch : {} acc : {}".format(epoch_idx+1, valid_acc))

# Command-line arguments.
def create_parser():
    """Build the argument parser of the CNN training script.

    Options are declared in a table of ``(flag, default, type, help)`` rows
    and registered in that order; path options use ``os.path.abspath`` as
    their type so the parsed values are absolute paths.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    option_table = (
        ('--vocab_path', '/work01/y_kishinami/100knock-2020/chapter09/work/vocab.txt', path.abspath, 'Path to vocabulary file'),
        ('--train_path', '/work01/y_kishinami/100knock-2020/chapter09/work/train.txt', path.abspath, 'Path to train data file'),
        ('--valid_path', '/work01/y_kishinami/100knock-2020/chapter09/work/valid.txt', path.abspath, 'Path to valid data file'),
        ('--model_path', '/work01/y_kishinami/100knock-2020/chapter09/work/checkpoint_best_q87.pt', path.abspath, 'path to save checkpoint best model'),
        ('--log_dir', '/work01/y_kishinami/100knock-2020/chapter09/work/logs/q87', path.abspath, 'tensorboard log dir'),
        ('--emb_size', 300, int, 'dimension of embedding layer'),
        ('--output_size', 4, int, 'dimension of output layer'),
        ('--hidden_size', 50, int, 'dimension of hidden layer'),
        ('--batch_size', 1, int, 'batch size'),
        ('--lr', 0.005, float, 'learning late'),
        ('--epoch', 10, int, 'the number of epoch'),
    )

    parser = argparse.ArgumentParser(description='hogehoge')
    for flag, default_value, converter, description in option_table:
        parser.add_argument(flag, default=default_value, type=converter, help=description)
    return parser


def main():
    """Train the CNN classifier configured from the command line.

    Two things differ from a naive setup:

    * The padding id is one past the largest id in the vocabulary. Id 0 is
      used for unknown words and the vocabulary file supplies the remaining
      ids, so ``len(word2id)`` itself may already belong to a real word;
      reusing it for padding would tie that word to the padding embedding.
    * The TensorBoard writer is closed even when training fails, so buffered
      events are flushed to disk.
    """
    # arguments
    parser = create_parser()
    args = parser.parse_args()
    logger.info(args)
    writer = SummaryWriter(log_dir=args.log_dir)

    try:
        # Load the vocabulary (word -> id lookup table)
        word2id = load_vocab(args.vocab_path)
        padding_idx = max(word2id.values(), default=0) + 1
        vocab_size = padding_idx + 1  # ids 0..padding_idx all need an embedding row
        logger.info('vocabulary loaded. vocab size: {}'.format(vocab_size))

        # Load the datasets
        logger.info('dataset loading ...')
        x_train, y_train = load_file(args.train_path)
        x_valid, y_valid = load_file(args.valid_path)
        train_dataset = CreateDataset(x_train, y_train, word2id)
        valid_dataset = CreateDataset(x_valid, y_valid, word2id)
        logger.info('dataset loaded. ')
        logger.info('train data: {} samples'.format(len(train_dataset)))
        logger.info('valid data: {} samples'.format(len(valid_dataset)))

        # Initialise the model
        cnn = CNN(args.hidden_size, args.emb_size, padding_idx, args.output_size, vocab_size)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(cnn.parameters(), lr=args.lr)

        # Training
        logger.info('training start !')
        train(cnn, train_dataset, valid_dataset, args.batch_size, criterion, optimizer, args.epoch, device, args.model_path, writer, collate_fn=Padsequence(padding_idx))
        logger.info('training done !')
    finally:
        writer.close()


# Run the training entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()

Writing src/q87.py


## 88. パラメータチューニング
問題85や問題87のコードを改変し，ニューラルネットワークの形状やハイパーパラメータを調整しながら，高性能なカテゴリ分類器を構築せよ．

1. RNNのハイパラ探索設定
- 探索パラメータ
    - learning rate
    - batch size
    - num layers
    - drop out ratio
- 固定パラメータ
    - emb size = 300(pretrained word2vec)
    - output size = 4
    - bidirectional = True
    - epoch size = 5
    - criterion = CrossEntropyLoss
    - optimizer = Adagrad
2. RNNのハイパラ探索結果
- Optunaを使って1時間探索
- 最適パラメータ
    - {'learning_rate': 0.018291482272472147, 'batch_size': 32.0, 'num_layers': 1, 'drop_out': 0.7113535005616372, 'hidden_size': 256}
    - train loss: 0.12620148336212159	train acc: 0.9593785099213777 (5 epoch)
    - valid loss: 0.2780322267644646	valid acc: 0.9086826347305389 (5 epoch)

In [40]:
%%file 'src/q88.py' 
import optuna
from logzero import logger
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import defaultdict
import torch.nn as nn
import nltk
import numpy as np
nltk.download('punkt')

# device: the first CUDA GPU when one is available, otherwise the CPU; the choice is printed
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

class CreateDataset(Dataset):
    """Dataset pairing tokenised titles with their category names.

    Conversion is lazy: indexing turns the stored token list into a tensor of
    word ids and the stored category name into a tensor label.
    """

    def __init__(self, x, y, vocab):
        self.x, self.y, self.vocab = x, y, vocab

    def __len__(self):
        # one sample per stored label
        return len(self.y)

    def __getitem__(self, index):
        """Return ``(word-id tensor, category-id tensor)`` for sample ``index``."""
        tokens, category = self.x[index], self.y[index]
        return words2id(tokens, self.vocab), cat2id(category)


class biRNN(nn.Module):
    """Bidirectional ReLU RNN classifier on top of pretrained word embeddings.

    Args:
        emb_size: Dimension of the pretrained embeddings.
        padding_idx: Token id used for padding. Using 0 would treat padding
            like unknown words, so callers pass (vocabulary size - 1).
        output_size: Number of categories.
        hidden_size: Hidden size of each RNN direction.
        num_layers: Number of stacked RNN layers.
        dropout: Dropout ratio passed to ``nn.RNN``.
        pretrained_path: ``.npy`` file holding the embedding matrix; defaults
            to the location that used to be hard-coded.
    """

    def __init__(self, emb_size, padding_idx, output_size, hidden_size, num_layers, dropout,
                 pretrained_path='/work01/y_kishinami/100knock-2020/chapter09/work/pretrained_vec.npy'):
        super().__init__()
        # The former ``self.to(device)`` call here ran before any sub-module was
        # registered and therefore moved nothing; it has been dropped. Move the
        # finished model with ``model.to(...)`` if another device is wanted.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        weights = torch.from_numpy(np.load(pretrained_path).astype(np.float32))
        self.emb = nn.Embedding.from_pretrained(weights, padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='relu', batch_first=True, bidirectional=True, num_layers=num_layers, dropout=dropout)
        self.fc = nn.Linear(2*hidden_size, output_size, bias=True)

    def forward(self, x, x_len):
        """Map padded token ids (batch_size, seq_len) and their lengths to logits (batch_size, output_size)."""
        self.batch_size = x.size()[0]
        emb = self.emb(x)
        # emb.size() = (batch_size, seq_len, emb_size)
        # Allocate the initial state next to the embeddings (same device and dtype)
        # so the model keeps working after it has been moved off the CPU.
        hidden = emb.new_zeros(2*self.num_layers, self.batch_size, self.hidden_size)
        # Pack so that the RNN stops at each sequence's true length.
        packed = nn.utils.rnn.pack_padded_sequence(emb, x_len, batch_first=True, enforce_sorted=False)
        out, h_n = self.rnn(packed, hidden)
        # h_n.size() = (2 * num_layers, batch_size, hidden_size)
        # h_n[-2] is the forward state of the last layer, h_n[-1] the backward one
        logit = self.fc(torch.cat([h_n[-2], h_n[-1]], dim=1))
        # logit.size() = (batch_size, output_size)
        return logit


class Padsequence():
    """Collate callable: pads every mini-batch taken from the DataLoader to its longest sequence."""

    def __init__(self, padding_idx):
        self.padding_idx = padding_idx  # value written into the padded positions

    def __call__(self, batch):
        """Return ``(padded_sequences, labels, lengths)`` with the samples ordered longest first."""
        # Longest-first ordering (said to make padding more efficient).
        ordered = list(batch)
        ordered.sort(key=lambda sample: sample[0].shape[0], reverse=True)
        sequences = [sample[0] for sample in ordered]
        padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=self.padding_idx)
        targets = torch.LongTensor([sample[1] for sample in ordered])
        # true (unpadded) length of every sequence, in the same order
        lengths = torch.LongTensor([sequence.shape[0] for sequence in sequences])
        return padded, targets, lengths


# Read a text file.
def load_file(path):
    """Load tokenised titles and category names from a ``title<TAB>category`` file.

    The file is decoded as UTF-8 regardless of the platform default, because
    the titles contain non-ASCII characters. Blank lines are skipped instead
    of failing the two-field unpacking.

    Args:
        path: Path of the tab-separated file.

    Returns:
        ``(x, y)`` where ``x`` is a list of token lists produced by
        ``nltk.word_tokenize`` and ``y`` the matching list of category names.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    x, y = [], []
    with open(path, encoding='utf-8') as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            title, category = line.split('\t')
            x.append(nltk.word_tokenize(title))
            y.append(category)
    return x, y


# Convert a list of words into a sequence of id numbers.
def words2id(words, word2id):
    """Return the ids of ``words`` as a 1-D int64 tensor.

    Words missing from ``word2id`` map to 0. The lookup uses ``get`` so a
    ``defaultdict`` vocabulary is not grown by unknown words. The dtype is
    fixed explicitly because ``torch.tensor([])`` would otherwise produce a
    float tensor for an empty word list.

    Args:
        words: Iterable of tokens.
        word2id: Mapping from token to integer id.
    """
    return torch.tensor([word2id.get(word, 0) for word in words], dtype=torch.long)


# Convert a category name into a category id.
def cat2id(y):
    """Return the id of category ``y`` ('b', 't', 'e' or 'm') as a 0-dim int64 tensor.

    Raises:
        KeyError: If ``y`` is not one of the four category names.
    """
    label_ids = dict(b=0, t=1, e=2, m=3)
    label = label_ids[y]
    return torch.tensor(label, dtype=torch.int64)


# Read the vocabulary file.
def load_vocab(vocab_path):
    """Load the ``word<TAB>id`` vocabulary file into a word -> id mapping.

    The file is decoded as UTF-8 regardless of the platform default, and
    blank lines are skipped instead of failing the two-field unpacking.

    Args:
        vocab_path: Path of the vocabulary file.

    Returns:
        ``defaultdict(int)`` mapping each word to its integer id.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            the id is not an integer.
    """
    word2id = defaultdict(int)
    with open(vocab_path, encoding='utf-8') as vocab_file:
        for line in vocab_file:
            line = line.strip()
            if not line:
                continue
            word, ids = line.split('\t')
            word2id[word] = int(ids)
    return word2id


# Load the vocabulary once at import time; it is shared by every Optuna trial
WORD2ID = load_vocab('/work01/y_kishinami/100knock-2020/chapter09/work/vocab.txt')
logger.info('vocabulary loaded. vocab size: {}'.format(len(WORD2ID) + 1))

# Load the datasets once at import time; they are shared by every Optuna trial
logger.info('dataset loading ...')
X_train, Y_train = load_file('/work01/y_kishinami/100knock-2020/chapter09/work/train.txt')
X_valid, Y_valid = load_file('/work01/y_kishinami/100knock-2020/chapter09/work/valid.txt')
Train_dataset = CreateDataset(X_train, Y_train, WORD2ID)
Valid_dataset = CreateDataset(X_valid, Y_valid, WORD2ID)
logger.info('dataset loaded.')


# Compute the loss and the accuracy on a dataset.
def calc_loss_and_acc(model, dataset, criterion):
    """Return ``(mean loss, accuracy)`` of ``model`` over ``dataset``.

    Samples are fed one at a time (``batch_size=1``), so no padding is needed
    and the length handed to the model is the sample's own length.

    The model is put into evaluation mode for the duration of the call.
    Dropout is one of the searched hyper-parameters, and leaving it active
    would make the validation loss returned to the tuner random. The mode the
    model had on entry is restored before returning.

    Args:
        model: ``nn.Module`` called as ``model(inputs, lengths)``; returns logits.
        dataset: Indexable dataset yielding ``(token_ids, label)`` pairs.
        criterion: Loss callable applied to ``(logits, labels)``.

    Returns:
        Tuple ``(sum of per-sample losses / number of samples, accuracy)``.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    total_loss, total, correct = 0., 0, 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in dataloader:
                # inputs is (1, seq_len); the single sequence length is passed as a list
                outputs = model(inputs, [inputs[0].shape[0]])
                total_loss += criterion(outputs, labels).item()
                pred = torch.argmax(outputs, dim=-1)
                total += len(inputs)
                correct += (pred == labels).sum().item()
    finally:
        # hand the model back in the mode the caller left it in
        model.train(was_training)

    return total_loss / len(dataloader), correct / total


# Training loop.
def train(model, train_dataset, valid_dataset, batch_size, criterion, optimizer, epoch, device, collate_fn=None):
    """Train ``model`` and return the lowest validation loss reached.

    After every epoch the loss/accuracy on both datasets are printed; the
    running minimum of the validation loss is tracked and returned.

    Args:
        model: Module called as ``model(inputs, lengths)``.
        train_dataset: Dataset used for the parameter updates.
        valid_dataset: Dataset whose loss is minimised.
        batch_size: Mini-batch size of the training loader.
        criterion: Loss callable applied to ``(logits, labels)``.
        optimizer: Optimizer already bound to the model's parameters.
        epoch: Number of passes over ``train_dataset``.
        device: Accepted for interface compatibility. This function never
            moves the model, so batches are placed on the device the model's
            parameters already live on instead.
        collate_fn: Batch collation callable returning
            ``(inputs, labels, lengths)``.

    Returns:
        The minimum validation loss over all epochs (``inf`` if ``epoch`` is 0).
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    best_loss = float('inf')

    # Tensor.to() returns a new tensor; the previous code discarded that result,
    # so the batch was never moved. Follow the model's own device so the two
    # can never disagree.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch_idx in range(epoch):
        for inputs, labels, inputs_len in train_dataloader:
            optimizer.zero_grad()
            inputs = inputs.to(model_device)
            labels = labels.to(model_device)
            # inputs_len stays on the CPU: it feeds pack_padded_sequence, which
            # expects its lengths there (the old .to() call on it was a no-op)
            outputs = model(inputs, inputs_len)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        train_loss, train_acc = calc_loss_and_acc(model, train_dataset, criterion)
        valid_loss, valid_acc = calc_loss_and_acc(model, valid_dataset, criterion)

        print('epoch: {} done. '.format(epoch_idx + 1))
        print('train loss: {}\ttrain acc: {}'.format(train_loss, train_acc))
        print('valid loss: {}\tvalid acc: {}'.format(valid_loss, valid_acc))

        best_loss = min(best_loss, valid_loss)
        if valid_loss == best_loss:
            print('best loss: {}'.format(best_loss))

    return best_loss  # the minimum validation loss


def objective(trial):
    """Optuna objective: train a bidirectional RNN with sampled hyper-parameters.

    Args:
        trial: Optuna trial used to sample the search parameters.

    Returns:
        The lowest validation loss reported by ``train`` for this trial.
    """
    # A time limit per parameter combination could also be set.
    # Search space (the sampling order is kept as is).
    lr = trial.suggest_loguniform('learning_rate', 5e-4, 5e-2)
    batch_size = int(trial.suggest_discrete_uniform('batch_size', 16, 128, 16))
    n_layers = int(trial.suggest_int('num_layers', 1, 5))
    dropout_ratio = trial.suggest_uniform('drop_out', 0.0, 1.0)
    hidden_size = trial.suggest_categorical('hidden_size', [50, 128, 256])

    # Fixed settings
    emb_size, output_size, num_epochs = 300, 4, 5
    vocab_size = len(WORD2ID) + 1
    # NOTE(review): if word ids run from 1 to len(WORD2ID), this id is also the
    # id of the last vocabulary word -- confirm against the vocabulary file.
    padding_idx = vocab_size - 1

    model = biRNN(emb_size, padding_idx, output_size, hidden_size, n_layers, dropout_ratio)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adagrad(model.parameters(), lr_decay=0.001, lr=lr)
    return train(model, Train_dataset, Valid_dataset, batch_size, criterion, optimizer, num_epochs, device, collate_fn=Padsequence(padding_idx))


def main():
    """Run the Optuna study with a 3600-second timeout and print the best trial and its parameters."""
    study = optuna.create_study()
    study.optimize(objective, timeout=3600)  # search for up to one hour
    # The full list of trials can be dumped (e.g. with pandas), which may be worth a look.

    # report the outcome
    print('best trial:')
    for result in (study.best_trial, study.best_params):
        print(result)

# Start the hyper-parameter search only when this file is executed as a script.
if __name__ == "__main__":
    main()

Writing src/q88.py


## 89. 事前学習済み言語モデルからの転移学習
事前学習済み言語モデル（例えばBERTなど）を出発点として，ニュース記事見出しをカテゴリに分類するモデルを構築せよ．

In [29]:
# Read the text data as separate lists of titles and categories
train_path = '/Users/y_kishinami/Documents/100knock-2020/y-kishinami/chapter06/work/train.txt'
valid_path = '/Users/y_kishinami/Documents/100knock-2020/y-kishinami/chapter06/work/valid.txt'

def load_file_q89(path):
    """Load raw titles and category names from a ``title<TAB>category`` file.

    Titles are kept as plain strings (no word tokenisation) so that they can
    be handed to the BERT tokenizer unchanged. The file is decoded as UTF-8
    regardless of the platform default, and blank lines are skipped instead
    of failing the two-field unpacking.

    Args:
        path: Path of the tab-separated file.

    Returns:
        ``(x, y)``: the list of titles and the matching list of category names.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    x, y = [], []
    with open(path, encoding='utf-8') as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            title, category = line.split('\t')
            x.append(title)
            y.append(category)
    return x, y

x_train, y_train = load_file_q89(train_path)
x_valid, y_valid = load_file_q89(valid_path)
print(x_train[0])
# Round-trip the first title through the tokenizer: text -> tokens -> ids -> tokens
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x_train[0]))
print(ids)
print(tokenizer.convert_ids_to_tokens(ids))

Greek 10-year yields rise day after five-year sale
[3306, 2184, 1011, 2095, 16189, 4125, 2154, 2044, 2274, 1011, 2095, 5096]
['greek', '10', '-', 'year', 'yields', 'rise', 'day', 'after', 'five', '-', 'year', 'sale']


In [10]:
import transformers
from transformers import BertTokenizer, BertModel

class CreateDataset(Dataset):
    """Dataset encoding raw titles with a BERT tokenizer on access.

    Args:
        x: Sequence of title strings.
        y: Sequence of category names, aligned with ``x``.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every encoded title is truncated or padded to.
    """

    def __init__(self, x, y, tokenizer, max_len):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, category id)`` for sample ``index``."""
        text = self.x[index]
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``
            padding='max_length',
            truncation=True
        )
        return torch.LongTensor(inputs['input_ids']), torch.LongTensor(inputs['attention_mask']), cat2id(self.y[index])

In [16]:
# Pattern where text already tokenized with nltk is fed to BertTokenizer
max_len = 24
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset_train = CreateDataset(x_train, y_train, tokenizer, max_len)
train_dataloader = DataLoader(dataset_train, batch_size=2)
# Inspect the first sample of the first batch: its ids and the tokens they decode to
for data in islice(train_dataloader, 1):
    print(data[0][0])
    print(tokenizer.convert_ids_to_tokens(data[0][0]))

tensor([  101,   100,   100, 16189,  4125,  2154,  2044,   100,  5096,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
['[CLS]', '[UNK]', '[UNK]', 'yields', 'rise', 'day', 'after', '[UNK]', 'sale', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [30]:
# Pattern where the title is fed to BertTokenizer as-is
max_len = 24
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset_train = CreateDataset(x_train, y_train, tokenizer, max_len)
train_dataloader = DataLoader(dataset_train, batch_size=2)
# Inspect the first sample of the first batch: its ids and the tokens they decode to
for data in islice(train_dataloader, 1):
    print(data[0][0])
    print(tokenizer.convert_ids_to_tokens(data[0][0]))

tensor([  101,  3306,  2184,  1011,  2095, 16189,  4125,  2154,  2044,  2274,
         1011,  2095,  5096,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
['[CLS]', 'greek', '10', '-', 'year', 'yields', 'rise', 'day', 'after', 'five', '-', 'year', 'sale', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']




In [24]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        drop_rate: Dropout probability applied to BERT's pooled output.
        otuput_size: Number of categories (the parameter name is kept as is
            for compatibility with existing callers).
    """

    def __init__(self, drop_rate, otuput_size):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = torch.nn.Dropout(drop_rate)
        self.fc = torch.nn.Linear(768, otuput_size)

    def forward(self, ids, mask):
        """Return one logit per category for each sequence in the batch."""
        outputs = self.bert(ids, attention_mask=mask)
        # The dropout layer was built but never applied; use it on the pooled
        # output, as the training script's version of this class does.
        logit = self.fc(self.drop(outputs["pooler_output"]))
        return logit

In [25]:
# Smoke test: push one hand-written, padded example through a freshly built classifier
dropout = 0.4
output_size = 4
bert = BERTClass(dropout, output_size)

# one sequence of 20 positions; the mask is 1 on the first 10 positions and 0 on the rest
ids = torch.tensor([[101, 100, 100, 16189, 4125, 2154, 2044, 100, 5096, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
label = torch.tensor(0)  # not used by the call below

bert(ids, mask)

tensor([[-0.3950,  0.0542,  0.5064,  0.1149]], grad_fn=<AddmmBackward>)

#### 実際にGPU上での学習に使用したスクリプト

In [31]:
%%file src/q89.py
import argparse
from os import path
from tqdm import tqdm
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from collections import defaultdict
import torch.nn as nn
import nltk
from transformers import BertTokenizer, BertModel
from logzero import logger
from torch.utils.tensorboard import SummaryWriter
nltk.download('punkt')

# device: the first CUDA GPU when one is available, otherwise the CPU; the choice is printed
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


class CreateDataset(Dataset):
    """Dataset encoding raw titles with a BERT tokenizer on access.

    Args:
        x: Sequence of title strings.
        y: Indexable collection of labels, aligned with ``x``; items are
            returned unchanged.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_len: Length every encoded title is truncated or padded to.
    """

    def __init__(self, x, y, tokenizer, max_len=50):
        self.x = x
        self.y = y
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, label)`` for sample ``index``."""
        inputs = self.tokenizer.encode_plus(
            self.x[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            # ``padding='max_length'`` replaces the deprecated ``pad_to_max_length=True``
            padding='max_length'
        )
        return torch.LongTensor(inputs['input_ids']), torch.LongTensor(inputs['attention_mask']), self.y[index]

# Read a text file.
def load_file(path):
    """Load raw titles and category ids from a ``title<TAB>category`` file.

    The file is decoded as UTF-8 regardless of the platform default, and
    blank lines are skipped instead of failing the two-field unpacking. The
    label tensor is built directly as int64 rather than through a float
    tensor.

    Args:
        path: Path of the tab-separated file.

    Returns:
        ``(x, y)``: the list of title strings and a 1-D int64 tensor of
        category ids (b=0, t=1, e=2, m=3).

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
        KeyError: If a category is not one of 'b', 't', 'e', 'm'.
    """
    cate = {'b': 0, 't': 1, 'e': 2, 'm': 3}
    x, y = [], []
    with open(path, encoding='utf-8') as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            title, category = line.split('\t')
            x.append(title)
            y.append(cate[category])
    return x, torch.tensor(y, dtype=torch.long)

# Read the vocabulary file.
def load_vocab(vocab_path):
    """Load the ``word<TAB>id`` vocabulary file into a word -> id mapping.

    The file is decoded as UTF-8 regardless of the platform default, and
    blank lines are skipped instead of failing the two-field unpacking.

    Args:
        vocab_path: Path of the vocabulary file.

    Returns:
        ``defaultdict(int)`` mapping each word to its integer id.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            the id is not an integer.
    """
    word2id = defaultdict(int)
    with open(vocab_path, encoding='utf-8') as vocab_file:
        for line in vocab_file:
            line = line.strip()
            if not line:
                continue
            word, ids = line.split('\t')
            word2id[word] = int(ids)
    return word2id


class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    ``forward`` returns raw logits; the ``softmax`` attribute is created in
    the constructor but is not applied in ``forward``.
    """

    def __init__(self, drop_rate, otuput_size):
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        self.drop = torch.nn.Dropout(p=drop_rate)
        self.softmax = nn.LogSoftmax(dim=1)
        self.fc = torch.nn.Linear(768, otuput_size)

    def forward(self, ids, mask):
        """Return one logit per category for each sequence in the batch."""
        pooled = self.bert(ids, attention_mask=mask)["pooler_output"]
        regularised = self.drop(pooled)
        return self.fc(regularised)


# Compute the loss and the accuracy on a dataset.
def calc_loss_and_acc(model, dataset, criterion, batch_size, device):
    """Return ``(mean batch loss, accuracy)`` of ``model`` over ``dataset``.

    The model is put into evaluation mode for the duration of the call so that
    dropout does not randomise the reported metrics; the mode it had on entry
    is restored before returning.

    Args:
        model: ``nn.Module`` called as ``model(input_ids, attention_mask)``.
        dataset: Dataset yielding ``(input_ids, attention_mask, label)``.
        criterion: Loss callable applied to ``(logits, labels)``.
        batch_size: Mini-batch size used for evaluation.
        device: Device every batch is moved to before the forward pass.

    Returns:
        Tuple ``(sum of per-batch losses / number of batches, accuracy)``.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    total_loss, total, correct = 0., 0, 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in dataloader:
                inputs = data[0].to(device)
                mask = data[1].to(device)
                labels = data[2].to(device)
                outputs = model(inputs, mask)
                total_loss += criterion(outputs, labels).item()
                pred = torch.argmax(outputs, dim=-1)
                total += len(inputs)
                correct += (pred == labels).sum().item()
    finally:
        # hand the model back in the mode the caller left it in
        model.train(was_training)
    return total_loss / len(dataloader), correct / total


# Training loop.
def train(model, train_dataset, valid_dataset, batch_size, criterion, optimizer, epoch, device, writer, collate_fn=None):
    """Fine-tune ``model`` for ``epoch`` passes, logging metrics after each one.

    Each batch ``(input_ids, attention_mask, labels)`` is moved to ``device``
    before the forward pass. After every epoch the loss/accuracy on both
    datasets are written to ``writer`` (step = 1-based epoch number) and
    printed.
    """
    loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    for epoch_idx in range(epoch):
        for batch in loader:
            optimizer.zero_grad()
            inputs, mask, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            loss = criterion(model(inputs, mask), labels)
            loss.backward()
            optimizer.step()

        train_loss, train_acc = calc_loss_and_acc(model, train_dataset, criterion, batch_size, device)
        valid_loss, valid_acc = calc_loss_and_acc(model, valid_dataset, criterion, batch_size, device)

        step = epoch_idx + 1
        metrics = (
            ('train_loss', train_loss),
            ('train_acc', train_acc),
            ('valid_loss', valid_loss),
            ('valid_acc', valid_acc),
        )
        for tag, value in metrics:
            writer.add_scalar(tag, value, step)
        print('epoch: {} done. '.format(step))
        print('train loss: {}\ttrain acc: {}'.format(train_loss, train_acc))
        print('valid loss: {}\tvalid acc: {}'.format(valid_loss, valid_acc))


# Command-line arguments.
def create_parser():
    """Build the argument parser of the BERT fine-tuning script.

    Options are declared in a table of ``(flag, default, type, help)`` rows
    and registered in that order; path options use ``os.path.abspath`` as
    their type so the parsed values are absolute paths.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    option_table = (
        ('--vocab_path', '/work01/y_kishinami/100knock-2020/chapter09/work/vocab.txt', path.abspath, 'Path to vocabulary file'),
        ('--train_path', '/work01/y_kishinami/100knock-2020/chapter09/work/train.txt', path.abspath, 'Path to train data file'),
        ('--valid_path', '/work01/y_kishinami/100knock-2020/chapter09/work/valid.txt', path.abspath, 'Path to valid data file'),
        ('--log_dir', '/work01/y_kishinami/100knock-2020/chapter09/work/logs/q89', path.abspath, 'tensorboard log dir'),
        ('--output_size', 4, int, 'dimension of output layer'),
        ('--hidden_size', 50, int, 'dimension of hidden layer'),
        ('--batch_size', 1, int, 'batch size'),
        ('--lr', 0.00001, float, 'learning late'),
        ('--epoch', 10, int, 'the number of epoch'),
    )

    parser = argparse.ArgumentParser(description='hogehoge')
    for flag, default_value, converter, description in option_table:
        parser.add_argument(flag, default=default_value, type=converter, help=description)
    return parser


def main():
    """Parse the command line, build the datasets and model, and train.

    The learning rate is taken from ``--lr``, and the TensorBoard writer is
    always closed, even when training fails, so buffered events are not lost.

    NOTE(review): ``device`` is not defined in this function; it is assumed
    to be a module-level name defined elsewhere in the file -- confirm.
    """
    # Command-line arguments
    parser = create_parser()
    args = parser.parse_args()
    logger.info(args)
    writer = SummaryWriter(log_dir=args.log_dir)

    try:
        # Load the vocabulary (word -> id mapping); only its size is logged here.
        word2id = load_vocab(args.vocab_path)
        vocab_size = len(word2id) + 1
        logger.info('vocabulary loaded. vocab size: {}'.format(vocab_size))

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Load the datasets
        logger.info('dataset loading ...')
        x_train, y_train = load_file(args.train_path)
        x_valid, y_valid = load_file(args.valid_path)
        train_dataset = CreateDataset(x_train, y_train, tokenizer)
        valid_dataset = CreateDataset(x_valid, y_valid, tokenizer)
        logger.info('dataset loaded. ')
        logger.info('train data: {} samples'.format(len(train_dataset)))
        logger.info('valid data: {} samples'.format(len(valid_dataset)))

        # Initialise the model
        # (the first argument, 0.4, is presumably the dropout rate -- confirm
        # against the BERTClass definition)
        bert = BERTClass(0.4, args.output_size).to(device)
        criterion = nn.CrossEntropyLoss()
        # Honour the --lr option; its default matches the previously hard-coded 0.00001.
        optimizer = torch.optim.AdamW(bert.parameters(), lr=args.lr)

        # Training
        logger.info('training start !')
        train(bert, train_dataset, valid_dataset, args.batch_size, criterion, optimizer, args.epoch, device, writer)
        logger.info('training done !')
    finally:
        # Flush and release the event file regardless of how training ended.
        writer.close()

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Overwriting src/q89.py
