In [1]:
!pip install janome



In [2]:
import os,sys
import json
import numpy as np
import pandas as pd

from janome.tokenizer import Tokenizer
import re

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.utils.data import DataLoader

from torchtext.legacy import data
from torchtext.legacy import datasets
from torchtext.vocab import FastText
import torchtext.legacy as torchtext

from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!nvidia-smi

Sat Sep 11 09:53:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
# 訓練に使用するデバイス
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using {} device'.format(device))

Using cuda device


In [6]:
# dataclean 
# 参考 : 
# 1.https://qiita.com/Hironsan/items/2466fe0f344115aff177
# 2.https://www.oio-blog.com/contents/preprocessing
# tokenizerの定義
def clean_tokenizer(text):
  j_t = Tokenizer()
  # 記号除去
  pattern = '[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”◇ᴗ●↓→♪★⊂⊃※△□◎〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％＞＜]'
  text = re.sub(pattern, '', text)
  return [tok for tok in j_t.tokenize(text, wakati=True)]

# test
print(clean_tokenizer("お姉さんはお元気ですか、スティーブ。"))

['お', '姉さん', 'は', 'お', '元気', 'です', 'か', 'スティーブ']


In [7]:
def label_preprocessing(label):
  if label == 'how':
    return 0
  elif label == 'what':
    return 1
  elif label == 'when':
    return 2
  elif label == 'where':
    return 3
  elif label == 'who':
    return 4
  elif label == 'why':
    return 5
  elif label == 'plain':
    return 6
  elif label == 'please':
    return 7
  else:
    return 8

In [8]:
# Dataset(train_dataset)のFieldを定義
text_field = torchtext.data.Field(
    init_token = '<SOS>',
    eos_token = '<EOS>',
    use_vocab = True,
    lower = True,
    tokenize = clean_tokenizer,
    is_target = False
)
# Dataset(test_dataset)のFieldを定義
label_field = torchtext.data.Field(
    sequential = False,
    use_vocab = False,
    preprocessing = torchtext.data.Pipeline(label_preprocessing),
    is_target = True
)

# csvファイルのロード
dataset = torchtext.data.TabularDataset(
    path = "drive/My Drive/kenkyu/data/data_augment.csv",
    format = 'csv',
    fields = [('data', text_field), ('label', label_field)],
    skip_header = False
)
# train_dataset : 5955
# test_dataset : 1489
train_dataset, test_dataset = dataset.split(
    split_ratio = 0.8
)
print('train_dataset : {}'.format(len(train_dataset)))
print('test_dataset : {}'.format(len(test_dataset)))

train_dataset : 5950
test_dataset : 1488


In [9]:
# vocabularyの作成
# 学習済み単語分散表現としてFastTextを使用
text_field.build_vocab(train_dataset, vectors = FastText(language = 'ja'), min_freq = 2)
text_field.vocab.vectors.size()

torch.Size([2529, 300])

In [10]:
# integer to string 辞書
print(text_field.vocab.itos)
# string to integer 辞書
print(text_field.vocab.stoi)
# 単語カウント結果
print(text_field.vocab.freqs)
# vectors
print(text_field.vocab.vectors)
# size of vectors
text_field.vocab.vectors.size()
# train_data
for idx in range(10):
  print('label : {0}, data : {1}'.format(vars(train_dataset[idx])['label'], vars(train_dataset[idx])['data']))

['<unk>', '<pad>', '<SOS>', '<EOS>', 'の', 'は', 'か', 'に', 'を', 'て', 'た', 'です', 'が', 'ます', 'し', 'どこ', 'ん', 'で', 'な', 'どう', 'いる', 'と', '何', 'い', 'だれ', 'う', 'こと', 'だ', 'あなた', 'ない', 'いつ', 'する', 'たち', 'いい', 'この', 'つ', 'も', 'ある', 'お', 'から', 'たら', '私', 'よ', 'でしょ', 'わたし', '人', 'だい', 'それ', 'ませ', 'まし', 'いただけ', 'へ', 'その', 'どうして', 'ね', 'たい', 'かしら', 'だっ', 'なぜ', 'これ', 'よう', 'どんな', 'だろ', 'ここ', 'さん', 'くれ', 'ば', 'どの', 'やっ', 'あり', 'もらえ', '彼', '行く', '行っ', '誰', 'なっ', 'って', 'について', 'まで', 'ぼく', '彼女', 'れ', '言っ', '来', 'あの', 'でき', 'くらい', 'ご', 'さ', 'なかっ', '行き', '今', 'どういう', 'もの', '時', '自分', '時間', '君', 'ため', 'できる', 'でし', '1', 'いかが', '思い', 'そう', '教え', 'くれる', 'み', 'わ', '的', 'かい', '好き', '見', 'られ', 'いっしょ', 'お願い', '仕事', '日本', '\u3000', 'きみ', 'き', '話し', '電話', 'なり', '先生', 'アメリカ', '新しい', '…', 'だけ', '思う', '持っ', '日', '次', '写真', 'いくら', 'てる', 'ほか', 'そこ', 'や', '必要', '知っ', 'とき', 'なく', 'という', 'ましょ', 'られる', '中', '店', 'なる', '前', '家', '何で', '意味', '話', 'じゃ', '手', '書い', '食べ', ' ', 'すれ', 'ところ', '者', 'ちゃん', 'つもり', 'でも', '僕', '映画', '住ん

In [11]:
class LSTM_Classifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size, device = None, emb_vec = None):
        
        super(LSTM_Classifier, self).__init__()
        
        self.model_type = 'LSTM'
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.device = device
        self.emb_vec = emb_vec

        # define model
        self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.embedding_layer.weight.data.copy_(self.emb_vec)
        self.lstm_layer = nn.LSTM(self.embedding_dim, self.hidden_dim)
        self.hidden2out_layer = nn.Linear(self.hidden_dim, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def init_hidden(self, batch_size):
        h0 = autograd.Variable(torch.randn(1, batch_size, self.hidden_dim))
        c0 = autograd.Variable(torch.randn(1, batch_size, self.hidden_dim))
        if self.device != None:
          h0 = h0.to(self.device)
          c0 = c0.to(self.device)
        return (h0, c0)

    def forward(self, batch):
        self.hidden = self.init_hidden(batch.size(-1))
        embeddings = self.embedding_layer(batch)
        #embeddings = embeddings.view(len(batch), 1, -1)
        lstm_out, (ht, ct) = self.lstm_layer(embeddings, self.hidden)
        # lstm_out = lstm_out[0].view(-1, self.hidden_dim)
        output = self.hidden2out_layer(ht[-1])
        output = self.softmax(output)

        return output

In [15]:
# 単語埋め込み次元
embedding_dim = 300
# LSTMの隠れ層の次元
hidden_dim = 128
# dataset全体の単語数
vocab_size = len(text_field.vocab)
# 予測class数
class_num = 9
# 学習係数 
lr = 0.001
# エポック数
epochs = 60
# model
model = LSTM_Classifier(embedding_dim, hidden_dim, vocab_size, class_num, device, text_field.vocab.vectors)
model.to(device)
# バッチサイズ
batch_size = 4
# loss func
criterion = torch.nn.CrossEntropyLoss().to(device)
# optimizer
optimizer = optim.Adam(model.parameters(), lr = lr)
# scheduler
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [16]:
def train_func(_train):
  model.train()   # modelを訓練モードに
  train_loss = 0.
  train_acc = 0.
  # mini-batch生成
  train_dataloader = torchtext.data.BucketIterator(
      _train,
      batch_size = batch_size,
      sort_key = lambda x: len(x.data),
      device = device,
      repeat = False
  )
  for batch in train_dataloader:
    text, label = batch.data, batch.label
    optimizer.zero_grad()
    output = model(text)
    loss = criterion(output, label)
    train_loss += loss.item()
    loss.backward()
    optimizer.step()
    train_acc += (output.argmax(1) == label).sum().item()
  return train_loss / len(_train), train_acc / len(_train) 


def evaluate_func(_test):
  model.eval()    # modelを推論モードに
  loss = 0.
  acc = 0.
  #mini-batch生成
  test_dataloader = torchtext.data.BucketIterator(
      test_dataset,
      batch_size = batch_size,
      sort_key = lambda x: len(x.data),
      device = device,
      repeat = False
  )
  for batch in test_dataloader:
    text, label = batch.data, batch.label
    with torch.no_grad():
      output = model(text)
      loss = criterion(output, label)
      loss += loss.item()
      acc += (output.argmax(1) == label).sum().item()
  return loss / len(_test), acc / len(_test)

for e in range(epochs):
  loss, acc = train_func(train_dataset)
  print('epoch : {}'.format(e))
  print('train_loss : {0}, train_acc : {1}'.format(loss, acc))

loss, acc = evaluate_func(test_dataset)
print('tes_loss : {0} test_acc : {1}'.format(loss, acc))


epoch : 0
0.5176109016041796 0.20470588235294118
epoch : 1
0.4873287483824401 0.25563025210084034
epoch : 2
0.47138992110220324 0.2752941176470588
epoch : 3
0.4599766258231732 0.29126050420168065
epoch : 4
0.42245340499557366 0.3665546218487395
epoch : 5
0.36351098795898823 0.4403361344537815
epoch : 6
0.32743947366706466 0.48369747899159665
epoch : 7
0.3037175061547456 0.5126050420168067
epoch : 8
0.29561591937261467 0.5324369747899159
epoch : 9
0.27619129512239904 0.5621848739495798
epoch : 10
0.26807599743624694 0.5726050420168067
epoch : 11
0.25947954138286977 0.5905882352941176
epoch : 12
0.2545074642956758 0.6020168067226891
epoch : 13
0.24778911700644413 0.6152941176470588
epoch : 14
0.24043824092743277 0.6238655462184874
epoch : 15
0.24048581703307748 0.6284033613445378
epoch : 16
0.23129674720663984 0.6534453781512605
epoch : 17
0.21570071533823212 0.6831932773109244
epoch : 18
0.20305475039141518 0.706890756302521
epoch : 19
0.19070180424982133 0.7272268907563025
epoch : 20
0