## モジュールの準備

In [1]:
# spacy/ginza
!pip install ginza
# ginzaのためのパッケージリソース一覧をリロード
# https://www.sololance.tokyo/2019/10/colab-load-ginza.html
import pkg_resources, imp
imp.reload(pkg_resources)

Collecting ginza
  Downloading https://files.pythonhosted.org/packages/a6/cc/5d3a9230cf3dd8426d0fc147133eb49913acdb8a6c8828320a7c8e2ae8b9/ginza-4.0.5.tar.gz
Collecting spacy<3.0.0,>=2.3.2
[?25l  Downloading https://files.pythonhosted.org/packages/10/b5/c7a92c7ce5d4b353b70b4b5b4385687206c8b230ddfe08746ab0fd310a3a/spacy-2.3.2-cp36-cp36m-manylinux1_x86_64.whl (9.9MB)
[K     |████████████████████████████████| 10.0MB 9.5MB/s 
[?25hCollecting ja_ginza<4.1.0,>=4.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/89/7d/c8778b5472082da8488b2686cbab3e34ab371fa038c1a712a8d55d8dba2b/ja_ginza-4.0.0.tar.gz (51.5MB)
[K     |████████████████████████████████| 51.5MB 56kB/s 
[?25hCollecting SudachiPy>=0.4.9
[?25l  Downloading https://files.pythonhosted.org/packages/71/18/8531c4a1c904cb24d37a09e7dff3273857ee65e0d0adb62d09c6f5492f5d/SudachiPy-0.4.9.tar.gz (67kB)
[K     |████████████████████████████████| 71kB 8.0MB/s 
[?25hCollecting SudachiDict-core>=20200330
  Downloading https://fil

<module 'pkg_resources' from '/usr/local/lib/python3.6/dist-packages/pkg_resources/__init__.py'>

## 単語ベクトルの確認

In [2]:
import spacy
import pandas as pd

# Widen the column display so a useful slice of each vector is visible.
pd.set_option('max_colwidth', 100)

nlp = spacy.load('ja_ginza')
sentence = '私は焼肉をよく食べる'
doc = nlp(sentence)

# One row per token: the token itself and the word vector attached to it.
tokens = list(doc)
vectors = [token.vector for token in tokens]

pd.DataFrame({'token': tokens, 'vector': vectors})

Unnamed: 0,token,vector
0,私,"[-0.13697676, -0.23937745, 0.045566633, -0.20059128, -0.08979624, 0.11869049, -0.03709601, -0.14..."
1,は,"[-0.05035316, -0.15731327, -0.08336552, -0.15989235, -0.12370043, -0.0015842685, -0.015121695, -..."
2,焼肉,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,を,"[-0.19509408, -0.13202968, -0.018012488, -0.12985665, -0.11748332, 0.16473995, -0.08152997, -0.0..."
4,よく,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,食べる,"[-0.13252193, -0.40151066, 0.04001627, -0.08204308, -0.013501916, 0.23513791, 0.2380793, -0.3093..."


In [3]:
# Shape (dimensionality) of the first token's word vector.
vectors[0].shape

(300,)

## データの準備

In [4]:
!mkdir data
!wget http://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2 -O data/KNBC_v1.0_090925_utf8.tar.bz2

--2020-11-02 09:14:25--  http://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2
Resolving nlp.ist.i.kyoto-u.ac.jp (nlp.ist.i.kyoto-u.ac.jp)... 133.242.249.182
Connecting to nlp.ist.i.kyoto-u.ac.jp (nlp.ist.i.kyoto-u.ac.jp)|133.242.249.182|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4968025 (4.7M) [application/x-bzip2]
Saving to: ‘data/KNBC_v1.0_090925_utf8.tar.bz2’


2020-11-02 09:14:27 (2.07 MB/s) - ‘data/KNBC_v1.0_090925_utf8.tar.bz2’ saved [4968025/4968025]



In [5]:
%cd data
!tar xvf KNBC_v1.0_090925_utf8.tar.bz2
%cd ..

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
KNBC_v1.0_090925_utf8/html/KN221_Keitai_1-1-12-01-morph.html
KNBC_v1.0_090925_utf8/html/KN012_Sports_2-1-31-01.html
KNBC_v1.0_090925_utf8/html/KN215_Kyoto_1-1-26-01.html
KNBC_v1.0_090925_utf8/html/KN213_Keitai_1-1-3-01-morph.html
KNBC_v1.0_090925_utf8/html/KN039_Kyoto_1-1-28-01-morph.html
KNBC_v1.0_090925_utf8/html/KN238_Gourmet_1-1-26-01-morph.html
KNBC_v1.0_090925_utf8/html/KN217_Kyoto_1-1-20-01.html
KNBC_v1.0_090925_utf8/html/KN242_Kyoto_2-1-20-01.html
KNBC_v1.0_090925_utf8/html/KN209_Keitai_1-1-56-01-morph.html
KNBC_v1.0_090925_utf8/html/KN255_Kyoto_1-1-8-01-morph.html
KNBC_v1.0_090925_utf8/html/KN258_Keitai_1-1-8-02-morph.html
KNBC_v1.0_090925_utf8/html/KN002_Keitai_1-1-2-01-morph.html
KNBC_v1.0_090925_utf8/html/KN025_Kyoto_1-1-7-01-morph.html
KNBC_v1.0_090925_utf8/html/KN243_Gourmet_1-1-7-01-morph.html
KNBC_v1.0_090925_utf8/html/KN205_Gourmet_2-1-4-01-morph.html
KNBC_v1.0_090925_utf8/html/KN003_Kyoto_1-1-11-01.html
KNBC_v1.0_090925_ut

In [6]:
import re
import pandas as pd
import numpy as np

def get_sentences_from_text(filename):
  """Read the sentences stored in a tab-separated corpus file.

  Each line is split on tabs and the second column is taken as the sentence.

  Args:
    filename: Path of the UTF-8 encoded TSV file to read.

  Returns:
    A list of stripped sentence strings, in file order. Lines without a second
    column, empty sentences, and sentences starting with "http" are skipped.
  """
  sentences = []
  # The corpus is UTF-8; do not depend on the platform's default encoding.
  with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
      columns = line.split('\t')
      if len(columns) < 2: # No sentence column (e.g. a blank line): skip instead of raising IndexError.
        continue
      sentence = columns[1].strip()
      if sentence == '': # Drop empty sentences.
        continue
      if re.match('^http.*$', sentence): # Drop URLs.
        continue
      sentences.append(sentence)
  return sentences

In [7]:
import os

root_dir = 'data/KNBC_v1.0_090925_utf8/corpus2'
targets = ['Gourmet', 'Keitai', 'Kyoto', 'Sports']

# One [category, sentence] row per sentence; each category is read from its own <category>.tsv file.
original_data = [
    [target_name, text]
    for target_name in targets
    for text in get_sentences_from_text(os.path.join(root_dir, f'{target_name}.tsv'))
]

original_df = pd.DataFrame(original_data, columns=['target', 'sentence'])

In [8]:
# Peek at both ends of the corpus, then show how many sentences each category contributes.
display(original_df.head(), original_df.tail())
display(original_df['target'].value_counts().to_frame())

Unnamed: 0,target,sentence
0,Gourmet,［グルメ］烏丸六角のおかき屋さん
1,Gourmet,六角堂の前にある、蕪村庵というお店に行ってきた。
2,Gourmet,おかきやせんべいの店なのだが、これがオイシイ。
3,Gourmet,のれんをくぐると小さな庭があり、その先に町屋風の店内がある。
4,Gourmet,せんべいの箱はデパートみたいな山積みではなく、間隔をあけて陳列されているのがまた良い。


Unnamed: 0,target,sentence
4181,Sports,筋力が違う！！
4182,Sports,なんか神様、不公平・・・
4183,Sports,男性諸君、このこと忘れないでやぁ（＞◆＜）
4184,Sports,まぁ。。。
4185,Sports,女はいろいろ強いし、怖いけどね笑


Unnamed: 0,target
Kyoto,1498
Keitai,1278
Gourmet,888
Sports,522


## LSTMによる分類タスク

### トレーニング用データに変換

In [9]:
import spacy
import numpy as np

# Load the GiNZA pipeline used below to tokenise sentences and read token vectors.
nlp = spacy.load('ja_ginza')
# One-hot table over the categories: target2index[name].values is the label vector
# for `name`, with positions in the same order as the `targets` list.
target2index = pd.get_dummies(targets)

def get_features_and_labels_for_spacy(original_df):
  """Turn sentences into per-token word-vector sequences plus one-hot labels.

  Relies on the module-level `nlp` pipeline and the `target2index` one-hot table.

  Args:
    original_df: DataFrame with a 'sentence' column (text) and a 'target'
      column (a key of `target2index`).

  Returns:
    A tuple (features, labels, max_feature_len):
      features: when every sentence has the same number of tokens, a regular
        array built from the nested token-vector lists; otherwise a 1-D object
        array holding one list of token vectors per sentence.
      labels: array with one `target2index[target].values` row per sentence.
      max_feature_len: largest token count over all sentences (0 if there are none).
  """
  features = []
  labels = []
  max_feature_len = 0
  for sentence, target in zip(original_df['sentence'], original_df['target']):
    doc = nlp(sentence)
    feature = [token.vector for token in doc]
    max_feature_len = max(max_feature_len, len(feature))

    features.append(feature)
    labels.append(target2index[target].values)

  if len({len(feature) for feature in features}) <= 1:
    # Uniform lengths: a regular array can be built directly.
    feature_array = np.asarray(features)
  else:
    # Ragged lengths: np.asarray() on ragged nested lists is deprecated and raises
    # ValueError on recent NumPy, so fill an object array element by element.
    feature_array = np.empty(len(features), dtype=object)
    for i, feature in enumerate(features):
      feature_array[i] = feature

  return feature_array, np.asarray(labels), max_feature_len

In [10]:
from sklearn.model_selection import train_test_split

# Vectorise every sentence of the corpus; max_feature_len is the largest
# number of tokens found in any single sentence.
features, labels, max_feature_len = get_features_and_labels_for_spacy(original_df)
print(max_feature_len)

134


In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed seed so the train/test split -- and the metrics reported below -- can be reproduced.
SPLIT_SEED = 42

# dtype must be given explicitly: pad_sequences defaults to int32, which would
# truncate the float word vectors.
features = pad_sequences(features, maxlen=max_feature_len, dtype='float32')
(train_features, test_features, train_labels, test_labels) = train_test_split(
    features, labels, test_size=0.2, random_state=SPLIT_SEED)

print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)
print(test_labels.shape)

(3348, 134, 300)
(3348, 4)
(838, 134, 300)
(838, 4)


### モデル構築

In [12]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

# Sentence classifier: padded sequence of word vectors -> LSTM(64) -> softmax
# with one output unit per entry of `targets`.
inputs = Input(shape=(max_feature_len, train_features.shape[2]))
x = LSTM(64)(inputs)
outputs = Dense(len(targets), activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
# The labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', 'mse'])
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 134, 300)]        0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                93440     
_________________________________________________________________
dense (Dense)                (None, 4)                 260       
Total params: 93,700
Trainable params: 93,700
Non-trainable params: 0
_________________________________________________________________


### トレーニング実行

In [13]:
# Train for 3 epochs, holding out 10% of the training data for validation.
model.fit(train_features, train_labels, validation_split=0.1, verbose=1, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f1a0b959fd0>

### クラシフィケーションレポート

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# Reduce both the softmax outputs and the one-hot ground truth to class indices.
class_probabilities = model.predict(test_features)
predicted_test_labels = class_probabilities.argmax(axis=1)
numeric_test_labels = test_labels.argmax(axis=1)

# Per-class precision / recall / F1, returned as a dict so it can be shown as a table.
report = classification_report(
    numeric_test_labels,
    predicted_test_labels,
    target_names=targets,
    output_dict=True,
)

display(pd.DataFrame(report).transpose())

Unnamed: 0,precision,recall,f1-score,support
Gourmet,0.762963,0.588571,0.664516,175.0
Keitai,0.723404,0.787645,0.754159,259.0
Kyoto,0.748201,0.695652,0.720971,299.0
Sports,0.517483,0.704762,0.596774,105.0
accuracy,0.702864,0.702864,0.702864,0.702864
macro avg,0.688013,0.694158,0.684105,838.0
weighted avg,0.714711,0.702864,0.703877,838.0


### 手動で評価

In [15]:
# The target given here is only a placeholder; any category works.
check_data = ('Kyoto', '金閣寺が見たいです。')

# Run the single sentence through the same vectorise -> pad -> predict path as the corpus.
check_df = pd.DataFrame([check_data], columns=['target', 'sentence'])
raw_check_features = get_features_and_labels_for_spacy(check_df)[0]
check_features = pad_sequences(raw_check_features, maxlen=max_feature_len, dtype='float32')
check_predict = model.predict(check_features)[0]

# Report the most probable category together with its probability in percent.
best_index = check_predict.argmax()
confidence = round(check_predict[best_index] * 100, 1)
print(f'{targets[best_index]}: {confidence}%')

Kyoto: 97.6%


## LSTMによる自然言語生成

### トレーニング用データに変換

In [55]:
input_len = 8

def get_features_and_labels_for_nlg(original_df):
    """Build next-token training pairs with a sliding window over each sentence.

    Relies on the module-level `nlp` pipeline and `input_len` window size.

    Args:
        original_df: DataFrame with a 'sentence' column of text.

    Returns:
        A tuple (features, labels): `features` is an array built from one list of
        `input_len` consecutive token vectors per window, and `labels` is a list
        holding the text of the token that follows each window.
    """
    features = []
    labels = []
    for sentence in original_df['sentence']:
        doc = nlp(sentence)
        # The range is empty when the sentence has no token beyond the window,
        # so sentences of input_len tokens or fewer contribute nothing.
        for start in range(len(doc) - input_len):
            end = start + input_len
            features.append([token.vector for token in doc[start:end]])
            labels.append(doc[end].text)

    return np.array(features), labels

In [56]:
# Sliding-window features and next-token labels for the generator.
# NOTE(review): this rebinds `features` / `labels`, which earlier held the classification data.
features, labels = get_features_and_labels_for_nlg(original_df)

In [57]:
# Vocabulary of next-token labels. sorted() makes the token <-> index mapping
# deterministic: iterating a bare set of strings can yield a different order in
# every kernel session, which would silently change what each output unit means.
token2index = {label: i for i, label in enumerate(sorted(set(labels)))}
index2token = {i: token for token, i in token2index.items()}

In [58]:
from tensorflow.keras.utils import to_categorical

# Map each next-token label to its vocabulary index and one-hot encode it (one row per training window).
onehot_labels = to_categorical([token2index[label] for label in labels])

### モデル構築

In [59]:
from tensorflow.keras.layers import Input, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Model

# Next-token predictor: window of word vectors -> bidirectional LSTM(256) -> dropout
# -> softmax with one output unit per column of onehot_labels.
# NOTE(review): this rebinds `model`, replacing the classifier built earlier in the notebook.
inputs = Input(shape=(features.shape[1], features.shape[2]))
x = Bidirectional(LSTM(256))(inputs)
x = Dropout(0.1)(x)
outputs = Dense(onehot_labels.shape[1], activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
# The labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', 'mse'])
model.summary()

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 8, 300)]          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 512)               1140736   
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 5553)              2848689   
Total params: 3,989,425
Trainable params: 3,989,425
Non-trainable params: 0
_________________________________________________________________


### トレーニング実行

In [60]:
# Train for 10 epochs, holding out 10% of the windows for validation.
model.fit(features, onehot_labels, validation_split=0.1, verbose=1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f19d4eb8e10>

### 実験

In [61]:
def sample_with_temp(preds, temperature=1.0):
    """Sample an index from a probability vector after temperature scaling.

    The log-probabilities are divided by `temperature` and re-normalised with a
    softmax, then a single index is drawn from the resulting distribution.
    Values below 1 sharpen the distribution, values above 1 flatten it.

    Args:
        preds: 1-D sequence of probabilities (e.g. one row of a softmax output).
        temperature: Non-zero scaling factor; the log-probabilities are divided by it.

    Returns:
        The sampled index (as returned by np.argmax on the one-hot draw).
    """
    preds = np.asarray(preds).astype('float64')
    # A probability of exactly 0 becomes a logit of -inf (and maps back to 0);
    # that is intended, so silence the log(0) warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0 before exponentiating. Without the shift a
    # low temperature makes every exp() underflow to 0, and the normalisation
    # below turns into 0/0 = NaN.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probs = np.random.multinomial(1, preds, 1)

    return np.argmax(probs)

In [91]:
seq = 20  # Maximum number of tokens to generate.
test_sentence = 'この季節は京都に行って'
display_sentence = test_sentence

for _ in range(seq):
    # Re-tokenise everything generated so far and keep the last input_len tokens as context.
    test_doc = nlp(test_sentence)[-input_len:]
    # Deliberately not named `test_features`: that name holds the classification
    # test set produced by train_test_split earlier in the notebook.
    context_features = np.array([[token.vector for token in test_doc]])
    preds = model.predict(context_features)
    # Sample (not argmax) the next token; a temperature of 0.3 keeps the choice close to the mode.
    sampled_index = sample_with_temp(preds[0], 0.3)
    next_token = index2token[sampled_index]
    test_sentence += next_token
    display_sentence = display_sentence + '|' + next_token
    if next_token == '。':  # Stop at the end of the sentence.
        break

print(display_sentence)

この季節は京都に行って|くる|こと|で|ある|、|その|商売|っけ|が|神社|に|ある|だけ|に|余計|見苦しく|見える|。
