In [1]:
import tensorflow as tf

gpuConfig = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=1.0),
    device_count={'GPU': 1})

sess = tf.Session(config=gpuConfig)

In [2]:
from keras.layers import Input, Flatten, Dense, Concatenate, Reshape, Conv1D, MaxPooling1D, Permute, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.callbacks import TensorBoard, ModelCheckpoint, LambdaCallback, CSVLogger, LearningRateScheduler, ReduceLROnPlateau
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD
import pandas as pd
import numpy as np
import random
import os
import math
from keras import backend as K
import csv
from random import choice
import pickle
import json
import zipfile
import keras
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn import preprocessing
import matplotlib.pyplot as plt
from janome.tokenizer import Tokenizer
import re
from gensim.models.word2vec import Word2Vec

K.clear_session()

Using TensorFlow backend.


In [3]:
word2vec_jp_model = Word2Vec.load('model/word2vec.gensim.model')

In [4]:
print(word2vec_jp_model.wv.similarity(u'宇多田ヒカル', u'浜崎あゆみ'), word2vec_jp_model.wv.similarity(u'宇多田ヒカル', u'ビートルズ'))

0.924207654927 0.646280544106


In [5]:
data = pd.read_csv('./data/intent_data_jp_ja.csv', sep=',', names=['text', 'intent'])

In [6]:
data.shape

(4207, 2)

In [7]:
data.head(3)

Unnamed: 0,text,intent
0,ラジオ日本聞きたい,JORF
1,ラジオ日本を聞かせて,JORF
2,ラジオ日本を再生,JORF


In [8]:
le = preprocessing.LabelEncoder()

data['label'] = le.fit_transform(data['intent'])

In [9]:
data.shape

(4207, 3)

In [10]:
data.head(3)

Unnamed: 0,text,intent,label
0,ラジオ日本聞きたい,JORF,5
1,ラジオ日本を聞かせて,JORF,5
2,ラジオ日本を再生,JORF,5


In [11]:
data = data.drop(['intent'], axis=1)

In [12]:
data.head(3)

Unnamed: 0,text,label
0,ラジオ日本聞きたい,5
1,ラジオ日本を聞かせて,5
2,ラジオ日本を再生,5


In [13]:
x_train_val, x_test, y_train_val, y_test, = train_test_split(data, 
                                                   to_categorical(data['label']), 
                                                   test_size = .2, 
                                                   random_state=12)

In [14]:
x_train_val.head(3)

Unnamed: 0,text,label
1053,ほうそうだいがくを聞きたい,3
176,ラジオにほんを再生して欲しいのですけど,5
4111,松任谷由実の動画を youtube 再生して,17


In [15]:
x_test.head(3)

Unnamed: 0,text,label
913,放送大学を再生してください,3
1387,nhk r 2が聞きたいと思います,19
667,ナックファイブごが聞きたいと思います,7


In [16]:
j_tokenizer = Tokenizer()

def wakati_reading(text):
    tokens = j_tokenizer.tokenize(text.replace("'", "").lower())
    
    exclude_pos = [u'助詞',u'助動詞']
    
    #分かち書き
    tokens_w_space = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
        
        if partOfSpeech not in exclude_pos:
            tokens_w_space = tokens_w_space + " " + token.surface

    tokens_w_space = tokens_w_space.strip()
    
    #読み方
    tokens_reading = ""
    for token in tokens:
        partOfSpeech = token.part_of_speech.split(',')[0]
 
        if partOfSpeech not in exclude_pos:
            if token.reading != "*":
                tokens_reading = tokens_reading + " " + token.reading
            elif re.match('^[a-z]+$', token.base_form):
                alpha_reading = ""
                alpha_reading = token.base_form.replace("a", "エー ")
                alpha_reading = alpha_reading.replace("b", "ビー ")
                alpha_reading = alpha_reading.replace("c", "シー ")
                alpha_reading = alpha_reading.replace("d", "ディー ")
                alpha_reading = alpha_reading.replace("e", "イー ")
                alpha_reading = alpha_reading.replace("f", "エフ ")
                alpha_reading = alpha_reading.replace("g", "ジー ")
                alpha_reading = alpha_reading.replace("h", "エイチ ")
                alpha_reading = alpha_reading.replace("i", "アイ ")
                alpha_reading = alpha_reading.replace("j", "ジェー ")
                alpha_reading = alpha_reading.replace("k", "ケー ")
                alpha_reading = alpha_reading.replace("l", "エル ")
                alpha_reading = alpha_reading.replace("m", "エム ")
                alpha_reading = alpha_reading.replace("n", "エヌ ")
                alpha_reading = alpha_reading.replace("o", "オー ")
                alpha_reading = alpha_reading.replace("p", "ピー ")
                alpha_reading = alpha_reading.replace("q", "キュー ")
                alpha_reading = alpha_reading.replace("r", "アール ")
                alpha_reading = alpha_reading.replace("s", "エス ")
                alpha_reading = alpha_reading.replace("t", "ティー ")
                alpha_reading = alpha_reading.replace("u", "ユー ")
                alpha_reading = alpha_reading.replace("v", "ブイ ")
                alpha_reading = alpha_reading.replace("w", "ダブリュー ")
                alpha_reading = alpha_reading.replace("x", "エックス ")
                alpha_reading = alpha_reading.replace("y", "ワイ ")
                alpha_reading = alpha_reading.replace("z", "ゼット ")

                tokens_reading = tokens_reading + " " + alpha_reading
            elif re.match('^[0-9]+$', token.base_form):
                numeric_reading = ""
                numeric_reading = token.base_form.replace("0", "ゼロ ")
                numeric_reading = numeric_reading.replace("1", "イチ ")
                numeric_reading = numeric_reading.replace("2", "ニ ")
                numeric_reading = numeric_reading.replace("3", "サン ")
                numeric_reading = numeric_reading.replace("4", "ヨン ")
                numeric_reading = numeric_reading.replace("5", "ゴ ")
                numeric_reading = numeric_reading.replace("6", "ロク ")
                numeric_reading = numeric_reading.replace("7", "ナナ ")
                numeric_reading = numeric_reading.replace("8", "ハチ ")
                numeric_reading = numeric_reading.replace("9", "キュー ")

                tokens_reading = tokens_reading + " " + numeric_reading.strip()

    tokens_reading = tokens_reading.strip()
    
    feature = tokens_w_space + " " + tokens_reading
    
    return feature

In [17]:
x_train_val['feature'] = x_train_val['text'].apply(lambda x: wakati_reading(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [18]:
x_train_val.head(5)

Unnamed: 0,text,label,feature
1053,ほうそうだいがくを聞きたい,3,ほう そう だい がく 聞き ホウ ソウ ダイ ガク キキ
176,ラジオにほんを再生して欲しいのですけど,5,ラジオ ほん 再生 し 欲しい の ラジオ ホン サイセイ シ ホシイ ノ
4111,松任谷由実の動画を youtube 再生して,17,松任谷 由実 動画 youtube 再生 し マツトウヤ ユミ ドウガ ワイ オー ...
3627,それじゃあ さようなら,12,それ さようなら ソレ サヨウナラ
1662,fm nhkを再生して欲しいんですけど,20,fm nhk 再生 し 欲しい ん エフ エム エヌ エイチ ケー サイセイ シ ホ...


In [19]:
feature = x_train_val[['feature']].values.flatten()

MAX_SEQUENCE_LENGTH = 2000 # Maximum number of words in a unique BI claims doc

tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(feature)
word_sequences = tokenizer.texts_to_sequences(feature)

word_index = tokenizer.word_index
print('Found {num} unique words.'.format(num=len(word_index)))

padded_word_inputs = pad_sequences(word_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 425 unique words.


In [20]:
padded_train_pd = pd.DataFrame(padded_word_inputs)
padded_train_pd.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,73,75,121,122,1,74,76,33,123,2
1,0,0,0,0,0,0,0,0,0,0,...,7,3,20,11,6,78,8,4,21,12
2,0,0,0,0,0,0,0,0,0,0,...,116,43,48,64,23,64,26,29,8,4


In [21]:
x_train, x_val, y_train, y_val, = train_test_split(padded_train_pd, 
                                                   to_categorical(x_train_val['label']), 
                                                   test_size = .2, 
                                                   random_state=12)

In [22]:
x_train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
2394,0,0,0,0,0,0,0,0,0,0,...,18,46,1,14,5,17,13,34,2,15
2944,0,0,0,0,0,0,0,0,0,0,...,3,20,6,31,22,5,34,8,4,21
1387,0,0,0,0,0,0,0,0,0,0,...,11,6,31,22,5,34,8,4,21,12


In [23]:
x_train.shape

(2692, 2000)

In [24]:
x_val.shape

(673, 2000)

In [25]:
x_test.shape

(842, 2)

In [26]:
embedding_layer = word2vec_jp_model.wv.get_keras_embedding(train_embeddings=True)

In [None]:
batch_size = 4

convs = []
filter_sizes = [3,4,5]

#Based on https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

for fsz in filter_sizes:
    l_conv = Conv1D(nb_filter=128,filter_length=fsz,activation='relu')(embedded_sequences)
    l_conv = Dropout(.25)(l_conv)
    l_pool = MaxPooling1D(5)(l_conv)
    l_pool = Dropout(.25)(l_pool)
    convs.append(l_pool)

l_merge = Concatenate()(convs)
l_merge = Dropout(.25)(l_merge)
l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
l_cov1 = Dropout(.25)(l_cov1)
l_pool1 = MaxPooling1D(5)(l_cov1)
l_pool1 = Dropout(.25)(l_pool1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_cov2 = Dropout(.25)(l_cov2)
l_pool2 = MaxPooling1D(30)(l_cov2)
l_pool2 = Dropout(.25)(l_pool2)
l_flat = Flatten()(l_pool2)
l_flat = Dropout(.25)(l_flat)
l_dense = Dense(128, activation='relu')(l_flat)
l_dense = Dropout(.25)(l_dense)
preds = Dense(21, activation='softmax')(l_dense)

model = Model(sequence_input, outputs=preds)
opt = SGD(lr = 0.01, momentum = 0.9)

model.compile(optimizer = opt, loss='binary_crossentropy', metrics=['acc'])

checkpointer = ModelCheckpoint(filepath = './model/model_v1.{epoch:02d}-{val_loss:.2f}.hdf5', verbose = 1, save_best_only = True)
csv_logger = CSVLogger('./model/model_v1.log')

reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2,
                  patience = 5, min_lr = 0.001)

class_weights = class_weight.compute_class_weight('balanced', np.unique(data['label']), data['label'])



In [None]:
history = model.fit(x=x_train.values, y=y_train, validation_data=(x_val.values, y_val),
          epochs=30, batch_size=batch_size, verbose = 0,
          callbacks = [reduce_lr, csv_logger, checkpointer])

In [None]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

plt.plot(epochs, acc, label='acc')
plt.plot(epochs, val_acc, label='val_acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.show()

plt.plot(epochs, loss, label='loss')
plt.plot(epochs, val_loss, label='val_loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, protocol=2)

save_object(tokenizer, './model/word_tokenizer.pkl')

In [None]:
from keras.models import load_model
import numpy as np
import pandas as pd
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import average_precision_score

model = load_model(filepath='./model/model_v1.26-0.19.hdf5')

In [None]:
def load_obj(filename):
    with open(filename, 'rb') as handler:
        return pickle.load(handler)

tokenizer = load_obj('./model/word_tokenizer.pkl')

In [None]:
x_test['feature'] = x_test['text'].apply(lambda x: wakati_reading(x))

feature = x_test[['feature']].values.flatten()
MAX_SEQUENCE_LENGTH = 2000
test_word_sequences = tokenizer.texts_to_sequences(feature)
padded_test_word_inputs = pad_sequences(test_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)

x_test.head(3)

In [None]:
test_predictions = model.predict(padded_test_word_inputs)
test_pred_pd = pd.DataFrame(test_predictions)
x_test = x_test.reset_index(drop=True)

test_result_pd = pd.concat([x_test, test_pred_pd], axis=1)
test_result_pd.columns = ['text', 'label', 'feature', 'prob_1', 'prob_2', 'prob_2', 'prob_3', 'prob_4', 'prob_5', 'prob_6', 'prob_7', 'prob_8', 'prob_9', 'prob_10', 'prob_11', 'prob_12', 'prob_13', 'prob_14', 'prob_15', 'prob_16', 'prob_17', 'prob_18', 'prob_19', 'prob_20']
test_result_pd.head(300)