In [1]:
from utils import *
from models import *
import pandas as pd
from tqdm import tqdm_notebook
from keras.utils import to_categorical
import numpy as np

import re
import scipy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD

Using TensorFlow backend.


In [2]:
# Committee name -> integer class id; ids follow the order of the tuple below.
classify_dict = {
    committee: class_id
    for class_id, committee in enumerate(
        ('本会議', '厚生労働委員会', '国土交通委員会', '予算委員会')
    )
}

In [3]:
# Both files are tab-separated (despite the .csv extension) with a header row.
df_train = pd.read_csv('./train.csv', sep='\t', header=0)
df_test = pd.read_csv('./test.csv', sep='\t', header=0)

# Show the first rows and the row count of each split.
display(df_train.head())
print('train_size: %i' % df_train.shape[0])

display(df_test.head())
print('test_size: %i' % df_test.shape[0])

Unnamed: 0,houses,committee,speaker,date,speech_order,speech_text
0,衆議院,厚生労働委員会,丹羽秀樹,20161021,1,これより会議を開きます。厚生労働関係の基本施策に関する件について調査を進めます。この際、お諮...
1,衆議院,厚生労働委員会,丹羽秀樹,20161021,2,御異議なしと認めます。よって、そのように決しました。
2,衆議院,厚生労働委員会,丹羽秀樹,20161021,3,質疑の申し出がありますので、順次これを許します。田村憲久君。
3,衆議院,厚生労働委員会,田村憲久,20161021,4,おはようございます。自民党の田村憲久でございます。きょうは、大臣への質疑、所信に対する質疑と...
4,衆議院,厚生労働委員会,鈴木俊彦,20161021,5,お答え申し上げます。御存じのように、年金は、将来年金を受給いたします現在の若い方たちが現在年...


train_size: 71529


Unnamed: 0,houses,committee,speaker,date,speech_order,speech_text
0,衆議院,厚生労働委員会,高鳥修一,20180427,1,これより会議を開きます。開会に先立ちまして、立憲民主党・市民クラブ、希望の党・無所属クラブ、...
1,衆議院,厚生労働委員会,高鳥修一,20180427,2,速記を起こしてください。理事をして再度御出席を要請させましたが、立憲民主党・市民クラブ、希望...
2,衆議院,厚生労働委員会,高鳥修一,20180427,3,御異議なしと認めます。よって、そのように決しました。
3,衆議院,厚生労働委員会,高鳥修一,20180427,4,質疑の申出がありますので、順次これを許します。船橋利実君。
4,衆議院,厚生労働委員会,船橋利実,20180427,5,おはようございます。自由民主党の船橋利実でございます。二期目復帰をさせていただきまして、初め...


test_size: 45266


# 前処理を実施する
mecabを用いた分かち書きを実施する。

In [None]:
# Analyzer used by the tokenisation cells below. The dictionary path is an
# absolute, machine-specific location (looks like mecab-ipadic-neologd);
# adjust it to the local MeCab install.
ma = Mecab_Analysis(
    dic_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd',
)

In [None]:
# Parts of speech whose tokens are emitted as field y[-3] instead of y[0].
# This tuple was previously first assigned only in a later cell, so running
# the notebook top to bottom raised NameError here.
# NOTE(review): y[-3] is presumably the base (dictionary) form in the
# Mecab_Analysis output -- confirm against utils.
inflection_tapple = ('動詞', '形容詞')
# Tuple used to drop everything except nouns, verbs and adjectives.
inflection_accept = ('名詞', '動詞', '形容詞')


def _to_wakati(text):
    """Return `text` as one space-separated string of the accepted tokens.

    Tokens whose part of speech (y[1]) is not in `inflection_accept` are
    skipped; tokens in `inflection_tapple` contribute y[-3], all others y[0].
    """
    return ' '.join(
        y[-3] if y[1] in inflection_tapple else y[0]
        for y in ma.Morphological_Analysis(text)
        if y[1] in inflection_accept
    )


train_text = df_train['speech_text'].values.tolist()
test_text  = df_test['speech_text'].values.tolist()

df_train_ma = pd.DataFrame([_to_wakati(x) for x in tqdm_notebook(train_text)],
                           columns=['speech_text_ma'])
df_test_ma  = pd.DataFrame([_to_wakati(x) for x in tqdm_notebook(test_text)],
                           columns=['speech_text_ma'])

# Column-wise concat aligns on the index, so both frames must share the same
# row index as the freshly built token frames (default 0..n-1).
df_train = pd.concat((df_train, df_train_ma), axis=1)
df_test = pd.concat((df_test, df_test_ma), axis=1)

display(df_train.head())
display(df_test.head())
# The token frames are now part of df_train / df_test; release the temporaries.
del df_train_ma
del df_test_ma

In [None]:
# Tokenised speeches as plain Python lists (input to the vectorizer).
x_train_text = df_train['speech_text_ma'].tolist()
x_test_text = df_test['speech_text_ma'].tolist()

# Integer class id per speech, looked up from its committee name.
# An unknown committee name raises KeyError.
y_train = [classify_dict[committee] for committee in df_train['committee'].tolist()]
y_test = [classify_dict[committee] for committee in df_test['committee'].tolist()]

In [None]:
# Train documents first, test documents after; `train_size` marks the boundary
# that later cells use to split the feature matrix again.
corpus = x_train_text + x_test_text
train_size = len(x_train_text)

# Learn the vocabulary from the training documents only so that no
# information from the test set leaks into the features, then encode the
# whole corpus with that fixed vocabulary.
cv = CountVectorizer()
cv.fit(x_train_text)
wc = cv.transform(corpus)

# Same rule for the IDF weights: estimate them on the training rows only.
ttf = TfidfTransformer()
ttf.fit(wc[:train_size])
tfidf = ttf.transform(wc)
print(tfidf.shape)

In [None]:
# Reduce the TF-IDF matrix to 4096 dense components.
# random_state pins the randomized solver so re-runs give the same features;
# the projection is learned from the training rows only (no test leakage)
# and then applied to every row.
svd = TruncatedSVD(n_components=4096, n_iter=3, random_state=0)
svd.fit(tfidf[:train_size])
tfidf_svd = svd.transform(tfidf)

# Rows were stacked as train followed by test, so split at train_size.
x_train = tfidf_svd[:train_size,:]
x_test = tfidf_svd[train_size:,:]
print(x_train.shape)
print(x_test.shape)
# Fraction of the (training) variance retained by the kept components.
print(np.sum(svd.explained_variance_ratio_))

In [None]:
# Build one-hot label vectors for the neural network.
# Train and test ids are encoded together so both get the same number of
# columns, then split back at train_size.
y_labels_one_hot = to_categorical(y_train + y_test)
y_train, y_test = y_labels_one_hot[:train_size], y_labels_one_hot[train_size:]
del y_labels_one_hot

In [None]:
# Hold out the last 20% of the training rows for validation.
# The split is positional (no shuffling), so it follows the row order of x_train.
train_len = int(x_train.shape[0] * 0.8)
x_train_nn, x_valid_nn = x_train[:train_len], x_train[train_len:]
y_train_nn, y_valid_nn = y_train[:train_len], y_train[train_len:]

In [None]:
# Network sized from the data: input width = number of SVD components,
# output width = number of committees.
# NOTE(review): the keyword is spelled `dropuout` here; presumably it matches
# the parameter name in Simplenn's signature -- verify in models.
sn = Simplenn(
    x_train.shape[1],
    len(classify_dict),
    hidden_size=[512,256],
    batch_size=32,
    learning_rate=0.001,
    epochs=5,
    dropuout=0.2,
)
# Same value as computed by the split cell; kept so this cell leaves the
# namespace in the same state as before.
train_len = int(x_train.shape[0] * 0.8)

# Fit on the training slice with the validation slice, then evaluate on test.
sn.train(x_train_nn, y_train_nn, x_valid_nn, y_valid_nn)
sn.test(x_test, y_test)

# 以下の条件を設ける

1. 苗字と名前の間などに全角スペースが入っているケースがあるので全角スペースを取り除く  
2. 数字や人名を適当なものに置き換える。  
→数字は全て1に置き換える。人名については一律「佐村河内守」に置き換える。  
3. 日付については意味をなさないので全て適当なものに置き換える。  
→日付については「1年1月1日」に置き換える。 
4. stopwordを設ける

In [None]:
# Re-create the analyzer and pass both frames through dalete_text_ma_column
# (presumably removes the 'speech_text_ma' column added by the previous
# experiment -- confirm in utils) before tokenising again.
ma = Mecab_Analysis(dic_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
df_train, df_test = dalete_text_ma_column(df_train, df_test)

inflection_tapple = ('動詞', '形容詞')
# Tuple used to drop everything except nouns, verbs and adjectives.
inflection_accept = ('名詞', '動詞', '形容詞')
train_text = df_train['speech_text'].values.tolist()
test_text  = df_test['speech_text'].values.tolist()

# Full-width spaces are stripped before morphological analysis; each kept
# token is passed through translate_word and the results are space-joined.
df_train_ma = pd.DataFrame(
    [
        ' '.join([
            translate_word(morph)
            for morph in ma.Morphological_Analysis(re.sub(r'　','',speech))
            if morph[1] in inflection_accept
        ])
        for speech in tqdm_notebook(train_text)
    ],
    columns=['speech_text_ma'],
)
df_test_ma = pd.DataFrame(
    [
        ' '.join([
            translate_word(morph)
            for morph in ma.Morphological_Analysis(re.sub(r'　','',speech))
            if morph[1] in inflection_accept
        ])
        for speech in tqdm_notebook(test_text)
    ],
    columns=['speech_text_ma'],
)

# Attach the token column side by side with the original columns.
df_train = pd.concat((df_train, df_train_ma), axis=1)
df_test = pd.concat((df_test, df_test_ma), axis=1)

display(df_train.head())
display(df_test.head())
# Temporaries are no longer needed once merged.
del df_train_ma
del df_test_ma

In [None]:
# Tokenised speeches as plain Python lists (input to the vectorizer).
x_train_text = df_train['speech_text_ma'].tolist()
x_test_text = df_test['speech_text_ma'].tolist()

# Integer class id per speech; an unknown committee name raises KeyError.
y_train = [classify_dict[committee] for committee in df_train['committee'].tolist()]
y_test = [classify_dict[committee] for committee in df_test['committee'].tolist()]

In [None]:
# Train documents first, test documents after; `train_size` marks the boundary
# that later cells use to split the feature matrix again.
corpus = x_train_text + x_test_text
train_size = len(x_train_text)

# Terms in more than half of the documents or in fewer than 3 documents are
# dropped. The vocabulary (and therefore these document-frequency thresholds)
# is learned from the training documents only so the test set cannot leak
# into the features; the whole corpus is then encoded with that vocabulary.
cv = CountVectorizer(max_df=0.5, min_df=3)
cv.fit(x_train_text)
wc = cv.transform(corpus)

# Same rule for the IDF weights: estimate them on the training rows only.
ttf = TfidfTransformer()
ttf.fit(wc[:train_size])
tfidf = ttf.transform(wc)
print(tfidf.shape)

In [None]:
# Reduce the TF-IDF matrix to 4096 dense components.
# random_state pins the randomized solver so re-runs give the same features;
# the projection is learned from the training rows only (no test leakage)
# and then applied to every row.
svd = TruncatedSVD(n_components=4096, n_iter=3, random_state=0)
svd.fit(tfidf[:train_size])
tfidf_svd = svd.transform(tfidf)

# Rows were stacked as train followed by test, so split at train_size.
x_train = tfidf_svd[:train_size,:]
x_test = tfidf_svd[train_size:,:]
print(x_train.shape)
print(x_test.shape)
# Fraction of the (training) variance retained by the kept components.
print(np.sum(svd.explained_variance_ratio_))

In [None]:
# Build one-hot label vectors for the neural network.
# Encoding train and test ids in one call gives both the same column count.
y_labels_one_hot = to_categorical(y_train + y_test)
y_train = y_labels_one_hot[:train_size]
y_test = y_labels_one_hot[train_size:]
del y_labels_one_hot

In [None]:
# Positional 80/20 split of the training rows into fit and validation parts
# (no shuffling, so it follows the row order of x_train).
train_len = int(x_train.shape[0] * 0.8)
x_train_nn, y_train_nn = x_train[:train_len], y_train[:train_len]
x_valid_nn, y_valid_nn = x_train[train_len:], y_train[train_len:]

In [None]:
# Same network configuration as the baseline run, now fed with the features
# built from the rule-based preprocessing.
# NOTE(review): `dropuout` is the keyword as written here; presumably it
# matches Simplenn's signature -- verify in models.
sn = Simplenn(
    x_train.shape[1],
    len(classify_dict),
    hidden_size=[512,256],
    batch_size=32,
    learning_rate=0.001,
    epochs=5,
    dropuout=0.2,
)
# Same value as computed by the split cell; kept so the namespace is unchanged.
train_len = int(x_train.shape[0] * 0.8)

# Fit with validation, then evaluate on the test set.
sn.train(x_train_nn, y_train_nn, x_valid_nn, y_valid_nn)
sn.test(x_test, y_test)

# 前後の文脈も考慮する

In [None]:
# Re-create the analyzer and pass both frames through dalete_text_ma_column
# (presumably removes the previous 'speech_text_ma' column -- confirm in utils).
ma = Mecab_Analysis(dic_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd')
df_train, df_test = dalete_text_ma_column(df_train, df_test)

inflection_tapple = ('動詞', '形容詞')
# Tuple used to drop everything except nouns, verbs and adjectives.
inflection_accept = ('名詞', '動詞', '形容詞')
train_text = df_train['speech_text'].values.tolist()
test_text  = df_test['speech_text'].values.tolist()

# Morphological analysis runs on text with full-width spaces removed; kept
# tokens go through translate_word and are joined with single spaces.
df_train_ma = pd.DataFrame(
    [
        ' '.join(
            translate_word(token)
            for token in ma.Morphological_Analysis(re.sub(r'　','',text))
            if token[1] in inflection_accept
        )
        for text in tqdm_notebook(train_text)
    ],
    columns=['speech_text_ma'],
)
df_test_ma = pd.DataFrame(
    [
        ' '.join(
            translate_word(token)
            for token in ma.Morphological_Analysis(re.sub(r'　','',text))
            if token[1] in inflection_accept
        )
        for text in tqdm_notebook(test_text)
    ],
    columns=['speech_text_ma'],
)

# Attach the token column next to the original columns.
df_train = pd.concat((df_train, df_train_ma), axis=1)
df_test = pd.concat((df_test, df_test_ma), axis=1)

display(df_train.head())
display(df_test.head())
# Temporaries are no longer needed once merged.
del df_train_ma
del df_test_ma

In [None]:
# chunk_size is forwarded to create_chunk_dataset and reused below for the
# vectorizer's min_df. NOTE(review): presumably the number of neighbouring
# speeches merged on each side of a speech -- confirm in utils.
chunk_size = 15
x_train_text, y_train = create_chunk_dataset(
    df_train,
    chunk_size=chunk_size,
)
x_test_text, y_test = create_chunk_dataset(
    df_test,
    chunk_size=chunk_size,
)

In [None]:
# Train documents first, test documents after; `train_size` marks the boundary
# that later cells use to split the feature matrix again.
corpus = x_train_text + x_test_text
train_size = len(x_train_text)

# Terms in more than half of the documents, or in fewer than
# chunk_size*2+1 documents, are dropped. The vocabulary (and therefore these
# document-frequency thresholds) is learned from the training documents only
# so the test set cannot leak into the features; the whole corpus is then
# encoded with that vocabulary.
cv = CountVectorizer(max_df=0.5, min_df=chunk_size*2+1)
cv.fit(x_train_text)
wc = cv.transform(corpus)

# Same rule for the IDF weights: estimate them on the training rows only.
ttf = TfidfTransformer()
ttf.fit(wc[:train_size])
tfidf = ttf.transform(wc)
print(tfidf.shape)

In [None]:
# Display the third chunked test document to eyeball what
# create_chunk_dataset produced.
x_test_text[2]

In [None]:
# Reduce the TF-IDF matrix to 4096 dense components.
# random_state pins the randomized solver so re-runs give the same features;
# the projection is learned from the training rows only (no test leakage)
# and then applied to every row.
svd = TruncatedSVD(n_components=4096, n_iter=3, random_state=0)
svd.fit(tfidf[:train_size])
tfidf_svd = svd.transform(tfidf)

# Rows were stacked as train followed by test, so split at train_size.
x_train = tfidf_svd[:train_size,:]
x_test = tfidf_svd[train_size:,:]
print(x_train.shape)
print(x_test.shape)
# Fraction of the (training) variance retained by the kept components.
print(np.sum(svd.explained_variance_ratio_))

In [None]:
# Build one-hot label vectors for the neural network.
# NOTE(review): the labels returned by create_chunk_dataset are discarded
# here and rebuilt with one label per DataFrame row, while the features were
# built from the chunked text lists. That is only valid if chunking keeps one
# sample per row in the same order -- confirm in utils.
y_train = [classify_dict[x] for x in df_train['committee'].values.tolist()]
y_test  = [classify_dict[x] for x in df_test['committee'].values.tolist()]

# Fail fast instead of silently splitting the labels at the wrong position
# when the chunked sample counts differ from the row counts.
assert len(y_train) == train_size, (
    'label/feature mismatch: %i train labels vs %i train samples'
    % (len(y_train), train_size)
)
assert len(y_test) == len(x_test_text), (
    'label/feature mismatch: %i test labels vs %i test samples'
    % (len(y_test), len(x_test_text))
)

# Encode train and test ids together so both get the same number of columns.
y_labels = y_train + y_test
y_labels_one_hot = to_categorical(y_labels)
y_train = y_labels_one_hot[:train_size]
y_test = y_labels_one_hot[train_size:]
del y_labels
del y_labels_one_hot

In [None]:
# Positional 80/20 split of the training rows (no shuffling): the leading
# part is used for fitting, the trailing part for validation.
train_len = int(x_train.shape[0] * 0.8)
x_train_nn, x_valid_nn = x_train[:train_len], x_train[train_len:]
y_train_nn, y_valid_nn = y_train[:train_len], y_train[train_len:]

In [None]:
# Same architecture as the earlier runs, but with dropuout=0.9 instead of 0.2.
# NOTE(review): if this value is a drop probability, 0.9 is unusually
# aggressive; if it is a keep probability it is mild -- confirm how Simplenn
# interprets it. The keyword spelling `dropuout` presumably matches the
# signature in models -- verify.
sn = Simplenn(
    x_train.shape[1],
    len(classify_dict),
    hidden_size=[512,256],
    batch_size=32,
    learning_rate=0.001,
    epochs=5,
    dropuout=0.9,
)
# Same value as computed by the split cell; kept so the namespace is unchanged.
train_len = int(x_train.shape[0] * 0.8)

# Fit with validation, then evaluate on the test set.
sn.train(x_train_nn, y_train_nn, x_valid_nn, y_valid_nn)
sn.test(x_test, y_test)