In [1]:
import pandas as pd
import string
import re
import tensorflow as tf
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

In [2]:
amazon_reviews_df = pd.read_csv('amazon_reviews_multilingual_JP_v1_00.tsv', sep='\t')

In [3]:
amazon_reviews_df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [4]:
mapping = {1:0, 2:0, 4:1, 5:1}
amazon_reviews_star1245_df = amazon_reviews_df[amazon_reviews_df.star_rating != 3]
amazon_reviews_star1245_df.star_rating = amazon_reviews_star1245_df.star_rating.map(mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [5]:
amazon_reviews_star1245_df[['star_rating','review_body']]

Unnamed: 0,star_rating,review_body
0,0,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…
1,0,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…
2,1,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。
3,1,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...
4,1,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...
...,...,...
262251,0,とても面白いゲームなのに、一度でも終了すると二度と起動出来なくなる。<br />再インストー...
262252,1,ゆっくり時間を掛けてあれこれ進路を考えながら配置するのは結構楽しい作業。<br />なかなか...
262253,1,A very candid analisys as to how the WWI start...
262254,1,Could hardly put this novel down. Was sorry it...


In [6]:
def filter_by_ascii_rate(text, threshold=0.9):
    ascii_letters = set(string.printable)
    rate = sum(c in ascii_letters for c in text) / len(text)
    return rate <= threshold

In [7]:
is_jp = amazon_reviews_star1245_df.review_body.apply(filter_by_ascii_rate)
amazon_reviews_star1245_df = amazon_reviews_star1245_df[is_jp]

In [8]:
amazon_reviews_star1245_df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,65317,R33RSUD4ZTRKT7,B000001GBJ,957145596,SONGS FROM A SECRET GARDE,Music,0,1,15,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…,2012-12-05
1,JP,65317,R2U1VB8GPZBBEH,B000YPWBQ2,904244932,鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel),Music,0,4,20,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…,2012-12-05
2,JP,65696,R1IBRCJPPGWVJW,B0002E5O9G,108978277,Les Miserables 10th Anniversary Concert,Music,1,2,3,N,Y,ドリームキャスト,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。,2013-03-02
3,JP,67162,RL02CW5XLYONU,B00004SRJ5,606528497,It Takes a Nation of Millions to Hold Us Back,Music,1,6,9,N,Y,やっぱりマスト,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...,2013-08-11
4,JP,67701,R2LA2SS3HU3A3L,B0093H8H8I,509738390,Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...,PC,1,2,4,N,Y,コスパ的には十分,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...,2013-02-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262249,JP,49068132,R1TFY1E6ZW2T5N,B000002IOJ,377805808,Practice What You Preach,Music,1,4,33,N,N,メントならこれから,通称「メント」。ポストメタリカの3Mの一角を担うバンド。#1.Practice What Y...,2003-01-13
262250,JP,49068171,R19UWXLR290D3X,B008RBFBZS,819159265,バルセロナ(スペシャル・エディション)(DVD付),Music,1,0,0,N,Y,やっぱりフレディはスゴイ,久しぶりにフレディを聞きたくなり購入。さまざまなご意見はありますが、ロックとオペラの融合、そ...,2014-11-24
262251,JP,49068176,R2M96OZ1BUXN0Q,B007VGXS8M,428248709,タイムトラベラーウォーリーをおえ！™ (Kindle Tablet Edition),Mobile_Apps,0,5,6,N,Y,起動できなくなる,とても面白いゲームなのに、一度でも終了すると二度と起動出来なくなる。<br />再インストー...,2013-07-14
262252,JP,49068176,R2OEBHIJOW83UZ,B00AB7IA1S,132300013,Kings Can Fly,Mobile_Apps,1,1,1,N,Y,パズル好きにはいい,ゆっくり時間を掛けてあれこれ進路を考えながら配置するのは結構楽しい作業。<br />なかなか...,2013-07-14


In [9]:
grouped = amazon_reviews_star1245_df.groupby('star_rating')

In [10]:
amazon_reviews_star1245_df = grouped.head(500)

In [11]:
Y = amazon_reviews_star1245_df.star_rating.values

In [12]:
X_raw = amazon_reviews_star1245_df.review_body.values

In [42]:
amazon_reviews_star1245_df

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,65317,R33RSUD4ZTRKT7,B000001GBJ,957145596,SONGS FROM A SECRET GARDE,Music,0,1,15,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。ケルト音楽の範疇にも幅があるのですね…,2012-12-05
1,JP,65317,R2U1VB8GPZBBEH,B000YPWBQ2,904244932,鏡の中の鏡‾ペルト作品集(SACD)(Arvo Part:Spiegel im Spiegel),Music,0,4,20,N,Y,残念ながら…,残念ながら…趣味ではありませんでした。正直退屈…眠気も起きない…,2012-12-05
2,JP,65696,R1IBRCJPPGWVJW,B0002E5O9G,108978277,Les Miserables 10th Anniversary Concert,Music,1,2,3,N,Y,ドリームキャスト,素晴らしいパフォーマンス。ミュージカル映画版の物足りない歌唱とは違います。,2013-03-02
3,JP,67162,RL02CW5XLYONU,B00004SRJ5,606528497,It Takes a Nation of Millions to Hold Us Back,Music,1,6,9,N,Y,やっぱりマスト,専門的な事を言わずにお勧めレコメを書きたいのですが、文才が無いので無理でした。ヒップホップが...,2013-08-11
4,JP,67701,R2LA2SS3HU3A3L,B0093H8H8I,509738390,Intel CPU Core I3-3225 3.3GHz 3MBキャッシュ LGA1155...,PC,1,2,4,N,Y,コスパ的には十分,今までの環境（Core2 Duo E4600)に比べれば十分に快適になりました。<br />...,2013-02-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5356,JP,8003845,R2EVH4JYYPZR0V,B00942S4OY,994624246,TAKE ME HOME,Music,0,1,18,N,Y,好きなファンには悪いけど…,聴くに耐えない!!アルバム3枚全部共に!!歌、バックの楽器、全て完成度の低さに耐えられない。,2014-04-25
5357,JP,8003845,R37TNQSJ2TLX00,B00F0N61AU,20401760,Midnight Memories,Music,0,7,38,N,Y,うるさくて、耳障りな音楽!!,歌は下手、バックの楽器もうるさい!お子供の音楽ですね。ジャニーズと変わらない!良いところが全...,2014-04-25
5358,JP,8003845,RYE96NS1DPU8M,B00BCZUAE8,552668905,The 20/20 Experience (Deluxe Version),Music,0,3,23,N,Y,かなりのパクリですね。,フランク・ザッパを知らない若い方が多いと思いますが、ザッパの手法を真似ています。手法を取り入...,2014-06-09
5359,JP,8003845,R37Q97GW1P4GGS,B00E1LQ7Q0,338299933,The 20/20 Experience,Music,0,0,5,N,N,1よりは良いけど…,フランク・ザッパ色が少しなくなっているので、まだ良いのかも!?<br />自分の才能を見せつ...,2014-06-09


In [14]:
def clean_html(html, strip=False):
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text(strip=strip)
    return text

In [15]:
t = Tokenizer(wakati=True)

In [16]:
def tokenize(text):
    return t.tokenize(text)

In [17]:
def preprocess_dataset(texts):
    texts = [clean_html(text) for text in texts]
    texts = [' '.join(tokenize(text)) for text in texts]
    return texts

In [18]:
X_wakati = preprocess_dataset(X_raw)

In [39]:
len(X_wakati)

1000

In [20]:
x_train, x_test, y_train, y_test = train_test_split(X_wakati, Y,
                                                    test_size=0.2,
                                                    random_state=42)

In [21]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = 4000, oov_token = '<UNK>')
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)

In [22]:
x_test = tokenizer.texts_to_sequences(x_test)

In [45]:
x_train = pad_sequences(x_train, maxlen = 300, truncating='post')
x_train_2 = x_train.copy()

In [24]:
x_test = pad_sequences(x_test, maxlen = 300, truncating='post')

In [52]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM, concatenate
input_dim = 40000
output_dim = 1
emb_dim=300
hid_dim=100
input_unit = Input(shape=(None,), name='input')
embedding = Embedding(input_dim = input_dim,
                      output_dim = emb_dim,
                      mask_zero = True,
                      trainable = True,
                      name = 'embedding')(input_unit)
lstm = LSTM(hid_dim,
            dropout = 0.2,
            recurrent_dropout = 0.4,
            activation = 'tanh',
            name = 'lstm')(embedding)

input_unit_2 = Input(shape=(None,), name='input_2')
embedding_2 = Embedding(input_dim = input_dim,
                      output_dim = emb_dim,
                      mask_zero = True,
                      trainable = True,
                      name = 'embedding_2')(input_unit_2)
lstm_2 = LSTM(hid_dim,
            dropout = 0.2,
            recurrent_dropout = 0.4,
            activation = 'tanh',
            name = 'lstm_2')(embedding_2)
concatenated = concatenate([lstm, lstm_2], axis = -1)
output_unit = Dense(output_dim, activation = 'sigmoid')(concatenated)

model = Model(inputs = [input_unit, input_unit_2], outputs = output_unit)
    

In [53]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    12000000    input[0][0]                      
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    12000000    input_2[0][0]                    
____________________________________________________________________________________________

In [54]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [55]:
callbacks = [
    EarlyStopping(patience=3),
]

In [56]:
model.fit(x = [x_train, x_train_2],
          y = y_train,
          batch_size = 128,
          epochs = 100,
          validation_split = 0.2,
          callbacks = callbacks,
          shuffle = True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<tensorflow.python.keras.callbacks.History at 0x7f588c4aa700>

In [29]:
x_test

array([[  0,   0,   0, ...,   1, 148,  17],
       [  0,   0,   0, ...,  47,  20,  57],
       [  0,   0,   0, ...,  15, 185,   3],
       ...,
       [  0,   0,   0, ..., 721,  14,   3],
       [  0,   0,   0, ...,  15,   1,  35],
       [  0,   0,   0, ...,   1, 275,   3]], dtype=int32)

In [30]:
y_pred = model.predict(x_test)

In [31]:
import numpy as  np
y_pred = np.argmax(y_pred, 1)

In [32]:
y_test.shape

(200,)

In [33]:
y_pred.shape

(200,)

In [34]:
print('precision: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
print('recall   : {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
print('f1       : {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))

precision: 0.8400
recall   : 0.6495
f1       : 0.7326


In [35]:
text = "言うほど悪くないと思うけど"
text_wakati = [' '.join(t.tokenize(text))]
text_labeling = tokenizer.texts_to_sequences(text_wakati)
text_padding = pad_sequences(text_labeling, maxlen = 300, truncating='post')
print(text_wakati)
print(text_labeling)
text_pred = model.predict(text_padding)
print(text_pred)

['言う ほど 悪く ない と 思う けど']
[[197, 176, 530, 17, 11, 132, 76]]
[[0.58022195 0.4539495 ]]


In [36]:
x_train.shape

(800, 300)

In [37]:
x_test.shape

(200, 300)

In [43]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         12000000  
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 2)                 202       
Total params: 12,160,602
Trainable params: 12,160,602
Non-trainable params: 0
_________________________________________________________________


In [44]:
y_train.shape

(800,)