# 用RNN做情意分析

In [18]:
%env KERAS_BACKEND=tensorflow

env: KERAS_BACKEND=tensorflow


In [2]:
%matplotlib widget
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [63]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, History
from keras.datasets import imdb # 讀入 IMDB 電影數據庫
#fix loading imdb dataset problem: https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded-when-allow-pickle-false-for-imdb-loa

In [46]:
total_num_words = 10000 #自然語言處理領域的問題中決定要用到多少字
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=total_num_words)

### 送入神經網路的輸入處理

老師建議：雖然 RNN 是可以處理不同長度的輸入, 在寫程式時還是要

* 設輸入文字長度的上限
* 把每段文字都弄成一樣長, 太短的後面補上 0

In [53]:
#找出長度最長的影評
vectorized_len = np.vectorize(len)
length_of_each_comment = vectorized_len(x_train)
pad_until_length = round(np.amax(length_of_each_comment)/8,0).astype(int)

In [54]:
x_train_ped = sequence.pad_sequences(x_train, maxlen=pad_until_length)
x_test_ped = sequence.pad_sequences(x_test, maxlen=pad_until_length)

# 決定神經網路架構

* 將 10000 維的文字壓到 O 維 by "word embedding" ；避免用1-hot 處理 1 萬個字造成用 1 萬維的向量表示：浪費記憶空間
* 然後用 O 個 LSTM
* 最後一個 output, 直接用 sigmoid 送出

In [55]:
model = Sequential()
model.add(Embedding(total_num_words, 64)) #目標壓縮成多少維度
model.add(LSTM(50)) #幾個神經元
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 64)          640000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                23000     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 51        
Total params: 663,051
Trainable params: 663,051
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.fit(x_train_ped, y_train,
         batch_size=32,
         epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2092b4b3550>

## 檢視結果

### 預測情形

In [58]:
score = model.evaluate(x_test_ped, y_test)



In [25]:
print('測試資料的 loss', score[0])
print('測試資料的正確率', score[1])

測試資料的 loss 0.8484413851189614
測試資料的正確率 0.85104


In [64]:
plt.plot(model.history["acc"])
plt.title("training accuracy")
plt.ylabel("accuracy")
plt.xlabel("epoch")
plt.show()

TypeError: 'History' object is not subscriptable

In [None]:
plt.plot(model.history["val_acc"],color="yellow")
plt.title("testing accuracy")
plt.ylabel("accuracy")
plt.xlabel("epoch")
plt.show()

### 原始評論

In [37]:
word_to_id = imdb.get_word_index()
word_to_id={k:v for k,v in word_to_id.items()} #(v+3-1)
word_to_id["<PAD>"] = 0
#word_to_id["<START>"] = 1
#word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}

def translate_and_show_comment(comment_i, comments, dictionary):
    print(' '.join(dictionary[id] for id in comments[comment_i] ))

In [40]:
#translate_and_show_comment(10,x_test,id_to_word)
id_to_word[0]

'<PAD>'

## 儲存結果

In [19]:
modelfilename = 'wk10_RNN_model_for_imdb.h5'
#model.save(modelfilename)
from keras.models import load_model
model = load_model(modelfilename)