# 用RNN做情意分析

In [18]:
%env KERAS_BACKEND=tensorflow

env: KERAS_BACKEND=tensorflow


In [2]:
%matplotlib widget
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.datasets import imdb # 讀入 IMDB 電影數據庫
#fix loading imdb dataset problem: https://stackoverflow.com/questions/55890813/how-to-fix-object-arrays-cannot-be-loaded-when-allow-pickle-false-for-imdb-loa

In [21]:
# Vocabulary size. In NLP problems the number of distinct words to model is
# chosen up front; here it is passed to imdb.load_data as num_words.
total_num_words = 10000
train_split, test_split = imdb.load_data(num_words=total_num_words)
x_train, y_train = train_split
x_test, y_test = test_split

### 送入神經網路的輸入處理

老師建議：雖然 RNN 是可以處理不同長度的輸入, 在寫程式時還是要

* 設輸入文字長度的上限
* 把每段文字都弄成一樣長, 太短的補上 0（`pad_sequences` 預設補在前面, 要補在後面需指定 `padding='post'`）

In [22]:
# Find the length of the longest review in the training set; it is used as
# the common length every review gets padded to.
# len() is applied row by row instead of through np.vectorize(len) so that the
# cell can be re-run after x_train has been rebound to the padded 2-D array:
# np.vectorize(len) would then call len() on single integers and raise
# TypeError, whereas len(row) simply yields the padded length.
length_of_each_comment = np.array([len(comment) for comment in x_train])
pad_until_length = int(length_of_each_comment.max())

In [23]:
# Bring every review in both splits to the same length (pad_until_length).
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=pad_until_length)
    for reviews in (x_train, x_test)
)

# 決定神經網路架構

* 將 10000 維的文字壓到 128 維 by "word embedding" ；避免用 1-hot 處理 1 萬個字造成用 1 萬維的向量表示：浪費記憶空間
* 然後用 150 個 LSTM 神經元
* 最後一個 output, 直接用 sigmoid 送出

In [9]:
# Network: word embedding -> LSTM -> a single sigmoid output.
model = Sequential([
    # Map each of the total_num_words word ids to a 128-dimensional vector.
    Embedding(total_num_words, 128),
    # Recurrent layer with 150 units.
    LSTM(150),
    # One output unit with a sigmoid activation.
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 150)               167400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 151       
Total params: 1,447,551
Trainable params: 1,447,551
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Train on the padded training reviews: mini-batches of 32, 5 epochs.
# NOTE(review): the recorded output of this cell shows "Epoch n/15", so it
# appears to come from a run with a different epochs setting; re-run the cell
# to refresh it.
model.fit(x_train, y_train, batch_size=32, epochs=5)

Instructions for updating:
Use tf.cast instead.
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x208cefd3048>

## 檢視結果

### 預測情形

In [24]:
# Evaluate the model on the test split. The next cell reads score[0] as the
# loss and score[1] as the accuracy.
score = model.evaluate(x=x_test, y=y_test)



In [25]:
# Report the test-set loss (score[0]) and accuracy (score[1]).
for label, position in (('測試資料的 loss', 0), ('測試資料的正確率', 1)):
    print(label, score[position])

測試資料的 loss 0.8484413851189614
測試資料的正確率 0.85104


### 原始評論

In [None]:
# Build word <-> id lookup tables that line up with the ids in x_train/x_test.
# imdb.load_data was called with its defaults, which shift every raw index
# from get_word_index() up by index_from=3 and reserve 0 (padding),
# 1 (start of sequence) and 2 (out-of-vocabulary word).
# `imdb` is the module imported at the top of the notebook; the bare name
# `keras` is never imported, so keras.datasets.imdb would raise NameError.
INDEX_FROM = 3
word_to_id = {
    word: raw_index + INDEX_FROM
    for word, raw_index in imdb.get_word_index().items()
}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
# Reverse mapping used to turn encoded reviews back into text.
id_to_word = {value: key for key, value in word_to_id.items()}

def translate_and_show_comment(comment_i, comments, dictionary):
    """Print one encoded review as space-separated words.

    Args:
        comment_i: index into `comments` of the review to show.
        comments: indexable collection of reviews, each an iterable of keys
            of `dictionary`.
        dictionary: mapping from each key to the string printed for it.

    Raises:
        KeyError: if the review contains a key missing from `dictionary`.
    """
    words = [dictionary[token] for token in comments[comment_i]]
    print(' '.join(words))

## 儲存結果

In [19]:
import os

from keras.models import load_model

# Persist / restore the model.
# If a saved model file already exists it is loaded and replaces `model`
# (same as before). If it does not exist, the in-memory model is saved instead
# of crashing inside load_model, so the notebook survives a fresh run.
modelfilename = 'wk10_RNN_model_for_imdb.h5'
if os.path.exists(modelfilename):
    model = load_model(modelfilename)
else:
    model.save(modelfilename)