# Embedding 
- [IMDb Movie Review Dataset 影評分析](http://ai.stanford.edu/~amaas/data/sentiment/)
- 有時間關係的特徵抓取，建議使用 RNN

## step0: 載入資料並觀察

In [0]:
%tensorflow_version 2.x

In [0]:
# 下載資料並解壓縮檔案
# import os

# if not os.path.exists(filename):
#     print("資料集不存在，下載中...")
#     !wget --content-disposition 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
#     !tar zxvf aclImdb_v1.tar.gz
# else:
#     print("資料集已經下載過了:)")

In [0]:
# 透過 tensorflow，下載資料並解壓縮檔案
import os 
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True)

In [4]:
# 查看 dataset path
import glob

dataset_path = os.path.dirname(dataset)
print('資料集的路徑是：', dataset_path)

# 查看路徑內有什麼資料夾或檔案
glob.glob('/root/.keras/datasets/*')

資料集的路徑是： /root/.keras/datasets


['/root/.keras/datasets/aclImdb.tar.gz', '/root/.keras/datasets/aclImdb']

In [0]:
import pandas as pd

def get_data(dataset_path, filename):
    contents = []
    sentiment = []
    file_path = os.path.join(dataset_path, filename)

    pos_fn = os.path.join(file_path, "pos", "*.txt")
    for fn in glob.glob(pos_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(1)

    neg_fn = os.path.join(file_path, "neg", "*.txt")
    for fn in glob.glob(neg_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(0)

    df = pd.DataFrame({
        "content":contents,
        "sentiment":sentiment
    }, columns=["content", "sentiment"])

    return df 

In [6]:
dataset_path = os.path.dirname(dataset)
dataset_path = os.path.join(dataset_path, "aclImdb")
dataset_path

'/root/.keras/datasets/aclImdb'

In [7]:
train_df = get_data(dataset_path, "train")
train_df

Unnamed: 0,content,sentiment
0,I've just watched Roll and what a pleasure it ...,1
1,When Philo Vance (Edmund Lowe) is standing pre...,1
2,Henri Verneuil represented the commercial cine...,1
3,Another one of those films you hear about from...,1
4,I was totally engrossed in this film from the ...,1
...,...,...
24995,This is by the far worst piece of cr4p I've ev...,0
24996,What is the most harrowing movie ever made? Th...,0
24997,"We have all been asking ourselves ""why don't t...",0
24998,"I watched 'Envy' two nights ago, on DVD, at a ...",0


In [8]:
test_df = get_data(dataset_path, "test")
test_df

Unnamed: 0,content,sentiment
0,This movie gives you more of an idiea how Aust...,1
1,I was recently at a sleepover birthday party w...,1
2,I just got this movie for Christmas and have a...,1
3,Although the concept of a 32 year old woman po...,1
4,When I first saw Stella on comedy central I th...,1
...,...,...
24995,This movie is pretty predictable nuff said.......,0
24996,The film starts well enough. It is a truly ter...,0
24997,Some Plot Spoilers Ahead.<br /><br />The Nashv...,0
24998,I just recently viewed Shame which is directed...,0


## step1: 資料預處理

In [0]:
# 準備精選3000字詞典
from tensorflow.keras.preprocessing.text import Tokenizer

# 將資料向量化，並只保留最常出現的前 3000 個詞 (所以每個詞都是 3000 維度的向量)
tok = Tokenizer(num_words=3000)

# 類似 sklearn.feature_extraction.text.CountVectorizer 的 fit
tok.fit_on_texts(train_df["content"])

In [10]:
# 列出選擇的 3000 個詞，依照「出現次數」降序排列 
index2word = tok.index_word
word2index = tok.word_index

index2word

{1: 'the',
 2: 'and',
 3: 'a',
 4: 'of',
 5: 'to',
 6: 'is',
 7: 'br',
 8: 'in',
 9: 'it',
 10: 'i',
 11: 'this',
 12: 'that',
 13: 'was',
 14: 'as',
 15: 'for',
 16: 'with',
 17: 'movie',
 18: 'but',
 19: 'film',
 20: 'on',
 21: 'not',
 22: 'you',
 23: 'are',
 24: 'his',
 25: 'have',
 26: 'he',
 27: 'be',
 28: 'one',
 29: 'all',
 30: 'at',
 31: 'by',
 32: 'an',
 33: 'they',
 34: 'who',
 35: 'so',
 36: 'from',
 37: 'like',
 38: 'her',
 39: 'or',
 40: 'just',
 41: 'about',
 42: "it's",
 43: 'out',
 44: 'if',
 45: 'has',
 46: 'some',
 47: 'there',
 48: 'what',
 49: 'good',
 50: 'more',
 51: 'when',
 52: 'very',
 53: 'up',
 54: 'no',
 55: 'time',
 56: 'she',
 57: 'even',
 58: 'my',
 59: 'would',
 60: 'which',
 61: 'only',
 62: 'story',
 63: 'really',
 64: 'see',
 65: 'their',
 66: 'had',
 67: 'can',
 68: 'were',
 69: 'me',
 70: 'well',
 71: 'than',
 72: 'we',
 73: 'much',
 74: 'been',
 75: 'bad',
 76: 'get',
 77: 'will',
 78: 'do',
 79: 'also',
 80: 'into',
 81: 'people',
 82: 'other',
 8

In [11]:
# 對照著精選詞典，把每一篇文章的每一個詞照順序轉化成數字，
# 而不在精選詞典內的詞不會被列出，所以轉換過的數字只在 1~3000 之中
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq  = tok.texts_to_sequences(test_df["content"])

# 因為每篇文章詞數不一樣多，而最長篇的文章出現在精選3000字詞典的有 1816 個詞
# 所以總欄位(columns)有 1816 個，而其他文章不足 1816 就會補上 NaN
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1776,1777,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,204,40,293,1679,2,48,3,1737,9.0,676.0,43.0,5.0,27.0,236.0,63.0,43.0,10.0,255.0,543.0,162.0,2903.0,41.0,48.0,567.0,5.0,2306.0,466.0,1.0,223.0,4.0,1.0,19.0,60.0,407.0,6.0,3.0,173.0,4.0,250.0,52.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,51,6,2086,20,1,1284,4,3,309.0,749.0,1.0,540.0,681.0,2.0,40.0,41.0,5.0,1484.0,5.0,24.0,338.0,9.0,1237.0,1574.0,69.0,4.0,3.0,751.0,133.0,8.0,157.0,19.0,90.0,150.0,300.0,1.0,252.0,8.0,1414.0,8.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1,2148,434,8,2315,36,207,562,30.0,1.0,950.0,1048.0,2.0,628.0,976.0,901.0,2.0,2567.0,2932.0,4.0,600.0,2.0,179.0,322.0,1525.0,97.0,27.0,20.0,5.0,199.0,175.0,104.0,1153.0,631.0,4.0,718.0,20.0,2321.0,311.0,26.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,157,28,4,145,105,22,839,41,36.0,365.0,39.0,329.0,41.0,20.0,894.0,21.0,108.0,2553.0,8.0,11.0,28.0,10.0,97.0,64.0,40.0,41.0,282.0,130.0,162.0,1445.0,5.0,3.0,182.0,247.0,36.0,3.0,348.0,127.0,341.0,510.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,10,13,481,8,11,19,36,1,83.0,5.0,233.0,783.0,9.0,6.0,2095.0,321.0,16.0,772.0,4.0,218.0,2.0,201.0,367.0,2437.0,2.0,1.0,111.0,3.0,252.0,34.0,6.0,1631.0,20.0,31.0,365.0,2.0,56.0,32.0,30.0,154.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,11,6,31,1,227,246,415,4,204.0,123.0,107.0,8.0,58.0,110.0,9.0,1198.0,90.0,278.0,9.0,283.0,626.0,30.0,29.0,892.0,22.0,704.0,626.0,14.0,1291.0,2.0,1937.0,2683.0,2849.0,737.0,5.0,568.0,16.0,131.0,186.0,105.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,48,6,1,88,17,123,90,1,1722.0,4.0,2.0,1.0,4.0,1.0,325.0,209.0,4.0,310.0,20.0,1.0,314.0,143.0,249.0,15.0,12.0,4.0,1.0,19.0,1707.0,1068.0,12.0,65.0,5.0,1.0,49.0,1690.0,2062.0,47.0,6.0,61.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,72,25,29,74,2249,135,89,33,1031.0,1.0,1176.0,105.0,12.0,68.0,61.0,607.0,302.0,4.0,1.0,660.0,12.0,68.0,84.0,458.0,12.0,93.0,33.0,67.0,61.0,94.0,9.0,125.0,70.0,16.0,311.0,33.0,25.0,1.0,855.0,18.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,10,293,104,595,20,285,30,3,365.0,310.0,1.0,861.0,4.0,11.0,19.0,6.0,176.0,2434.0,715.0,325.0,2.0,1021.0,8.0,3.0,209.0,16.0,3.0,173.0,4.0,983.0,18.0,9.0,337.0,994.0,5.0,1640.0,10.0,293.0,9.0,16.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# 為了讓每篇文章詞數都一樣多，所以要「截長補短」
# 截長：預設評論前面都是廢話，所以這裡的截長是「由後往前(從NaN往前)」數 256 個詞 
# 補短：「0」就是補短的結果
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train_pad = pad_sequences(x_train_seq, maxlen=256, 
                            padding='pre', truncating='pre')
x_test_pad  = pad_sequences(x_test_seq, maxlen=256, 
                            padding='pre', truncating='pre')
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2137,2401,2,1,1559,9,200,21,25,1,663,349,4,3,359,2640,18,89,384,12,273,22,122,22,97,78,3,173,430,71,11,43,22,525,2588,9,49,250,690,155
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1414,8,60,6,41,5,306,80,831,134,109,7,7,196,2,2002,131,1023,30,585,31,2813,1123,1357,833,8,196,131,136,51,77,33,847,22,188,1,2633,4,84,2602
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,7,7,405,3,49,236,44,640,2232,2,26,6,70,31,1,82,153,10,423,335,1526,48,97,25,221,16,11,62,20,1,2847,4,1,2,1,1784,4,1311,8,31
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,255,543,1526,135,135,158,56,3,224,50,20,18,56,45,38,1003,14,9,502,43,7,7,40,3,475,19,1,61,152,10,89,73,37,41,9,10,101,6,1,422
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,901,36,5,938,1,538,195,15,249,1395,8,3,114,325,209,2,4,2471,858,871,185,1,4,3,84,19,42,1,429,4,19,359,63,490,5,94,18,40,188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,7,7,1,279,10,90,1289,2190,6,85,9,1174,46,2527,136,18,1,357,13,1554,169,16,2816,90,54,278,18,1,989,12,1026,91,430,71,1289,2190,2,2816,273,292
24996,4,726,8,19,7,7,54,10,121,48,13,88,1192,257,5,103,2062,830,866,1209,5,2004,397,24,9,6,138,14,11,12,76,7,7,2723,303,5,25,185,1,2001,...,5,103,21,1,1318,4,2487,1203,18,4,1259,2062,2604,8,3,1169,4,31,159,2128,31,3,1947,8,11,417,1,1179,3,1695,606,30,37,12,9,457,3,240,4,988
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,5,2334,1,360,243,8,1,275,36,1529,11,19,5,608,297,30,1,950,1048,1176,99,23,3,173,4,250,18,8,417,9,90,69,178,5,1,201,204,107,2,669
24998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,298,1,61,152,12,163,9,57,217,287,146,6,1367,14,1,1464,129,597,4,1,102,23,1383,1,1841,35,1524,42,751,2,6,12,610,466,1,19,421,5,27,160


In [13]:
# 補短範例
print('第一篇文章有', len(x_train_seq[0]), '個字，詳列如下：')
print(x_train_seq[0])
print()
print('文章詞數需有', len(x_train_pad[0]), '個字，若詞數不足就補「0」，詳列如下：')
print(x_train_pad[0])

第一篇文章有 119 個字，詳列如下：
[204, 40, 293, 1679, 2, 48, 3, 1737, 9, 676, 43, 5, 27, 236, 63, 43, 10, 255, 543, 162, 2903, 41, 48, 567, 5, 2306, 466, 1, 223, 4, 1, 19, 60, 407, 6, 3, 173, 4, 250, 52, 16, 3, 49, 1489, 4, 70, 102, 176, 32, 1066, 42, 343, 617, 55, 47, 23, 955, 4, 49, 1293, 466, 14, 70, 9, 77, 398, 22, 363, 1, 127, 82, 102, 5, 103, 43, 15, 23, 1, 481, 2137, 2401, 2, 1, 1559, 9, 200, 21, 25, 1, 663, 349, 4, 3, 359, 2640, 18, 89, 384, 12, 273, 22, 122, 22, 97, 78, 3, 173, 430, 71, 11, 43, 22, 525, 2588, 9, 49, 250, 690, 155]

文章詞數需有 256 個字，若詞數不足就補「0」，詳列如下：
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    

In [14]:
# 截長範例
print('最後一篇文章有', len(x_train_seq[24999]), '個字，詳列如下：')
print(x_train_seq[24999])
print()
print('我只取文章中的', len(x_train_pad[24999]), '個字，詳列如下：')
print(x_train_pad[24999])

最後一篇文章有 109 個字，詳列如下：
[607, 35, 42, 3, 361, 349, 19, 10, 340, 1, 85, 9, 13, 321, 8, 706, 371, 1, 113, 6, 1, 2436, 6, 1320, 1, 538, 269, 37, 9, 382, 36, 1693, 2, 204, 107, 50, 1075, 90, 36, 2195, 2, 1, 1520, 1224, 269, 37, 33, 340, 5, 20, 1, 153, 7, 7, 10, 235, 27, 499, 5, 131, 361, 349, 44, 47, 68, 46, 776, 1497, 44, 3, 17, 90, 32, 586, 5, 373, 3, 62, 18, 11, 1593, 4, 371, 926, 67, 4, 54, 111, 54, 144, 102, 2, 54, 42, 3, 529, 945, 7, 7, 2, 89, 54, 1027, 130, 222, 424, 161, 5, 2369]

我只取文章中的 256 個字，詳列如下：
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 

In [0]:
# 不事先對答案做 one-hot encoding
import numpy as np

y_train = np.array(train_df["sentiment"], dtype="int")
y_test  = np.array(test_df["sentiment"], dtype="int")

## step2: 訓練模型

### 建立模型：Sequential

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Flatten, Dropout

model = Sequential()
                                           # 注意！在 NLP 中「0」會保留給 padding
model.add(Embedding(input_dim=3001,        # 3000種詞 + padding 的「0」 = 3001種
                    input_length=256,      # input數量要一樣多，所以截長補短成 256個詞
                    output_dim=64,         # 64個語意/感受/情緒(詞向量)
                    mask_zero=True         # 不要把 padding 的「0」 視為一個特徵
                    ))                     # Param = 3001種 * 64個特徵 = 192064

model.add(Flatten())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.25))
model.add(Dense(2, activation="softmax"))  # 需對答案做 one-hot encoding
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 64)           192064    
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               2097280   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 2,289,602
Trainable params: 2,289,602
Non-trainable params: 0
_________________________________________________________________


### 確定模型訓練方式

In [0]:
# 若是 model.add(Dense(units=1, activation="sigmoid"))
# 則要 model.compile(loss="binary_crossentropy",

from tensorflow.keras.losses import SparseCategoricalCrossentropy

model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer="adam", 
              metrics=["accuracy"])

### 訓練模型

In [18]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

stop_callback = EarlyStopping(patience=3, restore_best_weights=True)
save_callback = ModelCheckpoint("sentiment.h5", save_best_only=True)

model.fit(x_train_pad, 
          y_train, 
          batch_size=200, 
          epochs=100,
          callbacks=[stop_callback, save_callback], 
          validation_split=0.1, 
          verbose=2)

Epoch 1/100
113/113 - 8s - loss: 0.5319 - accuracy: 0.7036 - val_loss: 0.3052 - val_accuracy: 0.8644
Epoch 2/100
113/113 - 8s - loss: 0.2061 - accuracy: 0.9226 - val_loss: 0.4187 - val_accuracy: 0.8232
Epoch 3/100
113/113 - 8s - loss: 0.0635 - accuracy: 0.9827 - val_loss: 0.3987 - val_accuracy: 0.8620
Epoch 4/100
113/113 - 8s - loss: 0.0134 - accuracy: 0.9982 - val_loss: 0.5651 - val_accuracy: 0.8392


<tensorflow.python.keras.callbacks.History at 0x7fe1ab7360f0>

## step3: 預測結果

In [19]:
pre = np.argmax(model.predict(x_test_pad), axis=-1)
pre

array([1, 1, 1, ..., 0, 1, 0])

In [20]:
# 查看文章情緒
from tensorflow.keras.models import Model

partial = Model(model.input, model.layers[0].output)

# 有 25000 篇文章，每篇文章有 256 個詞，每個詞有 64 個維度向量
print(partial.predict(x_test_pad).shape, end='\n\n')

# 取出第一篇文章 第一個詞 的 64 個維度向量
print(partial.predict(x_test_pad)[0][0], end='\n\n')
print('average:', np.average(partial.predict(x_test_pad)[0][0]))

(25000, 256, 64)

[-1.2825865e-02 -6.4815889e-04  2.7761883e-03 -3.1045147e-03
 -5.1833019e-03 -9.5896487e-04 -8.8673187e-03  1.3953440e-02
  1.3106449e-03 -5.8280807e-03  2.7724942e-03  2.4341168e-03
  1.1438569e-02 -2.4555363e-03 -4.9036276e-03 -4.1691237e-03
  4.3116082e-03  8.7011326e-03  5.0012954e-04 -1.2904203e-03
 -1.5947665e-03 -2.0286562e-02  7.5638518e-03 -7.0303921e-03
 -1.9455806e-03 -3.0703722e-03 -1.0551834e-02 -2.9572458e-03
 -1.0718222e-02  2.5127985e-04  4.7404263e-03 -2.4503481e-03
 -5.8259317e-03  4.3428801e-03  1.2995608e-02  1.1948263e-03
  2.5228920e-04  8.9350417e-03  5.6787264e-03  1.7431129e-03
 -4.5321067e-06  1.4274125e-04  5.7742591e-03  9.5780734e-03
 -2.5936381e-03  9.2259087e-03 -5.0408621e-03  6.3190600e-03
  4.5449268e-03 -7.6703844e-03  1.7432457e-02 -2.2657514e-02
 -1.3422180e-02 -7.5165974e-03  6.0356222e-04  6.4169877e-04
  8.1777247e-03 -6.1237663e-03 -2.0006513e-02 -3.7386448e-03
  7.6322290e-03  9.6208951e-04  9.3710162e-03 -5.9041660e-03]

aver

## step4: 驗證模型

In [21]:
model.evaluate(x_test_pad, y_test)



[0.3199652135372162, 0.8610399961471558]

## 使用者輸入「詞」，將「詞」轉成「詞向量」

In [22]:
inference = Sequential()
inference.add(Embedding(input_dim=3001,
                        output_dim=64))

# 取得之前訓練模型的權值/參數
weight = model.layers[0].get_weights()

# 注意！input_dim 和 output_dim 要一樣才能複製來使用
inference.set_weights(weight)
inference.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          192064    
Total params: 192,064
Trainable params: 192,064
Non-trainable params: 0
_________________________________________________________________


In [23]:
word = input('請輸入你想轉換的詞：')
data = [[word2index[word]]]
print('詞向量：\n', inference.predict(data))

請輸入你想轉換的詞：crazy
詞向量：
 [[[ 0.01763171  0.01192511 -0.01953395 -0.0548807  -0.00266308
    0.03334697 -0.01998972  0.04573987 -0.01516189 -0.01411523
    0.02756156  0.03669238  0.06408543  0.01398283  0.0363889
   -0.05363119 -0.02154821 -0.02394863 -0.04626471  0.03656537
   -0.01906935  0.04671087 -0.03088018  0.03764979 -0.01196658
   -0.0173284  -0.02821687  0.02960247 -0.00713556 -0.0148444
    0.04095268 -0.00499463 -0.01437375  0.00365028  0.03259805
   -0.0355837   0.04672607 -0.02771533 -0.03293008 -0.01716172
    0.01952889  0.0560222  -0.01333134 -0.02280795  0.04179186
   -0.04853619 -0.05014278  0.05536589 -0.0343472   0.00256625
    0.00060062  0.02330355 -0.00849306 -0.0053474   0.04064495
    0.01033968  0.03769073 -0.0177048   0.00709884 -0.02200131
    0.03126921 -0.03826933  0.00071242 -0.02230404]]]
