<a href="https://colab.research.google.com/github/welting82/AIclass/blob/main/imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:
dataset

'/root/.keras/datasets/aclImdb.tar.gz'

In [3]:
import os
import glob
import pandas as pd
def getdata(mid):
    dn = os.path.dirname(dataset)
    posfn = glob.glob(os.path.join(dn, "aclImdb", mid, "pos", "*"))
    negfn = glob.glob(os.path.join(dn, "aclImdb", mid, "neg", "*"))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content":contents,
        "sentiment":[1] * len(posfn) + [0] * len(negfn)
    })
    return df

In [4]:
train_df = getdata("train")
test_df = getdata("test")

In [5]:
test_df

Unnamed: 0,content,sentiment
0,"Gary Cooper, (Michael Brandon) played the role...",1
1,"This film is a tapestry, a series of portraits...",1
2,i see there are great reviews of this film alr...,1
3,This film says everything there is to say abou...,1
4,Apparently this Australian film based on Nevil...,1
...,...,...
24995,"I like J-horror, anime and even kinda dig the ...",0
24996,"This movie is not worth seeing, at least not a...",0
24997,Jim Varney's first real movie is quite a delig...,0
24998,After reading the mostly glowing comments abou...,0


In [6]:
# Tokenize: 把你的詞變成數字
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])

In [7]:
# tok.word_index
# tok.index_word

In [8]:
# Sequence: 化成數字的序列
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])

In [9]:
INPUT_LENGTH = 512
INPUT_DIM = 3000
OUTPUT_DIM = 128

In [10]:
# pd.DataFrame(x_train_seq)
# Padding: 截長補短變成一樣長
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=INPUT_LENGTH)
x_test_pad = pad_sequences(x_test_seq, maxlen=INPUT_LENGTH)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,15,69,13,1,2103,353,7,7,1,1321,344,6,45,332,3,334,4,1,198,487,232,205,30,341,16,310,9,63,1,4,49,1294,18,8,1,127,6,181,1537,160
1,1,4,15,12,840,1975,2696,2,41,1,2753,5,60,10,67,61,350,5,10,1777,1,290,28,66,74,4,3,1730,873,10,89,121,614,51,2696,410,1,1481,26,147,...,14,73,14,26,59,69,187,1,4,1,28,109,98,26,200,36,3,2032,1481,6,21,167,5,78,87,98,49,8,138,3,418,1,61,28,5,6,1,144,1481,306
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,14,1631,91,2975,6,139,642,259,9,1074,9,201,1098,15,1,15,1,857,2,628,15,1,295,7,7,11,17,77,137,14,28,4,11,115,99,7,7,137,103,11
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,66,570,12,51,316,9,9,59,239,40,27,57,50,218,1740,6,3,49,2,218,17,42,3,17,10,262,313,67,354,42,240,4,3,232,49,17,21,75,30,29
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,166,9,875,5,5,410,1,106,1,110,1,281,405,44,5,137,721,2,1,62,130,196,1,153,78,40,1,205,289,2,156,6,157,60,163,11,28,57,50,305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3,17,14,3,765,1083,6,63,39,44,73,380,1269,504,196,1902,10,37,1512,18,24,106,6,181,28,851,1,115,170,4,1,19,5,69,23,305,1347,2,84,643
24996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,969,36,1809,2695,22,239,23,21,167,5,37,11,259,85,4,3,173,1,1,659,1388,8,9,7,7,634,199,11,17,3,672,4,339,379,43,4,3,611,155,379
24997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,65,2658,15,1114,150,39,139,35,12,147,12,504,48,2761,33,67,94,99,8,82,1938,33,178,770,1,275,384,69,22,786,242,36,1079,1861,2,42,15,126,202,49
24998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,41,9,824,90,1556,135,1,1485,7,7,1654,135,10,121,9,66,5,78,16,7,7,1,111,18,206,95,30,219,47,59,25,74,28,7,7,1562,174,1676,5,374


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout
layers = [
    # 3001(種詞) * 128(個情緒)
    Embedding(INPUT_DIM+1, OUTPUT_DIM, mask_zero=True, input_length=INPUT_LENGTH),
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.25),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 128)          384128    
                                                                 
 flatten (Flatten)           (None, 65536)             0         
                                                                 
 dense (Dense)               (None, 256)               16777472  
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 514       
                                                                 
Total params: 17,162,114
Trainable params: 17,162,114
Non-trainable params: 0
_________________________________________________________________


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
layers = [
    # 3001(種詞) * 128(個情緒)
    Embedding(INPUT_DIM+1, OUTPUT_DIM, mask_zero=True, input_length=INPUT_LENGTH),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 512, 128)          384128    
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
Total params: 384,386
Trainable params: 384,386
Non-trainable params: 0
_________________________________________________________________


In [13]:
# 一個輸出(二元分類): BinaryCrossEntropy p log 1/q + (1 - p) log 1/1-q
# 多個輸出(多元分類): CategoricalCrossEntropy pi log1/qi
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
              # "adam"也可以
              optimizer="adam",
              metrics=["accuracy"])

In [14]:
import numpy as np
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])

In [15]:
# batch_size: 看多少筆, 做一次梯度下降(幾10~幾100)
# epochs: 所有資料看幾輪(負責結束訓練)
# batch_size=200
# 一epochs: 54000 / 200 = 270(次梯度下降)
# verbose=0(quiet) 1(default) 2(no bar)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ModelCheckpoint("imdb.h5", save_best_only=True)
]
model.fit(x_train_pad,
          y_train,
          batch_size=200,
          epochs=100,
          validation_split=0.1,
          verbose=2,
          callbacks=callbacks)


Epoch 1/100
113/113 - 4s - loss: 0.6353 - accuracy: 0.6408 - val_loss: 0.5908 - val_accuracy: 0.7292 - 4s/epoch - 31ms/step
Epoch 2/100
113/113 - 1s - loss: 0.4683 - accuracy: 0.8256 - val_loss: 0.4507 - val_accuracy: 0.8100 - 1s/epoch - 11ms/step
Epoch 3/100
113/113 - 1s - loss: 0.3643 - accuracy: 0.8646 - val_loss: 0.3943 - val_accuracy: 0.8248 - 1s/epoch - 11ms/step
Epoch 4/100
113/113 - 1s - loss: 0.3138 - accuracy: 0.8806 - val_loss: 0.3316 - val_accuracy: 0.8580 - 1s/epoch - 11ms/step
Epoch 5/100
113/113 - 1s - loss: 0.2849 - accuracy: 0.8909 - val_loss: 0.3530 - val_accuracy: 0.8416 - 1s/epoch - 11ms/step
Epoch 6/100
113/113 - 1s - loss: 0.2652 - accuracy: 0.8983 - val_loss: 0.3192 - val_accuracy: 0.8640 - 1s/epoch - 11ms/step
Epoch 7/100
113/113 - 1s - loss: 0.2509 - accuracy: 0.9034 - val_loss: 0.3471 - val_accuracy: 0.8484 - 1s/epoch - 11ms/step
Epoch 8/100
113/113 - 1s - loss: 0.2405 - accuracy: 0.9074 - val_loss: 0.3581 - val_accuracy: 0.8436 - 1s/epoch - 11ms/step
Epoch 9/

<keras.callbacks.History at 0x7f5ad3e6b350>

In [16]:
model.evaluate(x_test_pad, y_test)



[0.30602484941482544, 0.8762000203132629]

In [17]:
l = [
    Embedding(INPUT_DIM+1, OUTPUT_DIM, mask_zero=True)
]
remain = model.layers[1:]
model_use = Sequential(l+remain)
model_use.layers[0].set_weights(model.layers[0].get_weights())
model_use.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 128)         384128    
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
Total params: 384,386
Trainable params: 384,386
Non-trainable params: 0
_________________________________________________________________


In [18]:
review = ""#@param {type:"string"}
review_seq = tok.texts_to_sequences([review])
proba = model_use.predict(review_seq)[0]
trans = ["neg", "pos"]
for p, sentiment in zip(proba, trans):
    print(sentiment, ":", p)

KeyError: ignored

In [None]:
# RNN -> 改進 LSTM, GRU
from tensorflow.keras.layers import SimpleRNN
layers = [
    # 3001(種詞) * 128(個情緒)
    Embedding(INPUT_DIM+1, OUTPUT_DIM, mask_zero=True, input_length=INPUT_LENGTH),
    # Embedding W: 128 * 64 + 記憶 W: 64 * 64 + bias: 64 = 12352
    SimpleRNN(64),
    Dense(2, activation="softmax")
]
rnn = Sequential(layers)
rnn.summary()

In [None]:
l = [
     Embedding(INPUT_DIM+1, OUTPUT_DIM),
     GlobalAveragePooling1D()
]
partial = Sequential(l)
partial.layers[0].set_weights(model.layers[0].get_weights())
partial.summary()

In [None]:
from scipy.spatial.distance import cosine
w1 = "This movie sucks"#@param {type:"string"}
w2 = "I hate this movie. It's totally a disaster."#@param {type:"string"}
seq1 = tok.texts_to_sequences([w1])
seq2 = tok.texts_to_sequences([w2])
em1 = partial.predict(seq1)[0]
em2 = partial.predict(seq2)[0]
dis = 1 - cosine(em1, em2)
print("cos相似度:", dis)