In [5]:
from tensorflow.keras.datasets import imdb
import numpy as np

# data loading (most frequent 10k words only)
(train_data, train_lables), (test_data, test_lables) = imdb.load_data(num_words=10000)

# word & index dictionary mapping
word_index = imdb.get_word_index() # type = dict

# reversed dictionary mapping
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Decoding the review
# 0 -> padding, 1 -> starts of sequence, 2 -> unknown
decoded_review = " ".join([reverse_word_index.get(i-3, "?") for i in train_data[819]])

print(decoded_review)

? centered in the downtown and out ? of detroit this comedy i found to be a terrific new comedic duo ? pat ? is a very funny man who happens to be a cop from japan on the trail of an industrial secrets thief who has stolen a ? ? ? super ? reluctantly he goes to the united states to follow the thief after being ordered by his commander ? character ? with ? ? character a fast ? but down to business player type detroit cop when they cross paths though the honorable ? of japan meet the all out old school detroit police ? ? the two stumble and trip over each other at first but then develop a ? that turns into an explosive two layered ? team that ? the case cold after battling a city crime boss for the stolen ? and closing the case these two go from ? each other to being friends and working well together a little worse for wear and in need of an extended vacation on top of it all they manage to come to a ? closing i rated this a 9 ? direction ? this a near perfect comedy fun for all ages i r

>> ? centered in the downtown and out ? of detroit this comedy i found to be a terrific new comedic duo ? pat ? is a very funny man who happens to be a cop from japan on the trail of an industrial secrets thief who has stolen a ? ? ? super ? reluctantly he goes to the united states to follow the thief after being ordered by his commander ? character ? with ? ? character a fast ? but down to business player type detroit cop when they cross paths though the honorable ? of japan meet the all out old school detroit police ? ? the two stumble and trip over each other at first but then develop a ? that turns into an explosive two layered ? team that ? the case cold after battling a city crime boss for the stolen ? and closing the case these two go from ? each other to being friends and working well together a little worse for wear and in need of an extended vacation on top of it all they manage to come to a ? closing i rated this a 9 ? direction ? this a near perfect comedy fun for all ages i recommend it highly

NN의 input dimension은 항상 고정되어 있어야 합니다. 그러나 지금 input 으로 들어오는 text data의 경우 len(text data)의 값이 모두 다르기 때문에, text data를 크기가 고정된 무언가로 변환해야하는데, 아래에 정의된 vectorize_sequences() 가 이 역할을 수행해줍니다.
먼저 shape이 (len(sequences), dimension) 인 zero matrix 를 만든 후, 각각의 txt data에서 해당 단어가 존재하는 위치를 1이 되도록 만들어줍니다.

In [6]:
# Vectorizing function (One-Hot Encoding)
# Transform a review to a vector data
def vectorize_sequences(sequences, dimension=10000):
    # initializing the storing space
    # Zero matrix of shape (len(sequences), dimension)
    results = np.zeros((len(sequences), dimension))
    for i, sequences in enumerate(sequences):
        # set specific indices of results[i] to 1
        results[i, sequences] = 1

    return results

In [7]:
# plot_loss & plot_acc definition
import matplotlib.pyplot as plt

def plot_acc(h, title='accuracy'):
    plt.plot(h.history['accuracy'])
    plt.plot(h.history['val_accuracy'])
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Training', 'Validation'], loc=0)

def plot_loss(h, title='loss'):
    plt.plot(h.history['loss'])
    plt.plot(h.history['val_loss'])
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Training', 'Validation'], loc=0)
# Vectorize
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# re-format the data type (float data type as an inputs of MLP)
# y = positive(1) of negative(0)
y_train =np.asarray(train_lables).astype("float32")
y_test =np.asarray(test_lables).astype("float32")

In [8]:
word_index_of_X = train_data[819][0:5]
print(word_index_of_X)
print([reverse_word_index.get(i-3, "?") for i in word_index_of_X])
print(x_train[819][word_index_of_X[1]])
print(x_train[819][word_index_of_X[3]])

[1, 4012, 11, 4, 9403]
['?', 'centered', 'in', 'the', 'downtown']
1.0
1.0


In [13]:
# Modeling
from tensorflow.keras import models, layers
from tensorflow.keras import optimizers, losses, metrics

model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid')) # output is 0 or 1


# model.complie(optimizer='rmsprop', loss='binary_crossentropy',
#               metrics=['accuracy'])

# customizing learning rate
model.compile(optimizer=optimizers.RMSprop(lr=0.001), loss='binary_crossentropy',
              metrics=['accuracy'])
# test validation split (10000 examples)
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
# model learning
history = model.fit(partial_x_train, partial_y_train, 
                    epochs=20, batch_size=512, validation_data=(x_val, y_val))

# model evaluation
results = model.evaluate(x_test, y_test)
print(results)

  super(RMSprop, self).__init__(name, **kwargs)


Epoch 1/20


2023-04-09 06:24:52.983028: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-04-09 06:24:54.441059: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[0.751843273639679, 0.8512799739837646]
 56/782 [=>............................] - ETA: 2s

2023-04-09 06:25:10.356840: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




학습을 마치고 결과를 확인하기 위해서는 keras package 안에 있는 model.predict() 를 사용하면 됩니다.
25000개의 test dataset(x_test) 에 대응하는 결과로 긍정(1) 또는 부정(0)을 나타내는 25000개의 output dataset으로 표현되는 것을 확인했습니다.

In [19]:
# model prediction
predict_results = model.predict(x_test)
print(len(predict_results))
print(predict_results)

25000
[[0.00504884]
 [0.99999976]
 [0.99662364]
 ...
 [0.00290497]
 [0.01071857]
 [0.690046  ]]


819번째 리뷰를 제가 직접 읽어봤을 때 "fun", "recommend", "perfect comedy" 라는 단어가 있는 것으로 보아 긍정적인 리뷰라고 판단했습니다.
모델이 예측한 결과를 확인하기 위해 model.predixt(x_test) 를 predict_results 라는 새로운 변수에 할당시키고, 819번째 결과값을 출력했습니다.
결과는 0.9999989 로, 모델이 predict 한 값도 마찬가지로 긍정적인 리뷰라고 판단했으며 제가 생각한 결과와 일치하는 것을 확인했습니다.

In [20]:
# X th text sample output
print(predict_results[819])

[0.9999989]


In [None]:
# visualization
plot_loss(history)
plt.show()
# plt.savefig('chapter2-1.loss.png')

plt.clf()
plot_acc(history)
plt.show()
# plt.savefig('chapter2-1.accuracy.png')
plt.clf()

In [None]:
# early stopping

from tensorflow.keras.callbacks import EarlyStopping

history = model.fit(partial_x_train, partial_y_train, 
                    epochs=20, batch_size=512, validation_data=(x_val, y_val), 
                    callbacks = [EarlyStopping(monitor='val_loss', patience=1)])

# visualization
plot_loss(history)
plt.show()
# plt.savefig('chapter2-1.loss.png')

plt.clf()
plot_acc(history)
plt.show()
# plt.savefig('chapter2-1.accuracy.png')
plt.clf()