In [1]:
import warnings
# Install a blanket 'ignore' filter: no Python warning is shown after this point.
# NOTE(review): this also hides deprecation warnings from keras/tensorflow — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import models
from keras import layers

In [3]:
import tensorflow_hub as hub

In [4]:
import numpy as np
import matplotlib.pyplot as plt

In [5]:
import os

In [6]:
from google.colab import drive

In [7]:
# Mount Google Drive at /content/drive; the IMDB archive is unzipped from a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. IMDB Dataset

### 1) Load IMDB Dataset

In [None]:
!unzip /content/drive/MyDrive/Colab\ Notebooks/datasets/IMDB.zip

### 2) 'texts' and 'labels' Data
- 'texts': 문자열 리스트(영화 감상평)
- 'labels': 감상평 레이블(부정: 0, 긍정: 1)

In [8]:
# Read every training review into `texts` and its sentiment into `labels`
# (0 = negative, 1 = positive). Both lists stay index-aligned.
imdb_dir = 'aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
  dir_name = os.path.join(train_dir, label_type)
  # The label depends only on the directory, so compute it once per directory.
  label = 0 if label_type == 'neg' else 1
  # os.listdir returns entries in arbitrary order; sorting makes the sample
  # order (and everything derived from it) reproducible across machines.
  for fname in sorted(os.listdir(dir_name)):
    if fname.endswith('.txt'):
      # `with` closes the file even if read() raises (e.g. on a decoding error).
      with open(os.path.join(dir_name, fname), encoding='utf8') as f:
        texts.append(f.read())
      labels.append(label)

In [9]:
# Sanity check: there must be exactly one label per review.
len(labels), len(texts)

(25000, 25000)

## 2. Tensor Transformation

### 1) X_train and X_valid
- vectorization
  - data: (25000, 2000) → X_train: (20000, 2000), X_valid: (5000, 2000)

In [10]:
# Vocabulary-size and sequence-length settings reused by later cells
max_words = 10000   # passed to the Tokenizer as num_words
maxlen = 2000       # target length given to pad_sequences

# Learn the vocabulary from the raw reviews, then encode every review
# as a list of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [11]:
# Word -> integer index mapping learned by the tokenizer
word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
print('고유 토큰 수: ' , n_unique_tokens)

고유 토큰 수:  88582


In [12]:
# Pad the index sequences to a common length of `maxlen`
# and convert the label list into a NumPy array.
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)

data_shape, labels_shape = data.shape, labels.shape
print('데이터 텐서 크기: ' , data_shape)
print('레이블 텐서 크기: ' , labels_shape)

데이터 텐서 크기:  (25000, 2000)
레이블 텐서 크기:  (25000,)


In [13]:
# Shuffle reviews and labels with one shared permutation so each review keeps
# its label. The generator is seeded so that the train/validation split taken
# from these arrays is identical after every Restart & Run All; the previous
# unseeded np.random.shuffle produced a different split on each run.
SEED = 42
rng = np.random.default_rng(SEED)
indices = rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

In [14]:
# train_validation_split
training_samples = 20000
validation_samples = 5000

X_train = data[:training_samples]
y_train = labels[:training_samples]
X_valid = data[training_samples : training_samples + validation_samples]
y_valid = labels[training_samples : training_samples + validation_samples]

## 3. Word2Vec

### 1) Load pretrained Word2Vec

In [15]:
# Load the pretrained Wiki-words-250 module from TensorFlow Hub.
# Despite its name, `embedding_index` is a callable, not a dict: it is called
# with a list of strings and returns their embedding vectors.
embedding_index = hub.load('https://tfhub.dev/google/Wiki-words-250/2')

In [16]:
# Smoke test: embedding a single word yields a (1, 250) float32 tensor.
embedding_index(['apple'])

<tf.Tensor: shape=(1, 250), dtype=float32, numpy=
array([[-7.81319588e-02, -7.97718316e-02,  1.65636316e-02,
        -1.08001232e-02, -5.10360440e-03,  1.73767412e-03,
        -5.22104278e-02, -3.89753021e-02,  3.56903672e-02,
        -3.47909741e-02, -1.01491222e-02,  1.17565657e-03,
         1.01802059e-01,  1.75360963e-02,  3.36469710e-02,
         2.79656947e-02,  9.57141817e-02, -7.82085657e-02,
         5.06314561e-02, -1.66016668e-01,  2.88206208e-02,
         6.76635057e-02,  9.70917642e-02,  1.79236010e-02,
        -5.42766303e-02, -1.56506345e-01, -5.30809052e-02,
        -1.09254161e-03, -1.59554277e-02, -6.70691356e-02,
         6.15172908e-02,  4.47090678e-02,  4.07696068e-02,
        -3.83969024e-02,  6.96176291e-02, -5.60147781e-03,
        -2.26747449e-02, -3.67878452e-02, -5.66431917e-02,
        -2.18681507e-02, -8.86453837e-02, -2.22746611e-01,
        -2.07957737e-02,  8.30694276e-04, -3.93166617e-02,
         5.93367852e-02,  4.33623493e-02,  2.82799695e-02,
      

### 2) 임베딩 행렬 생성
- (10000, 250)

In [17]:
# Build the (max_words, embedding_dim) matrix whose row i holds the pretrained
# vector of the word with tokenizer index i. Rows that receive no word
# (e.g. row 0, which word_index never uses) stay all-zero.
embedding_dim = 250
embedding_matrix = np.zeros((max_words, embedding_dim))

# Only indices below max_words fit in the matrix, so select those words first.
# The previous loop called the hub model once per vocabulary entry, including
# words whose vector was then discarded, and tested `is not None` on a result
# that is always a tensor.
kept_words = []
kept_rows = []
for word, i in word_index.items():
  if i < max_words:
    kept_words.append(word)
    kept_rows.append(i)

# One batched lookup returns a (len(kept_words), embedding_dim) tensor.
if kept_words:
  embedding_matrix[kept_rows] = np.asarray(embedding_index(kept_words))

In [18]:
# Expect (max_words, embedding_dim).
embedding_matrix.shape

(10000, 250)

## 4. Keras Embedding Modeling

### 1) Model Define
- 모델 신경망 구조 정의
  - Embedding Dimension: 250

In [19]:
# Embedding -> LSTM -> Dropout -> single sigmoid unit (binary sentiment output)
imdb = models.Sequential([
    layers.Embedding(input_dim=max_words,
                     output_dim=embedding_dim,
                     input_length=maxlen),
    layers.LSTM(units=16),
    layers.Dropout(rate=0.5),
    layers.Dense(units=1, activation='sigmoid'),
])

In [20]:
# Load the pretrained vectors into the embedding layer and keep them
# fixed during training.
embedding_layer = imdb.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [21]:
# Print the layer-by-layer architecture with trainable / non-trainable parameter counts.
imdb.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2000, 250)         2500000   
_________________________________________________________________
lstm (LSTM)                  (None, 16)                17088     
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 2,517,105
Trainable params: 17,105
Non-trainable params: 2,500,000
_________________________________________________________________


In [22]:
# Binary classification setup: Adam optimizer, cross-entropy loss, accuracy metric
imdb.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

In [23]:
%%time

hist_imdb = imdb.fit(X_train, y_train,
                     epochs = 100,
                     batch_size = 512,
                     validation_data = (X_valid, y_valid))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### 2) Visualization

In [None]:
# Per-epoch training vs. validation accuracy curves
history = hist_imdb.history
epochs = range(1, len(history['accuracy']) + 1)

fig, ax = plt.subplots(figsize=(9, 6))
for metric_name in ('accuracy', 'val_accuracy'):
  ax.plot(epochs, history[metric_name])
ax.set_title('Training & Validation Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend(['Training Accuracy' , 'Validation Accuracy'])
ax.grid()
plt.show()

### 3) Model Evaluate
- Loss & Accuracy

In [None]:
# Loss and accuracy of the trained model on the validation split
loss, accuracy = imdb.evaluate(X_valid, y_valid)

report = '\n'.join(['Loss = {:.5f}'.format(loss),
                    'Accuracy = {:.5f}'.format(accuracy)])
print(report)