In [None]:
from tensorflow import keras

# Number of samples per batch, shared by every split loaded below.
batch_size = 32

# Build one batched dataset per split directory.
train_ds = keras.utils.text_dataset_from_directory(
    directory='aclImdb/train',
    batch_size=batch_size,
)
val_ds = keras.utils.text_dataset_from_directory(
    directory='aclImdb/val',
    batch_size=batch_size,
)
test_ds = keras.utils.text_dataset_from_directory(
    directory='aclImdb/test',
    batch_size=batch_size,
)

# Keep only the first element of each (text, label) pair, giving a
# text-only view of the training data.
text_only_train_ds = train_ds.map(lambda text, label: text)

In [None]:
from tensorflow.keras import layers

# Length passed as output_sequence_length (tokens per vectorized sample).
max_length = 600
# Upper bound on the vocabulary size.
max_tokens = 20000

# Text -> integer-id vectorizer; 'int' mode emits one integer per token.
text_vectorization = layers.TextVectorization(
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# Fit the vectorizer's state on the text-only training data.
text_vectorization.adapt(data=text_only_train_ds)

In [None]:
def _vectorize_text(text, label):
    """Vectorize the text with `text_vectorization`; the label passes through unchanged."""
    return text_vectorization(text), label


# Apply the same mapping to every split, with 4 parallel calls each.
int_train_ds = train_ds.map(_vectorize_text, num_parallel_calls=4)
int_val_ds = val_ds.map(_vectorize_text, num_parallel_calls=4)
int_test_ds = test_ds.map(_vectorize_text, num_parallel_calls=4)

In [None]:
import tensorflow as tf

In [None]:
# Model definition: integer token ids -> one-hot vectors -> bidirectional LSTM -> sigmoid.
inputs = keras.Input(shape=(None,), dtype='int64')
# Expand every token id into a one-hot vector of width max_tokens.
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(units=32))(embedded)
x = layers.Dropout(rate=0.5)(x)
outputs = layers.Dense(units=1, activation='sigmoid')(x)
model = keras.Model(inputs=inputs, outputs=outputs)

# Compile for binary classification, tracking accuracy.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary.
model.summary()

In [None]:
# Checkpoint to disk, overwriting only when the monitored quantity improves.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath='one_hot_bidir_lstm.h5',
        save_best_only=True,
    ),
]

# Train for 10 epochs, validating on the vectorized validation split.
model.fit(
    x=int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks,
)

In [None]:
# Reload the checkpoint written during training and score it on the test split.
model = keras.models.load_model(filepath='one_hot_bidir_lstm.h5')
# evaluate() returns [loss, accuracy] for this compile setup; index 1 is accuracy.
# The original notebook noted 0.88 next to this line.
print(
    f'테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}'
)

## 임베딩(Embedding)
: 고차원의 희소한 데이터(예: 원-핫 벡터)를 저차원의 밀집 벡터로 변환하는 기술

In [None]:
# Embedding layer: turns categorical inputs (here, integer token ids) into
# dense, continuous vectors.
embedding_layer = layers.Embedding(
    input_dim=max_tokens,  # number of distinct token ids
    output_dim=256,        # size of each embedding vector
)

In [None]:
# Note on the shape argument: `(None)` is just `None` (parentheses alone do not
# make a tuple), whereas `(None,)` is a 1-tuple, i.e. one axis of unspecified length.
inputs = keras.Input(shape=(None,), dtype='int64')

# input_dim = maximum number of tokens, output_dim = embedding vector size.
embedded = layers.Embedding(
    input_dim=max_tokens,
    output_dim=256,
)(inputs)

x = layers.Bidirectional(layers.LSTM(units=32))(embedded)
x = layers.Dropout(rate=0.5)(x)
outputs = layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Checkpoint to disk, overwriting only when the monitored quantity improves.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath='embeddings_bidir_lstm.h5',
        save_best_only=True,
    ),
]

# Train for 10 epochs, validating on the vectorized validation split.
model.fit(
    x=int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks,
)

In [None]:
# Reload the checkpoint written during training and score it on the test split.
model = keras.models.load_model(filepath='embeddings_bidir_lstm.h5')
# evaluate() returns [loss, accuracy] for this compile setup; index 1 is accuracy.
print(
    f'테스트 정확도 : {model.evaluate(int_test_ds)[1]:.3f}'
)

In [None]:
# Embedding layer with masking enabled.
# Masking: when processing sequence data, marks specific tokens to be ignored so
# they do not influence training or prediction.
inputs = keras.Input(shape=(None,), dtype='int64')

# mask_zero=True: tokens whose value is 0 in the input sequence are ignored.
embedded = layers.Embedding(
    input_dim=max_tokens,
    output_dim=256,
    mask_zero=True,
)(inputs)

x = layers.Bidirectional(layers.LSTM(units=32))(embedded)
x = layers.Dropout(rate=0.5)(x)
outputs = layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Checkpoint to disk, overwriting only when the monitored quantity improves.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        filepath='embedding_bidir_lstm_with_masking.h5',
        save_best_only=True,
    ),
]

# Train for 10 epochs, validating on the vectorized validation split.
model.fit(
    x=int_train_ds,
    validation_data=int_val_ds,
    epochs=10,
    callbacks=callbacks,
)

In [None]:
# Reload the checkpoint written during training and score it on the test split.
model = keras.models.load_model(filepath='embedding_bidir_lstm_with_masking.h5')
# evaluate() returns [loss, accuracy] for this compile setup; index 1 is accuracy.
print(
    f'테스트 정확도: {model.evaluate(int_test_ds)[1]:.3f}'
)

In [None]:
import requests
import zipfile
import os

# Source URL of the GloVe archive and the local filename it is saved under.
url = "http://nlp.stanford.edu/data/glove.6B.zip"
zip_file = 'golve.6B.zip'

# Stream the download to disk in 1 MiB chunks instead of buffering the whole
# response body in memory. The timeout stops a stalled connection from hanging
# the cell, and raise_for_status() stops an HTTP error page from being saved
# as if it were the archive.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()
    with open(zip_file, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

# Unpack every archive member into the current working directory.
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Delete the downloaded archive. The existence check keeps this cell safe to
# re-run: a second run no longer raises FileNotFoundError once the file is gone.
if os.path.exists(zip_file):
    os.remove(zip_file)

# GloVe : 전역 동시 등장 행렬 분해(SVD/LSA 계열)와 지역 문맥 윈도 기반 방식(skip-gram)의 장점을 결합한 단어 임베딩 모델

In [None]:
import tensorflow as tf