In [7]:
# 벡터화(텍스트를 벡터로 변환)
import string

class Vectorizer:
    """Minimal pure-Python word-level text vectorizer.

    Text is lowercased, stripped of ``string.punctuation`` characters and split
    on whitespace. Each distinct token is mapped to an integer index; index 0 is
    the empty-string token and index 1 is the out-of-vocabulary token.

    A new instance starts with only those two reserved entries, so ``encode``
    and ``decode`` are usable (everything maps to ``[UNK]``) even before
    ``make_vocabulary`` has been called.
    """

    # Reserved vocabulary entries shared by encode() and decode().
    PAD_TOKEN = ""
    PAD_INDEX = 0
    UNK_TOKEN = "[UNK]"
    UNK_INDEX = 1

    def __init__(self):
        self._reset_vocabulary()

    def _reset_vocabulary(self):
        """Reset both lookup tables to the two reserved entries."""
        self.vocabulary = {
            self.PAD_TOKEN: self.PAD_INDEX,
            self.UNK_TOKEN: self.UNK_INDEX,
        }
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()}

    def standardize(self, text):
        """Return ``text`` lowercased with all punctuation characters removed."""
        text = text.lower()
        return "".join(char for char in text if char not in string.punctuation)

    def tokenize(self, text):
        """Split ``text`` on runs of whitespace and return the list of tokens."""
        return text.split()

    def make_vocabulary(self, dataset):
        """Rebuild the vocabulary from an iterable of raw text strings.

        Any previously built vocabulary is discarded. Tokens receive consecutive
        indices (starting at 2) in order of first appearance.
        """
        self._reset_vocabulary()
        for text in dataset:
            for token in self.tokenize(self.standardize(text)):
                if token not in self.vocabulary:
                    # The next free index is the current vocabulary size.
                    self.vocabulary[token] = len(self.vocabulary)

        # Reverse mapping (index -> token) used by decode().
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()}

    def encode(self, text):
        """Convert ``text`` to a list of token indices; unknown tokens map to 1."""
        tokens = self.tokenize(self.standardize(text))
        return [self.vocabulary.get(token, self.UNK_INDEX) for token in tokens]

    def decode(self, int_sequence):
        """Convert an iterable of indices back to a space-joined string.

        Indices that are not in the vocabulary are rendered as ``[UNK]``.
        """
        return " ".join(
            self.inverse_vocabulary.get(i, self.UNK_TOKEN) for i in int_sequence)


vectorizer = Vectorizer()

In [12]:
# Toy three-sentence corpus used to build the vocabulary.
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms.',
]
# Sentence used by the encode/decode demonstrations.
test_sentence = "I write, rewrite, and still rewrite again"

vectorizer.make_vocabulary(dataset)
print(vectorizer.vocabulary)

{'': 0, '[UNK]': 1, 'i': 2, 'write': 3, 'erase': 4, 'rewrite': 5, 'again': 6, 'and': 7, 'then': 8, 'a': 9, 'poppy': 10, 'blooms': 11}


In [14]:
# Encode a sentence with the hand-written vectorizer. Words that are not in the
# vocabulary built above (here "still") fall back to the [UNK] index 1.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [15]:
# Map the indices back to tokens; the result is the standardized
# (lowercased, punctuation-free) sentence with unknown words shown as [UNK].
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


In [21]:
from tensorflow.keras.layers import TextVectorization
# Keras layer counterpart of the hand-written Vectorizer; output_mode='int'
# requests sequences of integer token indices.
text_vectorization = TextVectorization(output_mode='int')

In [22]:
import re
import tensorflow as tf

def custom_standard_fn(string_tensor):
    """Lowercase a string tensor and delete every ``string.punctuation`` character."""
    # Character class matching any single punctuation character.
    punctuation_class = f'[{re.escape(string.punctuation)}]'
    lowered = tf.strings.lower(string_tensor)
    stripped = tf.strings.regex_replace(lowered, punctuation_class, '')
    return stripped

def custom_split_fn(string_tensor):
    """Split a string tensor into tokens using ``tf.strings.split`` defaults."""
    tokens = tf.strings.split(string_tensor)
    return tokens

In [24]:
# Build the layer with the custom callables. The keyword for the
# standardization hook is `standardize` (not `standard`); the tokenization
# hook is `split`.
text_vectorization = TextVectorization(
    standardize=custom_standard_fn,
    split=custom_split_fn,
    output_mode='int',
)


In [25]:
# Build the layer's vocabulary from the toy corpus; the bare last expression
# displays the resulting token list.
text_vectorization.adapt(dataset)
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [27]:
# Calling the adapted layer on a raw string returns a tensor of token indices.
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)


In [28]:
# Decode the layer output: a token's position in the vocabulary list is its
# integer index, so enumerate() yields the index -> token mapping.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = dict(enumerate(vocabulary))
# Assign to `decoded_sentence` (a misspelled target would leave the print below
# showing a stale value computed in an earlier cell).
decoded_sentence = ' '.join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


In [29]:
import os

import requests

url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
archive_path = 'aclImdb_v1.tar.gz'

# Skip the download when the archive is already present so the cell can be
# re-run cheaply.
if not os.path.exists(archive_path):
    # Stream into a temporary name and rename at the end, so an interrupted
    # download is never mistaken for a complete archive.
    partial_path = archive_path + '.part'
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as the archive.
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    os.replace(partial_path, archive_path)

In [30]:
import tarfile

# Unpack the downloaded archive into the current working directory.
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    # The archive comes from the network: where the interpreter supports
    # extraction filters, use the 'data' filter so members cannot escape the
    # destination directory. Older interpreters fall back to plain extraction.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()

In [31]:
import shutil

# Delete the 'unsup' subdirectory of the training set; ignore_errors=True makes
# this a no-op when the directory is already gone.
shutil.rmtree('aclImdb/train/unsup', ignore_errors=True)

In [32]:
# Print one positive training review to see what the raw text looks like.
file_path = 'aclImdb/train/pos/4077_10.txt'

with open(file_path, mode='r', encoding='utf-8') as file:
    review_text = file.read()
print(review_text)

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy


In [36]:
import os
import pathlib
import random

# Directory layout of the extracted dataset: train/ and val/ sit side by side
# under the dataset root.
base_dir = pathlib.Path('aclImdb')
train_dir = base_dir.joinpath('train')
val_dir = base_dir.joinpath('val')

In [37]:
# Carve a validation set out of the training data by moving 20% of each
# class's files from <train_dir>/<class> to <val_dir>/<class>.
for category in ('neg', 'pos'):
    (val_dir/category).mkdir(parents=True, exist_ok=True)

    # Re-running this cell must not move a further 20% of the remaining
    # training files: skip any class whose validation directory is populated.
    if any((val_dir/category).iterdir()):
        continue

    # os.listdir returns entries in arbitrary order; sort first so the seeded
    # shuffle selects the same files on every machine and run.
    files = sorted(os.listdir(train_dir/category))
    random.Random(42).shuffle(files)

    num_val_samples = int(0.2*len(files))
    # Slice from an explicit start index: files[-0:] would select every file
    # when the validation count rounds down to zero.
    val_files = files[len(files) - num_val_samples:]

    for fname in val_files:
        shutil.move(train_dir/category/fname, val_dir/category/fname)

In [46]:
from tensorflow import keras

batch_size = 32

# Build the train / validation / test datasets in that order, one per directory.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(directory, batch_size=batch_size)
    for directory in ('aclImdb/train', 'aclImdb/val', 'aclImdb/test')
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [47]:
# Inspect the first batch only: shapes, dtypes and the first sample/label.
for inputs, targets in train_ds.take(1):
    batch_report = (
        ('inputs.shape : ', inputs.shape),
        ('inputs.dtype : ', inputs.dtype),
        ('targets.shape : ', targets.shape),
        ('targets.dtype : ', targets.dtype),
        ('inputs[0] : ', inputs[0]),
        ('targets[0] : ', targets[0]),
    )
    for label, value in batch_report:
        print(label, value)

inputs.shape :  (32,)
inputs.dtype :  <dtype: 'string'>
targets.shape :  (32,)
targets.dtype :  <dtype: 'int32'>
inputs[0] :  tf.Tensor(b'Some films manage to survive almost on originality alone - "Wonderland" is certainly one of those films. The script manages to throw everything into a near-fever pitch, but without making it incoherent. The speed of this thriller is not to chosen to cover up a weak script, but rather to accurately reflect the drug-addled reality.<br /><br />As director, James Cox as a very peculiar way of working his actors. Most of the characters are perpetually on edge, and often because they\'re rather quite ugly personalities. Val Kilmer has described John Holmes to be a hustler, able to manipulate and control. No offense to Kilmer, but his version of Holmes seems only able to control the drastically weak-minded. Nonetheless, it\'s a stunning performance. Comparing this to Kilmer\'s more \'Hollywood\' roles like in "The Saint" it seems to prove he is far more at 

# Bag of Words(BoW)
: 문장 구조나 단어 순서를 무시하고 단어의 빈도에만 초점을 맞춰 텍스트 데이터를 수치화하여 모델에 입력할 수 있도록 하는 기법
- 단어 집합(Vocabulary) <-> 단어 빈도(Word Frequency) ==> 텍스트 데이터를 벡터로 표현한다.

In [48]:
# Unigram bag-of-words: multi-hot encoding with the vocabulary capped at
# 20,000 entries.
text_vectorization = TextVectorization(
    output_mode='multi_hot',
    max_tokens=20000,
)

# adapt() only needs the raw text, so drop the labels before building the vocabulary.
text_only_train_ds = train_ds.map(lambda texts, labels: texts)
text_vectorization.adapt(text_only_train_ds)

In [50]:
# Vectorize the inputs of every split with the adapted layer; labels pass
# through unchanged. num_parallel_calls=4 processes elements in parallel.
binary_lgram_train_ds, binary_lgram_val_ds, binary_lgram_test_ds = (
    split_ds.map(
        lambda texts, labels: (text_vectorization(texts), labels),
        num_parallel_calls=4,
    )
    for split_ds in (train_ds, val_ds, test_ds)
)

In [53]:
# Inspect the first vectorized batch only: shapes, dtypes and the first sample/label.
for inputs, targets in binary_lgram_train_ds.take(1):
    batch_report = (
        ('inputs.shape : ', inputs.shape),
        ('inputs.dtype : ', inputs.dtype),
        ('targets.shape : ', targets.shape),
        ('targets.dtype : ', targets.dtype),
        ('inputs[0] : ', inputs[0]),
        ('targets[0] : ', targets[0]),
    )
    for label, value in batch_report:
        print(label, value)

inputs.shape :  (32, 20000)
inputs.dtype :  <dtype: 'float32'>
targets.shape :  (32,)
targets.dtype :  <dtype: 'int32'>
inputs[0] :  tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0] :  tf.Tensor(0, shape=(), dtype=int32)


In [55]:
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a small dense binary classifier.

    Args:
        max_tokens: Length of each input feature vector.
        hidden_dim: Number of units in the single hidden layer.

    Returns:
        A compiled ``keras.Model`` with one sigmoid output unit, using the
        rmsprop optimizer, binary cross-entropy loss and an accuracy metric.
    """
    features = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation='relu')(features)
    hidden = layers.Dropout(0.5)(hidden)
    probabilities = layers.Dense(1, activation='sigmoid')(hidden)

    model = keras.Model(features, probabilities)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [56]:
# Instantiate the classifier with its default sizes and display the layer summary.
model = get_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [59]:
# save_best_only=True keeps only the best model seen so far in the checkpoint file.
checkpoint = keras.callbacks.ModelCheckpoint('binary_lgram.h5', save_best_only=True)
callbacks = [checkpoint]

# Dataset caching: caching the datasets reduces disk I/O and speeds up training.
# The data is kept in memory so that each epoch can access it quickly.
model.fit(
    binary_lgram_train_ds.cache(),
    validation_data=binary_lgram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
 53/625 [=>............................] - ETA: 1s - loss: 0.2949 - accuracy: 0.8833

  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x280fd8bb0>

In [61]:
# Reload the checkpointed model and report its accuracy on the test set.
model = keras.models.load_model('binary_lgram.h5')
test_metrics = model.evaluate(binary_lgram_test_ds)  # index 1 is the accuracy metric
print(f'테스트 정확도 : {test_metrics[1]:.3f}')

테스트 정확도 : 0.889


## 바이그램(Bigram)
: 연속된 두 단어 쌍

#### * 이진 인코딩 : 바이그램이 존재하면 1, 존재하지 않으면 0

In [62]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='multi_hot',
)

In [None]:
# Build the bigram vocabulary from the raw training text.
text_vectorization.adapt(text_only_train_ds)

# Vectorize the inputs of every split; labels pass through unchanged.
# The keyword must be spelled `num_parallel_calls` -- any other spelling makes
# Dataset.map raise a TypeError for an unexpected keyword argument.
binary_2gram_train_ds = train_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4)

binary_2gram_val_ds = val_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4)

binary_2gram_test_ds = test_ds.map(
    lambda x, y : (text_vectorization(x), y),
    num_parallel_calls = 4)

In [None]:
# Start from a freshly initialised network: `model` would otherwise still be
# the classifier loaded from the unigram checkpoint, and training would
# continue from those weights.
model = get_model()

# save_best_only=True keeps only the best model seen so far in the checkpoint file.
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_2gram.h5', save_best_only=True)
]

# Cache both datasets so vectorization runs once rather than every epoch.
model.fit(
    binary_2gram_train_ds.cache(),
    validation_data=binary_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

In [None]:
# Reload the checkpointed bigram model and report its test accuracy, mirroring
# the evaluation cells of the other sections (index 1 is the accuracy metric).
model = keras.models.load_model('binary_2gram.h5')
print(f'테스트 정확도 : {model.evaluate(binary_2gram_test_ds)[1]:.3f}')


# TF-IDF 벡터라이저

In [69]:
# Variant that returns token counts (frequency-based vectorization) for bigrams.
text_vectorization = TextVectorization(
    output_mode='count',
    max_tokens=20000,
    ngrams=2,
)

In [70]:
# Variant that returns TF-IDF weighted outputs for bigrams.
text_vectorization = TextVectorization(
    output_mode='tf_idf',
    max_tokens=20000,
    ngrams=2,
)

In [71]:
# Build the vocabulary (and TF-IDF statistics) from the raw training text.
text_vectorization.adapt(text_only_train_ds)

# Vectorize the inputs of every split; labels pass through unchanged.
tfidf_2gram_train_ds, tfidf_2gram_val_ds, tfidf_2gram_test_ds = (
    split_ds.map(
        lambda texts, labels: (text_vectorization(texts), labels),
        num_parallel_calls=4,
    )
    for split_ds in (train_ds, val_ds, test_ds)
)

In [74]:
# Train a fresh classifier on the TF-IDF bigram features.
model = get_model()

# save_best_only=True keeps only the best model seen so far in the checkpoint file.
checkpoint = keras.callbacks.ModelCheckpoint('tfidf_2gram.h5', save_best_only=True)
callbacks = [checkpoint]

model.fit(
    tfidf_2gram_train_ds.cache(),
    validation_data=tfidf_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
 66/625 [==>...........................] - ETA: 1s - loss: 0.3537 - accuracy: 0.8546

  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x282933af0>

In [75]:
# Reload the checkpointed model and report its accuracy on the test set.
model = keras.models.load_model('tfidf_2gram.h5')
test_metrics = model.evaluate(tfidf_2gram_test_ds)  # index 1 is the accuracy metric
print(f'테스트 정확도 : {test_metrics[1]:.3f}')

테스트 정확도 : 0.891


In [79]:
# Chain the vectorization layer and the trained classifier into a single model
# that accepts raw strings.
inputs = keras.Input(shape=(1,), dtype='string')  # one raw string per sample
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs=inputs, outputs=outputs)

In [80]:
# A batch holding one raw review, shaped (1, 1) to match the inference model input.
# NOTE(review): 'moive' looks like a typo for 'movie' in the sample text -- confirm
# whether it is intentional before changing it, since it alters the prediction.
raw_text_data = tf.convert_to_tensor([['That was an excellent moive, I love it.']])

predictions = inference_model(raw_text_data)
# Index down to the single scalar before float(): converting a shape-(1,)
# tensor relies on array-to-scalar conversion that NumPy has deprecated.
print(f'긍정적 리뷰일 확률 : {float(predictions[0, 0] * 100):.2f} %')

긍정적 리뷰일 확률 : 67.41 %
