In [1]:
!pip install tensorflow==2.20.0



## 1. Load dữ liệu

In [2]:
import json
import pathlib
import urllib.request
import tempfile

url = "https://storage.googleapis.com/learning-datasets/sarcasm.json"

# Cache the dataset in the temporary directory appropriate for the operating system.
data_path = pathlib.Path(tempfile.gettempdir()) / "sarcasm.json"

if not data_path.exists():
    # Download to a sibling ".part" file and rename only after the transfer
    # succeeds, so an interrupted download never leaves a truncated file that
    # would pass the exists() check on the next run and then break json.load.
    partial_path = data_path.with_name(data_path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial_path)
        partial_path.replace(data_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_path.unlink(missing_ok=True)

with data_path.open("r", encoding="utf-8") as f:
    datastore = json.load(f)

print(len(datastore))
print(datastore[0])

26709
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}


In [3]:
# Dataset source: https://storage.googleapis.com/learning-datasets/sarcasm.json
# Inspect a single record with: datastore[0]

## 2. Xử lý dữ liệu

In [4]:
# Split every record into its headline text and its sarcasm label (parallel lists).
dataset = [record["headline"] for record in datastore]
label_dataset = [record["is_sarcastic"] for record in datastore]

In [5]:
import numpy as np

# Rebind both names from Python lists to NumPy arrays so they can be sliced together.
dataset, label_dataset = np.array(dataset), np.array(label_dataset)

In [6]:
# Preview the first ten headlines alongside their labels.
dataset[:10], label_dataset[:10]

(array(["former versace store clerk sues over secret 'black code' for minority shoppers",
        "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
        "mom starting to fear son's web series closest thing she will have to grandchild",
        'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
        'j.k. rowling wishes snape happy birthday in the most magical way',
        "advancing the world's women",
        'the fascinating case for eating lab-grown meat',
        'this ceo will send your kids to school, if you work for his company',
        'top snake handler leaves sinking huckabee campaign',
        "friday's morning email: inside trump's presser for the ages"],
       dtype='<U254'),
 array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0]))

In [7]:
# Show the headline array; NumPy abbreviates the repr and reports shape and dtype.
dataset

array(["former versace store clerk sues over secret 'black code' for minority shoppers",
       "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
       "mom starting to fear son's web series closest thing she will have to grandchild",
       ..., 'reparations and obama',
       'israeli ban targeting boycott supporters raises alarm abroad',
       'gourmet gifts for the foodie 2014'], shape=(26709,), dtype='<U254')

Chia dữ liệu

In [8]:
# Sequential (unshuffled) split: the first 80% of the rows train the model,
# the remaining rows are held out for evaluation.
train_size = 0.8
size = int(len(dataset) * train_size)  # number of training rows

train_sentence, test_sentence = dataset[:size], dataset[size:]
train_label, test_label = label_dataset[:size], label_dataset[size:]

In [9]:
# Number of rows in the training and test splits.
len(train_sentence), len(test_sentence)

(21367, 5342)

### 2.1. Train_sequences

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Value passed as `num_words` when the Tokenizer is constructed in the next cell.
vocab_size = 3000

In [12]:
# Build the word index from the training headlines only; "<OOV>" is the
# placeholder token used for words the tokenizer does not keep.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentence)

In [13]:
# NOTE(review): this rebinds vocab_size from the num_words cap (3000) to the size
# of the tokenizer's full word index, so the name no longer matches the cap the
# tokenizer was built with — confirm this is intended.
vocab_size = len(tokenizer.word_index)
# NOTE(review): embedding_size is reassigned to 100 before the model is built.
embedding_size = 64
# Every headline is padded or truncated to this many tokens.
max_length = 25

In [14]:
# Size of the tokenizer's word index after fitting.
vocab_size

26507

In [15]:
# Convert each training headline into a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(train_sentence)

In [16]:
# Raw text of the second training headline.
train_sentence[1]

np.str_("the 'roseanne' revival catches up to our thorny political mood, for better and worse")

In [17]:
# The same headline after conversion to word indices.
train_sequences[1]

[4, 1, 2989, 2990, 22, 2, 154, 1, 388, 2751, 6, 265, 9, 965]

### Padding

In [18]:
# Pad or truncate at the end of each sequence so every row has exactly max_length entries.
padded_train_sequences = pad_sequences(train_sequences, maxlen=max_length, truncating="post", padding="post")

In [19]:
# Fixed sequence length used for padding.
max_length

25

In [20]:
# Inspect two padded rows (rows 2 and 3) of the training matrix.
padded_train_sequences[2:4]

array([[ 156,  924,    2,  865, 1530, 2097,  599,    1,  220,  135,   39,
          45,    2,    1,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0],
       [1352,   37,  218,  382,    2, 1680,   29,  294,   22,   10, 2359,
        1416,    1, 1004,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]], dtype=int32)

In [21]:
# Shape of the padded training matrix: (number of headlines, max_length).
padded_train_sequences.shape

(21367, 25)

### 2.2. Test_sequences

In [22]:
# Encode the test headlines with the tokenizer that was fitted on the training split.
test_sequences = tokenizer.texts_to_sequences(test_sentence)

In [23]:
# Apply the same end-padding / end-truncation as for the training sequences.
padded_test_sequences = pad_sequences(test_sequences, maxlen=max_length, truncating="post", padding="post")

In [24]:
# Shape of the padded test matrix: (number of headlines, max_length).
padded_test_sequences.shape

(5342, 25)

Custom Model LSTM

In [25]:
import tensorflow as tf

In [26]:
class LSTM(tf.keras.layers.Layer):
  """Hand-written LSTM cell that advances the recurrent state by one time step.

  The cell has no bias terms. Its parameters are two stacked weight tensors
  whose leading axis selects the gate:
    index 0 -> input gate, 1 -> forget gate, 2 -> output gate,
    index 3 -> candidate cell content.

  Parameters:
    units: size of the hidden state h and of the cell state c.
    inp_shape: size of one input vector (e.g. the embedding size).
  """

  def __init__(self, units, inp_shape):
    super().__init__()
    self.units = units
    self.inp_shape = inp_shape
    # Input-to-hidden weights, one (units, inp_shape) matrix per gate.
    # No initializer is given, so the Keras default is used.
    self.W = self.add_weight(name="W", shape=(4, self.units, self.inp_shape))
    # Hidden-to-hidden weights, one (units, units) matrix per gate.
    self.U = self.add_weight(name="U", shape=(4, self.units, self.units))

  def _pre_activation(self, gate, x, pre_h):
    """Return x @ W[gate]^T + pre_h @ U[gate]^T for the gate at index `gate`."""
    from_input = tf.matmul(x, tf.transpose(self.W[gate]))
    from_hidden = tf.matmul(pre_h, tf.transpose(self.U[gate]))
    return from_input + from_hidden

  def call(self, pre_layer, x):
    """Compute the next (h, c) pair from the previous one and the current input.

    Parameters:
      pre_layer: previous state, the stack [h, c] with shape (2, batch, units).
      x: current input with shape (batch, inp_shape).

    Returns:
      The new state as the stack [h, c], shape (2, batch, units).
    """
    pre_h, pre_c = tf.unstack(pre_layer)

    # Input gate: how much of the new candidate is written to the cell.
    i_t = tf.nn.sigmoid(self._pre_activation(0, x, pre_h))
    # Forget gate: how much of the previous cell state is kept.
    f_t = tf.nn.sigmoid(self._pre_activation(1, x, pre_h))
    # Output gate: how much of the cell state is exposed as h.
    o_t = tf.nn.sigmoid(self._pre_activation(2, x, pre_h))
    # Candidate content; this part has the same form as a SimpleRNN step.
    n_c_t = tf.nn.tanh(self._pre_activation(3, x, pre_h))

    # Blend retained memory with the newly written content.
    c = f_t * pre_c + i_t * n_c_t
    # Gate what leaves the cell.
    h = o_t * tf.nn.tanh(c)

    return tf.stack([h, c])


In [27]:
import tensorflow as tf
import numpy as np

# Smoke-test the custom cell on random data for a single time step.
units = 128        # size of the hidden / cell state
input_shape = 300  # size of one input vector
lstm_layer = LSTM(units, input_shape)

batch_size = 32
sequence_length = 10  # declared for reference only; this one-step check never reads it
x = tf.random.normal(shape=(batch_size, input_shape))

# Random previous state, packed as [h, c] the way LSTM.call unpacks it.
initial_h = tf.random.normal(shape=(batch_size, units))
initial_c = tf.random.normal(shape=(batch_size, units))
initial_state = tf.stack([initial_h, initial_c])

# One step of the cell: returns the new [h, c] stack.
output = lstm_layer(initial_state, x)

print("Output shape:", output.shape)


Output shape: (2, 32, 128)


## 3. Xây dựng mô hình

In [28]:
import numpy as np

In [29]:
class Bidirectional(tf.keras.Model):
  """Placeholder for a wrapper intended to run two LSTM cells in opposite directions.

  Only the constructor does anything; `call` is an unimplemented stub.
  """

  def __init__(self, lstm1, lstm2):
    super().__init__()
    self.lstm1, self.lstm2 = lstm1, lstm2

  def call(self):
    """Not implemented yet: takes no inputs and returns None."""
    # TODO: left-to-right pass
    # TODO: right-to-left pass
    return None

In [30]:
class ProtonXRNN(tf.keras.Model):
  """Binary sentence classifier: Embedding -> custom LSTM cell unrolled over time -> dense head.

  Parameters:
    units: size of the LSTM hidden / cell state.
    embedding_size: size of each word vector.
    vocab_size: number of rows in the embedding table.
    input_length: number of time steps read from every input sentence.
  """

  def __init__(self, units, embedding_size, vocab_size, input_length):
    super(ProtonXRNN, self).__init__()
    self.input_length = input_length
    self.units = units

    # Embedding that turns word indices into vectors.
    # The deprecated `input_length` argument is no longer passed: Keras 3 ignores
    # it and only emits a deprecation warning. The sequence length is still
    # enforced by the unrolling loop in `call`.
    self.embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_size
    )

    # Reuse the hand-written LSTM cell defined earlier in the notebook.
    self.lstm = LSTM(units, embedding_size)

    # Small feed-forward head applied to the final hidden state.
    # The input shape is declared with an explicit Input object instead of the
    # `input_shape` layer argument, which Keras 3 warns against.
    # (The attribute name keeps its original spelling so existing references
    # and saved weights keep working.)
    self.classfication_model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(units,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])


  def call(self, sentence):
    """
    Run the classifier on a batch of padded sentences.

    Parameters:
    sentence:
      Type: Tensor
      Description: word indices of each sentence
      Shape: (batch_size, input_length)

    Returns:
      Type: Tensor
      Description: model prediction (sigmoid output)
      Shape: (batch_size, 1)
    """

    batch_size = tf.shape(sentence)[0]

    # Initial state: hidden state and cell state, both zeros.
    pre_layer = tf.stack([
      tf.zeros([batch_size, self.units]),
      tf.zeros([batch_size, self.units])
    ])

    # Look up the word vectors.
    # embedded_sentence: (batch_size, input_length, embedding_size)
    embedded_sentence = self.embedding(sentence)

    # Feed the words one at a time through the LSTM cell, carrying the
    # (hidden_state, cell_state) pair from each step into the next.
    for i in range(self.input_length):
      # Slice out position i for the whole batch: (batch_size, embedding_size)
      word = embedded_sentence[:, i, :]
      pre_layer = self.lstm(pre_layer, word)


    h, _ = tf.unstack(pre_layer)

    # Predict from the final hidden state.
    return self.classfication_model(h)

# Hyper-parameters of the classifier.
units = 128
embedding_size = 100
# One more row than there are indexed words — presumably so that index 0,
# the value pad_sequences fills with, also has an embedding row.
vocab_size = len(tokenizer.index_word) + 1
input_length = max_length

# Instantiate the model.
protonxrnn = ProtonXRNN(units, embedding_size, vocab_size, input_length)

protonxrnn.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Shape check: forward three padded headlines through the untrained model.
protonxrnn(padded_train_sequences[10:13]).shape

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


TensorShape([3, 1])

## 4. Tiến hành training

In [31]:
# Train for 10 epochs, evaluating on the held-out split after every epoch.
protonxrnn.fit(
    x=padded_train_sequences,
    y=train_label,
    validation_data=(padded_test_sequences, test_label),
    batch_size=32,
    epochs=10,
)

Epoch 1/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 116ms/step - acc: 0.6962 - loss: 0.5414 - val_acc: 0.8282 - val_loss: 0.3846
Epoch 2/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 69ms/step - acc: 0.8597 - loss: 0.3292 - val_acc: 0.8409 - val_loss: 0.3572
Epoch 3/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 53ms/step - acc: 0.8787 - loss: 0.2902 - val_acc: 0.8409 - val_loss: 0.3528
Epoch 4/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 50ms/step - acc: 0.8840 - loss: 0.2733 - val_acc: 0.8381 - val_loss: 0.3696
Epoch 5/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 48ms/step - acc: 0.8901 - loss: 0.2592 - val_acc: 0.8371 - val_loss: 0.3824
Epoch 6/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 49ms/step - acc: 0.8960 - loss: 0.2430 - val_acc: 0.8377 - val_loss: 0.3998
Epoch 7/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s

<keras.src.callbacks.history.History at 0x1b6d3942cf0>