In [1]:
from string import digits

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from keras.layers import Embedding
from pyvi import ViTokenizer
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    Input,
    MaxPooling1D,
    Reshape,
    concatenate,
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

%matplotlib inline

2024-04-07 15:24:56.941370: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-07 15:24:56.945487: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-07 15:24:56.992741: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_train = pd.read_csv("vlsp_sentiment_train.csv", sep="\t")
data_train.columns = ["Class", "Data"]
data_test = pd.read_csv("vlsp_sentiment_test.csv", sep="\t")
data_test.columns = ["Class", "Data"]

In [3]:
print(data_train.shape)
print(data_test.shape)

(5100, 2)
(1050, 2)


In [4]:
labels = data_train.iloc[:, 0].values
reviews = data_train.iloc[:, 1].values

In [5]:
encoded_labels = []

for label in labels:
    if label == -1:
        encoded_labels.append([1, 0, 0])
    elif label == 0:
        encoded_labels.append([0, 1, 0])
    else:
        encoded_labels.append([0, 0, 1])

encoded_labels = np.array(encoded_labels)

In [6]:
reviews_processed = []
unlabeled_processed = []
for review in reviews:
    review_cool_one = "".join([char for char in review if char not in digits])
    reviews_processed.append(review_cool_one)

In [7]:
# Use PyVi for Vietnamese word tokenizer
word_reviews = []
all_words = []
for review in reviews_processed:
    review = ViTokenizer.tokenize(review.lower())
    word_reviews.append(review.split())

In [8]:
EMBEDDING_DIM = 400  # how big is each word vector
MAX_VOCAB_SIZE = (
    10000  # how many unique words to use (i.e num rows in embedding vector)
)
MAX_SEQUENCE_LENGTH = 300  # max number of words in a comment to use

In [9]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews)
sequences_train = tokenizer.texts_to_sequences(word_reviews)
word_index = tokenizer.word_index

In [10]:
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
labels = encoded_labels

In [11]:
print("Shape of X train and X validation tensor:", data.shape)
print("Shape of label train and validation tensor:", labels.shape)

Shape of X train and X validation tensor: (5100, 300)
Shape of label train and validation tensor: (5100, 3)


In [12]:
word_vectors = KeyedVectors.load_word2vec_format(
    "vi-model-CBOW.bin",
    binary=True,
)

vocabulary_size = min(len(word_index) + 1, MAX_VOCAB_SIZE)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_VOCAB_SIZE:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

del word_vectors

In [13]:
embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM)
embedding_layer.trainable = True
embedding_layer.set_weights = [embedding_matrix]

In [25]:
sequence_length = data.shape[1]
filter_sizes = [3, 4, 5]
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)

conv_0 = Conv1D(
    num_filters,
    filter_sizes[0],
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(embedding)
conv_1 = Conv1D(
    num_filters,
    filter_sizes[1],
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(embedding)
conv_2 = Conv1D(
    num_filters,
    filter_sizes[2],
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(embedding)

maxpool_0 = MaxPooling1D(sequence_length - filter_sizes[0] + 1, strides=1)(
    conv_0
)
maxpool_1 = MaxPooling1D(sequence_length - filter_sizes[1] + 1, strides=1)(
    conv_1
)
maxpool_2 = MaxPooling1D(sequence_length - filter_sizes[2] + 1, strides=1)(
    conv_2
)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)
reshape = Reshape((3 * num_filters,))(flatten)
dropout = Dropout(drop)(flatten)
output = Dense(
    units=3, activation="softmax", kernel_regularizer=regularizers.l2(0.01)
)(dropout)

# This creates a model that includes
model = Model(inputs, output)

adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(
    loss="categorical_crossentropy", optimizer=adam, metrics=["accuracy"]
)
model.summary()

# Define callbacks
early_stopping = EarlyStopping(
    monitor="val_loss", min_delta=0.01, patience=4, verbose=1
)
callbacks_list = [early_stopping]

In [15]:
model.fit(
    data,
    labels,
    validation_split=0.2,
    epochs=5,
    batch_size=256,
    callbacks=callbacks_list,
    shuffle=True,
)

Epoch 1/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 904ms/step - accuracy: 0.4172 - loss: 5.0566 - val_accuracy: 0.0000e+00 - val_loss: 3.7620
Epoch 2/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 874ms/step - accuracy: 0.5895 - loss: 2.4886 - val_accuracy: 0.0000e+00 - val_loss: 2.3524
Epoch 3/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 882ms/step - accuracy: 0.6645 - loss: 1.4335 - val_accuracy: 0.0000e+00 - val_loss: 1.9371
Epoch 4/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 882ms/step - accuracy: 0.6526 - loss: 1.0619 - val_accuracy: 0.0000e+00 - val_loss: 1.8502
Epoch 5/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 878ms/step - accuracy: 0.6974 - loss: 0.8909 - val_accuracy: 0.0000e+00 - val_loss: 1.7221


<keras.src.callbacks.history.History at 0x7f8b795f31c0>

In [16]:
labels_test = data_test.iloc[:, 0].values
reviews_test = data_test.iloc[:, 1].values

In [17]:
encoded_labels_test = []

for label_test in labels_test:
    if label_test == -1:
        encoded_labels_test.append([1, 0, 0])
    elif label_test == 0:
        encoded_labels_test.append([0, 1, 0])
    else:
        encoded_labels_test.append([0, 0, 1])

encoded_labels_test = np.array(encoded_labels_test)

In [18]:
reviews_processed_test = []
unlabeled_processed_test = []
for review_test in reviews_test:
    review_cool_one = "".join(
        [char for char in review_test if char not in digits]
    )
    reviews_processed_test.append(review_cool_one)

In [19]:
# Use PyVi for Vietnamese word tokenizer
word_reviews_test = []
all_words = []
for review_test in reviews_processed_test:
    review_test = ViTokenizer.tokenize(review_test.lower())
    word_reviews_test.append(review_test.split())

In [20]:
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = encoded_labels_test

In [21]:
print("Shape of X train and X validation tensor:", data_test.shape)
print("Shape of label train and validation tensor:", labels_test.shape)

Shape of X train and X validation tensor: (1050, 300)
Shape of label train and validation tensor: (1050, 3)


In [22]:
score = model.evaluate(data_test, labels_test)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.7384 - loss: 0.8569


In [23]:
print("%s: %.2f%%" % (model.metrics_names[0], score[0] * 100))
print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))

loss: 105.85%
compile_metrics: 53.24%
