In [13]:
import pickle
from string import digits

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from keras import regularizers
from keras.callbacks import EarlyStopping
from keras.layers import (LSTM, Conv1D, Dense, Dropout, Embedding, Flatten,
                          Input, MaxPool1D, Reshape, concatenate)
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

%matplotlib inline

In [14]:
data_train = pd.read_csv("vlsp_sentiment_train.csv", sep="\t")
data_train.columns = ["Class", "Data"]
data_test = pd.read_csv("vlsp_sentiment_test.csv", sep="\t")
data_test.columns = ["Class", "Data"]

In [15]:
print(data_train.shape)
print(data_test.shape)

(5100, 2)
(1050, 2)


In [16]:
labels = data_train.iloc[:, 0].values
reviews = data_train.iloc[:, 1].values

In [17]:
encoded_labels = []

for label in labels:
    if label == -1:
        encoded_labels.append([1, 0, 0])
    elif label == 0:
        encoded_labels.append([0, 1, 0])
    else:
        encoded_labels.append([0, 0, 1])

encoded_labels = np.array(encoded_labels)

In [18]:
reviews_processed = []
unlabeled_processed = []
for review in reviews:
    review_cool_one = "".join([char for char in review if char not in digits])
    reviews_processed.append(review_cool_one)

In [19]:
# Use PyVi for Vietnamese word tokenizer
word_reviews = []
all_words = []
for review in reviews_processed:
    review = ViTokenizer.tokenize(review.lower())
    word_reviews.append(review.split())

In [20]:
EMBEDDING_DIM = 1536  # OpenAI embedding dimension
MAX_VOCAB_SIZE = (
    10000  # how many unique words to use (i.e num rows in embedding vector)
)
MAX_SEQUENCE_LENGTH = 300  # max number of words in a comment to use

In [21]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews)
sequences_train = tokenizer.texts_to_sequences(word_reviews)
word_index = tokenizer.word_index

In [22]:
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
labels = encoded_labels

In [23]:
print("Shape of X train and X validation tensor:", data.shape)
print("Shape of label train and validation tensor:", labels.shape)

Shape of X train and X validation tensor: (5100, 300)
Shape of label train and validation tensor: (5100, 3)


In [24]:
vocabulary_size = min(len(word_index) + 1, MAX_VOCAB_SIZE)

# Load the embedding matrix
with open("embedding_matrix.pickle", "rb") as handle:
    embedding_matrix = pickle.load(handle)

In [25]:
embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM)
embedding_layer.trainable = True
embedding_layer.set_weights = [embedding_matrix]

In [26]:
sequence_length = data.shape[1]
filter_sizes = [3, 4, 5]
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)

################## CRNN ####################################
reshape = Reshape((sequence_length, EMBEDDING_DIM))(embedding)

conv_0 = Conv1D(
    num_filters,
    (filter_sizes[0],),
    padding="same",
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(reshape)
conv_1 = Conv1D(
    num_filters,
    (filter_sizes[1],),
    padding="same",
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(reshape)
conv_2 = Conv1D(
    num_filters,
    (filter_sizes[2],),
    padding="same",
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(reshape)

conv_0 = MaxPool1D(300)(conv_0)
conv_1 = MaxPool1D(300)(conv_1)
conv_2 = MaxPool1D(300)(conv_2)
# Reshape output to match RNN dimension
conv_0 = Reshape((-1, num_filters))(conv_0)
conv_1 = Reshape((-1, num_filters))(conv_1)
conv_2 = Reshape((-1, num_filters))(conv_2)

concat = concatenate([conv_0, conv_1, conv_2])

# Add a SimpleRNN layer
rnn = SimpleRNN(100)(concat)

dropout = Dropout(drop)(rnn)
output = Dense(
    units=3, activation="softmax", kernel_regularizer=regularizers.l2(0.01)
)(dropout)

# this creates a model that includes
model = Model(inputs, output)

adam = Adam(
    learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0
)
model.compile(
    loss="categorical_crossentropy", optimizer=adam, metrics=["accuracy"]
)
model.summary()



In [27]:
# define callbacks
early_stopping = EarlyStopping(
    monitor="val_loss", min_delta=0.01, patience=4, verbose=1
)
callbacks_list = [early_stopping]

model.fit(
    data,
    labels,
    validation_split=0.2,
    epochs=10,
    batch_size=256,
    callbacks=callbacks_list,
    shuffle=True,
)

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 4s/step - accuracy: 0.4532 - loss: 5.2077 - val_accuracy: 0.0000e+00 - val_loss: 2.7616
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 4s/step - accuracy: 0.6508 - loss: 1.5546 - val_accuracy: 0.0059 - val_loss: 1.8238
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 4s/step - accuracy: 0.7503 - loss: 0.8418 - val_accuracy: 0.0402 - val_loss: 2.0893
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 4s/step - accuracy: 0.8605 - loss: 0.6282 - val_accuracy: 0.0745 - val_loss: 2.2528
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 4s/step - accuracy: 0.9352 - loss: 0.4683 - val_accuracy: 0.0794 - val_loss: 2.4820
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 4s/step - accuracy: 0.9753 - loss: 0.3522 - val_accuracy: 0.0853 - val_loss: 2.8471
Epoch 6: early stopping


<keras.src.callbacks.history.History at 0x7efe26f1ce50>

In [28]:
labels_test = data_test.iloc[:, 0].values
reviews_test = data_test.iloc[:, 1].values

In [29]:
encoded_labels_test = []

for label_test in labels_test:
    if label_test == -1:
        encoded_labels_test.append([1, 0, 0])
    elif label_test == 0:
        encoded_labels_test.append([0, 1, 0])
    else:
        encoded_labels_test.append([0, 0, 1])

encoded_labels_test = np.array(encoded_labels_test)

In [30]:
reviews_processed_test = []
unlabeled_processed_test = []
for review_test in reviews_test:
    review_cool_one = "".join(
        [char for char in review_test if char not in digits]
    )
    reviews_processed_test.append(review_cool_one)

In [31]:
# Use PyVi for Vietnamese word tokenizer
word_reviews_test = []
all_words = []
for review_test in reviews_processed_test:
    review_test = ViTokenizer.tokenize(review_test.lower())
    word_reviews_test.append(review_test.split())

In [32]:
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = encoded_labels_test

In [33]:
print("Shape of X train and X validation tensor:", data_test.shape)
print("Shape of label train and validation tensor:", labels_test.shape)

Shape of X train and X validation tensor: (1050, 300)
Shape of label train and validation tensor: (1050, 3)


In [34]:
score = model.evaluate(data_test, labels_test)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 134ms/step - accuracy: 0.7642 - loss: 0.8568


In [35]:
print("%s: %.2f" % (model.metrics_names[0], score[0]))
print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))

loss: 1.25
compile_metrics: 63.24%
