In [2]:
from string import digits

import numpy as np
import pandas as pd
from gensim.models.keyedvectors import KeyedVectors
from keras import regularizers
from keras.callbacks import EarlyStopping
from keras.layers import (LSTM, Conv1D, Dense, Dropout, Embedding, Flatten,
                          Input, MaxPool1D, Reshape, concatenate)
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

%matplotlib inline

In [3]:
data_train = pd.read_csv("vlsp_sentiment_train.csv", sep="\t")
data_train.columns = ["Class", "Data"]
data_test = pd.read_csv("vlsp_sentiment_test.csv", sep="\t")
data_test.columns = ["Class", "Data"]

In [4]:
print(data_train.shape)
print(data_test.shape)

(5100, 2)
(1050, 2)


In [5]:
labels = data_train.iloc[:, 0].values
reviews = data_train.iloc[:, 1].values

In [6]:
encoded_labels = []

for label in labels:
    if label == -1:
        encoded_labels.append([1, 0, 0])
    elif label == 0:
        encoded_labels.append([0, 1, 0])
    else:
        encoded_labels.append([0, 0, 1])

encoded_labels = np.array(encoded_labels)

In [7]:
reviews_processed = []
unlabeled_processed = []
for review in reviews:
    review_cool_one = "".join([char for char in review if char not in digits])
    reviews_processed.append(review_cool_one)

In [8]:
# Use PyVi for Vietnamese word tokenizer
word_reviews = []
all_words = []
for review in reviews_processed:
    review = ViTokenizer.tokenize(review.lower())
    word_reviews.append(review.split())

In [9]:
EMBEDDING_DIM = 400  # how big is each word vector
MAX_VOCAB_SIZE = (
    10000  # how many unique words to use (i.e num rows in embedding vector)
)
MAX_SEQUENCE_LENGTH = 300  # max number of words in a comment to use

In [10]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews)
sequences_train = tokenizer.texts_to_sequences(word_reviews)
word_index = tokenizer.word_index

In [11]:
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
labels = encoded_labels

In [12]:
print("Shape of X train and X validation tensor:", data.shape)
print("Shape of label train and validation tensor:", labels.shape)

Shape of X train and X validation tensor: (5100, 300)
Shape of label train and validation tensor: (5100, 3)


In [13]:
word_vectors = KeyedVectors.load_word2vec_format(
    "vi-model-CBOW.bin", binary=True
)

vocabulary_size = min(len(word_index) + 1, MAX_VOCAB_SIZE)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
print("vocab size", vocabulary_size)
for word, i in word_index.items():
    if i >= MAX_VOCAB_SIZE:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

del word_vectors

vocab size 7919


In [14]:
embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM)
embedding_layer.trainable = True
embedding_layer.set_weights = [embedding_matrix]

In [15]:
sequence_length = data.shape[1]
filter_sizes = [3, 4, 5]
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)

################## LSTM + CNN ###############################
reshape = Reshape((sequence_length, EMBEDDING_DIM))(embedding)

lstm_2 = LSTM(1024, return_sequences=True)(reshape)
lstm_1 = LSTM(512, return_sequences=True)(lstm_2)

conv_0 = Conv1D(
    num_filters,
    (filter_sizes[0],),
    padding="same",
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(lstm_1)
conv_1 = Conv1D(
    num_filters,
    (filter_sizes[1],),
    padding="same",
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(lstm_1)
conv_2 = Conv1D(
    num_filters,
    (filter_sizes[2],),
    padding="same",
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01),
)(lstm_1)

conv_0 = MaxPool1D(pool_size=2)(conv_0)
conv_1 = MaxPool1D(pool_size=2)(conv_1)
conv_2 = MaxPool1D(pool_size=2)(conv_2)

concat = concatenate([conv_0, conv_1, conv_2])

lstm_0 = LSTM(256)(concat)

dropout = Dropout(drop)(lstm_0)
output = Dense(
    units=3, activation="softmax", kernel_regularizer=regularizers.l2(0.01)
)(dropout)

# this creates a model that includes
model = Model(inputs, output)

adam = Adam(
    learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0
)
model.compile(
    loss="categorical_crossentropy", optimizer=adam, metrics=["accuracy"]
)
model.summary()



In [16]:
# define callbacks
early_stopping = EarlyStopping(
    monitor="val_loss", min_delta=0.01, patience=4, verbose=1
)
callbacks_list = [early_stopping]

model.fit(
    data,
    labels,
    validation_split=0.2,
    epochs=10,
    batch_size=256,
    callbacks=callbacks_list,
    shuffle=True,
)

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 6s/step - accuracy: 0.4257 - loss: 5.3311 - val_accuracy: 0.0000e+00 - val_loss: 4.7445
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 6s/step - accuracy: 0.4226 - loss: 2.7999 - val_accuracy: 0.0000e+00 - val_loss: 2.3884
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 6s/step - accuracy: 0.4163 - loss: 1.7154 - val_accuracy: 0.0000e+00 - val_loss: 1.9784
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 6s/step - accuracy: 0.4587 - loss: 1.2576 - val_accuracy: 0.0000e+00 - val_loss: 1.9511
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 6s/step - accuracy: 0.5968 - loss: 1.0221 - val_accuracy: 0.0000e+00 - val_loss: 1.8798
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 6s/step - accuracy: 0.6495 - loss: 0.8766 - val_accuracy: 0.0863 - val_loss: 2.1754
Epoch 7/10
[1m16/1

<keras.src.callbacks.history.History at 0x7fc0c06b7ee0>

In [17]:
labels_test = data_test.iloc[:, 0].values
reviews_test = data_test.iloc[:, 1].values

In [18]:
encoded_labels_test = []

for label_test in labels_test:
    if label_test == -1:
        encoded_labels_test.append([1, 0, 0])
    elif label_test == 0:
        encoded_labels_test.append([0, 1, 0])
    else:
        encoded_labels_test.append([0, 0, 1])

encoded_labels_test = np.array(encoded_labels_test)

In [19]:
reviews_processed_test = []
unlabeled_processed_test = []
for review_test in reviews_test:
    review_cool_one = "".join(
        [char for char in review_test if char not in digits]
    )
    reviews_processed_test.append(review_cool_one)

In [20]:
# Use PyVi for Vietnamese word tokenizer
word_reviews_test = []
all_words = []
for review_test in reviews_processed_test:
    review_test = ViTokenizer.tokenize(review_test.lower())
    word_reviews_test.append(review_test.split())

In [21]:
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = encoded_labels_test

In [22]:
print("Shape of X train and X validation tensor:", data_test.shape)
print("Shape of label train and validation tensor:", labels_test.shape)

Shape of X train and X validation tensor: (1050, 300)
Shape of label train and validation tensor: (1050, 3)


In [23]:
score = model.evaluate(data_test, labels_test)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 683ms/step - accuracy: 0.7049 - loss: 0.9166


In [25]:
print("%s: %.2f" % (model.metrics_names[0], score[0]))
print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))

loss: 1.35
compile_metrics: 60.48%
