In [1]:
import os
import pandas as pd
import numpy as np

from pyvi.ViTokenizer import ViTokenizer #vietnamese tokenizer

import re #regular expression

import pickle
from keras_preprocessing.sequence import pad_sequences

from tensorflow import keras
from tensorflow.keras.preprocessing import text, sequence 

In [2]:
DIR_DATASET = os.path.join('dataset')

EMBEDDING_PATH = os.path.join(DIR_DATASET, 'cc.vi.300.vec')
TOKENIZER_PATH = os.path.join(DIR_DATASET, 'tokenizer.pickle')
VISTOPWORDS_PATH = os.path.join(DIR_DATASET, "vietnamese-stopwords.txt")

MODEL_DIR = os.path.join('model')

In [3]:
DIR_DATASET = os.path.join('dataset')

EMBEDDING_PATH = os.path.join(DIR_DATASET, 'cc.vi.300.vec')
TOKENIZER_PATH = os.path.join(DIR_DATASET, 'tokenizer.pickle')
VISTOPWORDS_PATH = os.path.join(DIR_DATASET, "vietnamese-stopwords.txt")

MODEL_DIR = os.path.join('model')

In [4]:
# Load the pre-trained model
CNNS_PATH = os.path.join(MODEL_DIR, 'CNNs.keras')

textCNNModel = keras.models.load_model(CNNS_PATH)

LSTM_PATH = os.path.join(MODEL_DIR, 'LSTM.keras')

LSTM = keras.models.load_model(LSTM_PATH)

Prepare

In [5]:
with open(VISTOPWORDS_PATH, "r", encoding="utf-8") as ins:
    stopwords = []
    for line in ins:
        dd = line.strip('\n')
        stopwords.append(dd)
    stopwords = set(stopwords)

In [6]:
PATH_TEST_V1 = os.path.join(DIR_DATASET, 'core-data\\test_ver1.csv')
data_test_v1 = pd.read_csv(PATH_TEST_V1)

In [7]:
data_test_v1.shape

(3974, 11)

In [8]:
data_test_v1.columns

Index(['link', 'rating', 'comment', 'categories', 'category', 'product_name',
       'description', 'num_sold', 'num_review', 'label', 'spam_label'],
      dtype='object')

In [9]:
data_test_v1=data_test_v1.drop(['link', 'rating', 'categories', 'category', 'product_name',
       'description', 'num_sold', 'num_review', 'spam_label'], axis=1)

In [10]:
data_test_v1=data_test_v1.drop_duplicates(keep='first')

In [11]:
data_test_v1.duplicated().sum()

0

In [12]:
data_test_v1.shape

(3971, 2)

In [13]:
# filder stop word
def filter_stop_words(train_sentences, stop_words):
    new_sent = [word for word in train_sentences.split() if word not in stop_words]
    train_sentences = ' '.join(new_sent)
    return train_sentences

# remove emoji
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)
# declare func and adding some remove
def preprocess(text, tokenized = True, lowercased = True):
    text = ViTokenizer.tokenize(text) if tokenized else text
    text = filter_stop_words(text, stopwords)
    text = deEmojify(text)
    text = text.lower() if lowercased else text

    text = text.strip()
    text = re.compile('<.*?>').sub('', text)
    text = re.sub('\s+', ' ', text) 
    text = re.sub(r'\[[0-9]*\]',' ',text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d',' ',text)
    text = re.sub(r'\s+',' ',text)
    return text

def pre_process_df(X, tokenized=True, lowercased=True):
    X = np.array(X)
    to_delete_indices = []
    
    # Preprocess and track indices of empty elements
    for idx, ele in enumerate(X):
        processed_ele = preprocess(str(ele), tokenized=tokenized, lowercased=lowercased)
        if not processed_ele:
            to_delete_indices.append(idx)
        X[idx] = processed_ele
    
    # Remove empty elements
    X = np.delete(X, to_delete_indices)
    
    return X, to_delete_indices

In [14]:
X_v1 = data_test_v1.iloc[:, 0:1]
y_v1 = data_test_v1.iloc[:, 1:2]

In [15]:
embeddings_index = {}
with open(EMBEDDING_PATH, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [16]:
v1_X, to_delete_indices = pre_process_df(X_v1['comment'], tokenized=True, lowercased=True)

# Remove the rows in the original DataFrame
data_test_v1 = data_test_v1.drop(index=to_delete_indices).reset_index(drop=True)

In [17]:
sequence_length = 100
tokenizer = text.Tokenizer(lower=False, filters='!"#$%&()*+,-./:;<=>?@[\]^`{|}~\t\n')

# load internal vocabulary
train_X = np.load('dataset/internal-vocabulary.npy')

tokenizer.fit_on_texts(train_X)

with open(TOKENIZER_PATH, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

def make_featues(X, tokenizer, is_one_hot_label=False, number_class1=2): # my model just binary (non-spam or not spam)
    X = tokenizer.texts_to_sequences(X)
    X = pad_sequences(X, maxlen=sequence_length)
    if is_one_hot_label: 
        y = to_categorical(y, num_classes=number_class1)
    return X

In [18]:
v1_XX = make_featues(v1_X, tokenizer)
v1_XX[3]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,   23, 1870,   72, 1071, 3030,  348,    5,  328,  878,   41,
        338, 2710,   13,  137,  242, 2228,  945, 1870,   53, 1202,  182,
        485,   53,  227,  460,  945,  528,  606,   18,   14,  460,   71,
          3])

In [19]:
v1_XX[4]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    1,
        281,   40,   96,    8,  671,  255,  223,   13, 1495, 1708,  178,
         14])

In [20]:
# Make predictions
predictions = textCNNModel.predict(v1_XX)

predictions

[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


array([[8.9397060e-04],
       [1.0195284e-06],
       [9.5824361e-01],
       ...,
       [1.7175148e-11],
       [1.9218168e-01],
       [9.8872846e-01]], dtype=float32)

In [21]:
predictions.shape

(3962, 1)

In [22]:
data_test_v1.shape

(3962, 2)

In [23]:
data_test_v1['predicted_label'] = (predictions > 0.5).astype(int)

In [24]:
data_test_v1

Unnamed: 0,comment,label,predicted_label
0,Shop giao hàng khá nhanh. Chất liệu cũng được ...,0,0
1,cát khá thơm\r\nnhà mình nuôi 6 bé mà k bị thú...,0,0
2,Hxgsydùọgdyhcgstdofueycidyxcjckblgidudufkvjfjc...,1,1
3,Mình đã dùng nhiều loại kcn tầm trung nhưng ph...,0,0
4,Dsax nhận được hàng. Test thử vài hôm thấy ok....,0,0
...,...,...,...
3957,Sản phẩm OK rất đẹp !cảm ơn Shop Sp rẻ mà rất ...,0,0
3958,Rêu được lấy từ rừng về đã rửa sạch đất cát rồ...,1,0
3959,"Hình ảnh k liên quan lắm :v đai chắc chắn, nhỏ...",0,0
3960,"sản phẩm nhìn bên ngoài quá ngầu , shop giao n...",0,0


In [25]:
# Calculate accuracy
accuracy = (data_test_v1['label'] == data_test_v1['predicted_label']).mean()

print(f"Model accuracy: {accuracy:.2%}")

Model accuracy: 81.65%
