# Import libraries

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os.path

import string
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

from PIL import Image

from datetime import datetime

from pythainlp import word_vector
from pythainlp.tokenize import word_tokenize

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from keras.initializers import Constant
from keras.models import Sequential, load_model
from keras.layers import Flatten, Dense, GRU, Bidirectional, LSTM
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D
from keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neighbors import NearestNeighbors

Using TensorFlow backend.


# Text classification model
- Use _pythainlp_ to tokenize Thai words.
- Use _word_vector_ from _pythainlp_ to vectorize Thai words (similar to _Word2vec_ ).
- The description texts are converted to sequences in the shape of (dataset_length, max_description_length + 1).
- The sequences are then fed into the Embedding layer and then into the sequence model (a bidirectional LSTM in the final version).
- In my experiments, a bidirectional LSTM performed better than a CNN, a unidirectional LSTM, and a bidirectional GRU; the average accuracies of those alternatives were all lower than that of the bidirectional LSTM.
- An LSTM is well suited to sequential input, and the bidirectional version helps the model read the context in both the forward and the backward direction.
- Adding more Dense layers does not increase accuracy. A single 8-unit Dense layer is enough.
- This is a binary classification model; therefore, the output node gives the probability that the promotion is spam.

- Load a model to vectorize Thai words.

In [17]:
# Fetch pythainlp's word-vector model. Later cells index it by token
# (model_word[word]) to obtain the vector used in the embedding matrix.
model_word = word_vector.get_model()

- Read the column _description_ of both training & testing dataset.

In [18]:
# Extract the raw description strings of both datasets as arrays.
# The test descriptions are kept so that they are tokenized together with the
# training descriptions in the next cell (the Keras tokenizer itself is later
# fitted on the training part only).
x_text_train, x_text_df_test = (
    frame['description'].values for frame in (df_train, df_test)
)

- Tokenize the description texts. The final result is a list of sentences. Each sentence is a list of words (tokens).
- Clean the tokens, eliminate punctuation, extra space, '\n', and empty string.

In [19]:
# Translation table that deletes ASCII punctuation, curly quotes, newlines and
# spaces from a token. It does not depend on the line being processed, so it
# is built once instead of once per description.
table = str.maketrans('', '', string.punctuation + '“”\n ')

tokens_lines = list() # 2D list containing word tokens, training lines first, then test lines
for line in np.concatenate((x_text_train, x_text_df_test)):
    # Split the description into Thai word tokens, then strip unwanted characters.
    tokens = [token.translate(table) for token in word_tokenize(line)]
    # Tokens that consisted only of stripped characters are now empty: drop them.
    tokens = [token for token in tokens if token]
    tokens_lines.append(tokens)

- Define the length of training dataset, the maximum sentence length, and the dimension of word vector.

In [20]:
# Number of training descriptions; the first `token_train_len` entries of
# tokens_lines are the training lines.
# NOTE: x_text_train is rebound to padded/split arrays in a later cell, so
# re-running this cell after that one yields a different length.
token_train_len = len(x_text_train)

# Padded sequence length: quantile 1 (i.e. the maximum) of the training token
# counts, plus one.
token_max_len = 1 + int(np.quantile(list(map(len, tokens_lines[:token_train_len])), 1))

# Dimension of one word vector, measured on the first word of the vocabulary
# (300 for this model, per the model summary below).
embed_dim = len(model_word[model_word.index2word[0]])

- Create sequences (a list of sentences in which each token is replaced by its word index number).
- Split the training and testing set by the ratio 80:20.

In [21]:
# Build the vocabulary (word -> integer index) from the training descriptions only.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(tokens_lines[:token_train_len]) # create word indexes (number)

# create sequences of training texts (substitute each word in tokens_lines by the number)
sequences = tokenizer_obj.texts_to_sequences(tokens_lines[:token_train_len])

word_index = tokenizer_obj.word_index
# +1 because the Keras tokenizer starts word indexes at 1; index 0 is the value
# pad_sequences fills with, so it needs its own row in the embedding matrix.
num_words = len(word_index) + 1
print('%s unique tokens (including an unknown token)' % num_words)

# Left-pad (or truncate) every sequence to exactly token_max_len indexes.
x_text_train = pad_sequences(sequences, maxlen=token_max_len, padding='pre')

# split into train & test (80:20).
# A fixed random_state makes the split -- and therefore every metric reported
# below -- reproducible across kernel restarts.
x_text_train, x_text_test, y_text_train, y_text_test = train_test_split(x_text_train,
                                                                        df_train['bad'],
                                                                        test_size=0.2,
                                                                        random_state=42)

5002 unique tokens (including an unknown token)


- Create a matrix of word vectors used to initialize the embedding layer.

In [22]:
# One row per word index (row 0 is left for the padding index). Rows stay
# all-zero for words that the word-vector model does not contain.
embedding_matrix = np.zeros((num_words, embed_dim))

for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, so an index equal to num_words is
    # already out of range and must be skipped as well (was `>`).
    if i >= num_words:
        continue
    if word in model_word:
        embedding_matrix[i] = model_word[word]

#### Now the data is ready to be trained
- x is in the shape of (split_ratio*dataset_length, max_description_length + 1)
- y is in the shape of (split_ratio*dataset_length,)

In [23]:
# Report the shape of each split: 2-D padded sequences for x, 1-D labels for y.
for template, array in (
    ('x_text_train\t(%d, %d)', x_text_train),
    ('y_text_train\t(%d,)', y_text_train),
    ('x_text_test\t(%d, %d)', x_text_test),
    ('y_text_test\t(%d,)', y_text_test),
):
    print(template % array.shape)

x_text_train	(1684, 276)
y_text_train	(1684,)
x_text_test	(421, 276)
y_text_test	(421,)


- Create a text classification model consisting of an embedding layer, a bidirectional LSTM, an 8-unit Dense layer, and a 1-unit output layer.
- The embedding layer is pre-trained, given the embedding matrix as an initializer. This layer is not trainable for this model.
- The last activation function is sigmoid to output a probability of being spam.

In [24]:
# Frozen embedding layer: its weights start as the pre-built embedding matrix
# and are not updated during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embed_dim,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=token_max_len,
    trainable=False,
)

# Embedding -> bidirectional LSTM (32 units per direction) -> 8-unit ReLU
# layer -> single sigmoid unit that outputs the spam probability.
model_text = Sequential([
    embedding_layer,
    Bidirectional(LSTM(32)),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model_text.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 276, 300)          1500600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 520       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 1,586,377
Trainable params: 85,777
Non-trainable params: 1,500,600
_________________________________________________________________


- Train the model.
- After each epoch, save the model to a file if its validation accuracy has improved.
- If it works as expected, the accuracy of validation data should be greater than 0.80.
- Training runs until the validation accuracy has not improved for 20 epochs (or the maximum of 100 epochs is reached).

In [None]:
# Timestamped file name so that each training run writes its own checkpoint.
model_text_file = f'model_text_{datetime.now().strftime("%Y%m%d%H%M%S")}.h5'

model_text.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Keep only the best full model (by validation accuracy) on disk.
checkpoint = ModelCheckpoint(
    model_text_file,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    period=1,
)
# Stop once validation accuracy has failed to improve for 20 epochs.
early = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=20,
    verbose=1,
    mode='auto',
)

# NOTE(review): the validation data here is the same split that the evaluation
# cell scores afterwards, so checkpoint selection and early stopping have
# already seen it -- the reported metrics are likely optimistic.
model_text.fit(
    x_text_train,
    y_text_train,
    batch_size=64,
    epochs=100,
    validation_data=(x_text_test, y_text_test),
    shuffle=True,
    callbacks=[checkpoint, early],
)


- The test set that was split off earlier is used to evaluate the model.
- A higher recall for label 1 means that a larger share of the real spam is detected (of all real spam, how much the model finds).
- A higher precision for label 1 means that a larger share of the predicted spam is really spam (of everything flagged as spam, how much is actually spam).
- Both two values are important for spam detection. The model has to detect the right spams and detect as many spams as possible at the same time.
- The F1 score is the harmonic mean of those two values. It is a better single measure than a recall or a precision that is high on its own.

In [None]:
# Reload the best checkpoint written during training rather than using the
# weights from the final epoch.
model_text = load_model(model_text_file)

# One spam probability per test row (single output column); label a row as
# spam when its probability is above 0.5.
y_text_pred = model_text.predict(x_text_test)
y_text_pred_bool = y_text_pred[:, 0] > 0.5

print(classification_report(y_text_test, y_text_pred_bool))