In [1]:
from matplotlib import test
import numpy as np
import pandas as pd
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, LSTM, Embedding, Dropout
from sklearn.metrics import confusion_matrix,f1_score, precision_score,recall_score
import seaborn as sns
import matplotlib.pyplot as plt

import re
import string

In [2]:
def getData(file):
    # đọc file csv
    f = pd.read_csv(file)
    # check duplicates and remove
    f.drop_duplicates(inplace=True)
    return f

In [15]:
#Loai bo link
def remove_hyperlink(word):
    return re.sub(r"http\S+", "", word)

#Chuyen ve chu viet thuong
def to_lower(word):
    result = word.lower()
    return result

#Loai bo cac chu so
def remove_number(word):
    result = re.sub(r'\d+','', word)
    return result
    
#Loai bo dau cham cau
def remove_punctuation(word):
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result

#Loai bo khoang trang hai ben van ban
def remove_whitespace(word):
    result = word.strip()
    return result

#Loai bo dau xuong dong
def replace_newline(word):
    return word.replace('\n','')

#Tong hop cac ham lai de lam sach du lieu
def clean_up_pipeline(sentence):
    cleaning_utils = [remove_hyperlink,
                      replace_newline,
                      to_lower,
                      remove_number,
                      remove_punctuation,
                      remove_whitespace]
    for o in cleaning_utils:
        sentence = o(sentence)
    return sentence

In [17]:
def tokenizing(x_train, x_test, max_len=20):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_train_features = np.array(tokenizer.texts_to_sequences(x_train))
    x_test_features = np.array(tokenizer.texts_to_sequences(x_test))

    #  padding
    x_train_features = pad_sequences(x_train_features, maxlen=max_len)
    x_test_features = pad_sequences(x_test_features, maxlen=max_len)
    return x_train_features, x_test_features

In [18]:
def label_target(y_train, y_test):
    # Chuyển về array
    le = LabelEncoder()
    train_y = le.fit_transform(y_train.values)
    test_y = le.transform(y_test.values)
    return train_y, test_y

In [19]:
def LMTS(input_length, input_dim, x_train, x_test, y_train, y_test):
    lstm_model = Sequential()
    #Creating an embedding layer to vectorize
    lstm_model.add(Embedding(input_dim=input_dim+1, output_dim=20, input_length=input_length))
    #Addding LSTM
    lstm_model.add(LSTM(64))
    # Relu allows converging quickly and allows backpropagation
    lstm_model.add(Dense(16, activation='relu'))
    #Deep Learninng models can be overfit easily, to avoid this, we add randomization using drop out
    lstm_model.add(Dropout(0.1))
    # Adding sigmoid activation function to normalize the output
    lstm_model.add(Dense(1, activation='sigmoid'))

    lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    lstm_model.summary()
    history = lstm_model.fit(x_train, y_train, epochs=50, batch_size=512, 
                        validation_data=(x_test, y_test))
    y_predict = [1 if o>0.5 else 0 for o in lstm_model.predict(x_test)]
    return history, y_predict

In [20]:
def evaluating(test_y, y_predict):
    cf_matrix =confusion_matrix(test_y,y_predict)
    print("Precision: {:.2f}%".format(100 * precision_score(test_y, y_predict)))
    print("Recall: {:.2f}%".format(100 * recall_score(test_y, y_predict)))
    print("F1 Score: {:.2f}%".format(100 * f1_score(test_y,y_predict)))
    ax= plt.subplot()
    #annot=True to annotate cells
    sns.heatmap(cf_matrix, annot=True, ax = ax,cmap='Blues',fmt='')
    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['Not Spam', 'Spam']); ax.yaxis.set_ticklabels(['Not Spam', 'Spam'])
    plt.show()

In [None]:
if __name__ == "__main__":
    df = getData("./spam.csv")
    emails_train, emails_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2)

    x_train = [clean_up_pipeline(o) for o in emails_train]
    x_test = [clean_up_pipeline(o) for o in emails_test]

    train_x, test_x = tokenizing(x_train, x_test)
    y_train, y_test = label_target(y_train, y_test)
    lmts, y_predict = LMTS(20,7982, train_x, test_x, y_train, y_test)
    evaluating(y_test, y_predict)

  x_train_features = np.array(tokenizer.texts_to_sequences(x_train))
  x_test_features = np.array(tokenizer.texts_to_sequences(x_test))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 20)            159660    
                                                                 
 lstm (LSTM)                 (None, 64)                21760     
                                                                 
 dense (Dense)               (None, 16)                1040      
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 182,477
Trainable params: 182,477
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 