# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import nltk
import string
import pickle
import tensorflow as tf

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, LSTM, Dropout, Bidirectional, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Load Data

In [None]:
# Load the spam dataset from disk (decoded as latin-1) and preview it.
df = pd.read_csv(
    "/mnt/hdd/Datasets/spam.csv",
    encoding="latin-1",
)
df.head()

In [None]:
# Remove the three unnamed columns so only the remaining columns are kept.
# Reassigning instead of using inplace=True, together with errors="ignore",
# makes this cell safe to re-run: executing it a second time no longer raises
# KeyError because the columns have already been dropped.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [None]:
# Pie chart of how the rows are distributed over the labels in "v1",
# annotated with one-decimal percentages.
df["v1"].value_counts().plot.pie(autopct="%.1f%%", startangle=90)

In [None]:
# Bar chart of the number of rows per label in "v1".
# The previous call passed a bare `label` after a keyword argument, which is
# a SyntaxError (positional argument follows keyword argument) and also
# referred to a name that is never defined; the stray argument is removed.
df["v1"].value_counts().plot(kind="bar")

In [None]:
def word_freq(CATEGORY, TEXTS):
    """Plot the 15 most frequent tokens of one class and return the top 50.

    Rows of the module-level ``df`` whose ``"v1"`` value equals *CATEGORY*
    are selected; the text in column *TEXTS* is lower-cased, tokenized with
    ``word_tokenize``, stripped of punctuation-only tokens and counted.

    Args:
        CATEGORY: Value of ``df["v1"]`` selecting the rows to analyse
            (the notebook calls this with ``"ham"`` and ``"spam"``).
        TEXTS: Name of the ``df`` column that holds the message text.

    Returns:
        A list of up to 50 ``(token, count)`` tuples, most frequent first.
    """
    texts = df.loc[df["v1"] == CATEGORY, TEXTS]

    # A token is dropped when *every* character is punctuation. The previous
    # check (`token not in <one long string>`) was a substring test, so
    # multi-character punctuation tokens that do not happen to appear
    # verbatim in string.punctuation (for example '' or ``) were counted as
    # words.
    punctuation = set(string.punctuation)
    frequencies = Counter()
    for text in texts:
        frequencies.update(
            token
            for token in word_tokenize(text.lower())
            if not all(char in punctuation for char in token)
        )

    freq_top = frequencies.most_common(50)

    # Only the first 15 entries are plotted; the full top 50 is returned.
    top_15 = freq_top[:15]
    words = [word for word, _ in top_15]
    counts = [count for _, count in top_15]

    plt.figure()
    plt.bar(words, counts)
    plt.title(f"TOP 15 WORDS IN {CATEGORY}")
    plt.ylabel("Frequency")
    plt.xlabel("Words")
    plt.show()

    return freq_top

In [None]:
# Token frequencies of the raw ("v2") ham messages.
ham_top = word_freq(CATEGORY="ham", TEXTS="v2")

In [None]:
# Token frequencies of the raw ("v2") spam messages.
spam_top = word_freq(CATEGORY="spam", TEXTS="v2")

In [None]:
def print_wordcloud(dict_top):
    """Draw a word cloud of the given word frequencies on the current axes.

    Args:
        dict_top: Word/frequency pairs in any form ``dict()`` accepts; the
            notebook passes the ``(word, count)`` list returned by
            ``word_freq``.

    The figure is not shown here, so the caller can add a title and place
    several clouds in subplots before calling ``plt.show()``.
    """
    frequencies = dict(dict_top)

    cloud = WordCloud(
        width=350,
        height=350,
        background_color="white",
    )
    cloud.generate_from_frequencies(frequencies)

    # Axis ticks are intentionally left visible (no plt.axis("off")).
    plt.imshow(cloud)
    plt.tight_layout(pad=0)

In [None]:
# Ham and spam word clouds side by side (raw text).
for panel, (top_words, panel_title) in enumerate(
    [
        (ham_top, "TOP 50 Words - Ham"),
        (spam_top, "TOP 50 Words - Spam"),
    ],
    start=1,
):
    plt.subplot(1, 2, panel)
    print_wordcloud(top_words)
    plt.title(panel_title)
plt.show()

# Preprocess

In [None]:
# English stop words from NLTK, kept in a set because clean() tests every
# word of every message for membership.
stop_words = set(stopwords.words("english"))

In [None]:
def clean(text):
    """Normalize one raw message into a space-separated string of lemmas.

    Steps, in order: lower-case, replace punctuation with spaces, delete
    digit runs, drop words found in the module-level ``stop_words`` set and
    lemmatize what is left with ``WordNetLemmatizer``.

    Args:
        text: The raw message string.

    Returns:
        The cleaned message; an empty string if nothing survives.
    """
    text = text.lower()
    # Punctuation becomes a space rather than being deleted. Deleting it
    # glued neighbouring words together ("win!call" -> "wincall") and turned
    # contractions into tokens the stop list cannot match ("don't" ->
    # "dont"). Splitting them instead yields fragments such as "don" and "t",
    # which NLTK's English stop-word list contains.
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\d+", "", text)

    lemmatizer = WordNetLemmatizer()
    # Single pass: filter stop words, then lemmatize the survivors. Joining
    # the split words already yields a string without leading or trailing
    # whitespace, so no extra strip() is needed.
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return " ".join(words)

In [None]:
# Store the cleaned version of every message in a new column "v3".
df["v3"] = df["v2"].map(clean)

In [None]:
# Preview the frame, now including the cleaned-text column "v3".
df.head()

In [None]:
# Token frequencies of the cleaned ("v3") ham messages.
pre_ham_top = word_freq(CATEGORY="ham", TEXTS="v3")

In [None]:
# Token frequencies of the cleaned ("v3") spam messages.
pre_spam_top = word_freq(CATEGORY="spam", TEXTS="v3")

In [None]:
# Ham and spam word clouds side by side (after preprocessing).
for panel, (top_words, panel_title) in enumerate(
    [
        (pre_ham_top, "TOP 50 Words after preprocess - Ham"),
        (pre_spam_top, "TOP 50 Words after preprocess - Spam"),
    ],
    start=1,
):
    plt.subplot(1, 2, panel)
    print_wordcloud(top_words)
    plt.title(panel_title)
plt.show()

In [None]:
def label_encoder(column):
    """Encode a label column as integers and print the class order.

    Args:
        column: A pandas Series of labels; its ``name`` is printed next to
            the classes found by the encoder.

    Returns:
        The integer codes produced by ``LabelEncoder.transform`` for
        *column*.
    """
    encoder = LabelEncoder()
    encoder.fit(column)
    # Show which label maps to which integer (the position in classes_).
    print(column.name, encoder.classes_)
    encoded = encoder.transform(column)
    return encoded

In [None]:
# Add the integer-encoded label as column "v4"; label_encoder prints the
# class order used for the encoding.
df["v4"] = label_encoder(df["v1"])

# Model

In [None]:
# Features are the cleaned texts, the target is the integer-encoded label.
X, y = df["v3"], df["v4"]

In [None]:
# Hold out 20% of the messages for testing, with a fixed seed so the split is
# reproducible. stratify=y keeps the class ratio identical in both parts; the
# notebook oversamples the training data later because the classes are
# imbalanced, and a purely random split of imbalanced data can leave the
# test set with a different share of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Length, in words, of the longest training message; used as the padded
# sequence length. This used to be len(text), i.e. the number of
# *characters*, while pad_sequences and the Embedding layer count tokens, so
# every sequence was padded several times longer than any real message and
# the LSTMs spent most of their steps on padding zeros.
maxlen = max(len(text.split()) for text in X_train)

In [None]:
# Display the padding length computed above.
maxlen

In [None]:
# Build the word -> integer vocabulary from the training texts only, so the
# test split does not contribute to the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Persist the fitted tokenizer to tokenizer.pkl in the working directory.
# NOTE(review): pickle files should only ever be loaded from a trusted source.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Convert each split to integer sequences and pad them to the common length.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

In [None]:
# Size of the embedding vocabulary: one entry per word known to the
# tokenizer plus one extra slot for index 0, the value pad_sequences pads with.
input_dim = len(tokenizer.word_index) + 1
input_dim

In [None]:
# SMOTE oversampler with default settings; no random_state is passed, so the
# generated samples are not reproducible between runs.
smote = SMOTE()

In [None]:
# Balance the training classes by duplicating randomly chosen minority rows.
# SMOTE is deliberately not used here: it synthesizes rows by interpolating
# between neighbouring samples, but X_train holds padded token ids, and a
# value "between" two ids is simply the id of some unrelated word, so the
# synthetic rows would be noise rather than plausible messages.
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import shuffle

X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)

# Shuffle so the rows added by resampling are not concentrated at the end of
# the arrays, which is the slice model.fit(validation_split=...) holds out.
# NOTE(review): validation rows can still be duplicates of training rows, so
# treat validation metrics as optimistic; the test split is unaffected.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [None]:
# Binary classifier: embedding -> three stacked LSTMs -> sigmoid output.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per time step; the last one returns only its final state.
model = Sequential(
    [
        Embedding(input_dim=input_dim, output_dim=100, input_length=maxlen),
        LSTM(units=64, return_sequences=True),
        LSTM(units=32, return_sequences=True),
        LSTM(units=16),
        Dense(units=1, activation="sigmoid"),
    ]
)

In [None]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Stop training once val_loss has failed to improve for 5 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss when that happens; without it the model keeps (and the notebook
# later saves) the weights from the final, worse epoch.
callbacks = [
    EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True,
    )
]

In [None]:
# Train for at most 10 epochs, holding out 20% of the training data for
# validation; the callbacks may end training earlier.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks,
)

In [None]:
# Save the trained model to spam.h5 in the working directory.
model.save("spam.h5")

In [None]:
# Training vs. validation accuracy per epoch.
plt.figure()
plt.plot(history.history["accuracy"], label="train")
plt.plot(history.history["val_accuracy"], label="valid")
plt.title("Model Accuracy")
plt.xlabel("Epochs")
# Keras reports accuracy as a fraction between 0 and 1, so the axis label
# no longer claims the values are percentages.
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Training vs. validation loss per epoch.
plt.figure()
plt.plot(history.history["loss"], label="train")
plt.plot(history.history["val_loss"], label="valid")
plt.title("Model Loss")
plt.xlabel("Epochs")
# The loss is a binary cross-entropy value, not a percentage, so the "%"
# was removed from the axis label.
plt.ylabel("Loss")
plt.legend()
plt.show()

# Predict

In [None]:
# Predict probabilities for the test set and round them to hard 0/1 labels.
y_pred = np.round(model.predict(X_test))

In [None]:
# Share of test messages whose rounded prediction equals the true label.
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the test predictions, shown with both absolute counts
# and normalized values.
class_names = ["ham", "spam"]
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(
    conf_mat=cm,
    class_names=class_names,
    show_absolute=True,
    show_normed=True,
    colorbar=True,
)
plt.title("Model Confusion Matrix")
plt.show()

In [None]:
# Classify one hand-written message with the same pipeline used for
# training: clean -> token ids -> pad -> predict -> round.
content = "Free entry in 2 a wkly comp to win FA Cup final"
context_clean = clean(content)
test = pad_sequences(tokenizer.texts_to_sequences([context_clean]), maxlen=maxlen)
# predict returns one row with one probability; round it and unwrap it.
is_spam = np.round(model.predict(test))[0][0]
print(is_spam)