In [None]:
# Importing ML Libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [None]:
# Importing req libraries
import keras
import nltk
import re

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Embedding,GlobalAveragePooling1D,Flatten,LSTM
from keras.models import Sequential

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Importing Data
# The file is tab-separated and is read without a header row, so the columns
# are addressed by position (df[0], df[1]) in the cells that follow.
df = pd.read_csv(
    'Data.txt',
    header=None,
    sep='\t',
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Data Cleaning
# Every entry of column 1 is reduced to lowercase alphabetic tokens, English
# stop words are removed, and the surviving tokens are Porter-stemmed and
# re-joined with single spaces.

X = df[1].values
eng_word = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()

corpus = []
for raw_text in X:
    # Any character that is not an ASCII letter becomes a space, so digits and
    # punctuation act as token separators.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    stems = [porter_stemmer.stem(token) for token in tokens if token not in eng_word]
    corpus.append(' '.join(stems))

In [None]:
# Bag of Words and Naive Bayes
# Split the cleaned documents BEFORE vectorizing so the vocabulary is learned
# from the training portion only; fitting the vectorizer on the full corpus
# lets information from the held-out documents leak into preprocessing.
docs_train, docs_test, ytrain, ytest = train_test_split(
    corpus, df[0], test_size=0.3, random_state=0
)

cv = CountVectorizer()
# The document-term matrices stay sparse: MultinomialNB accepts sparse input,
# and calling .toarray() would allocate a dense (documents x vocabulary) array.
xtrain = cv.fit_transform(docs_train)
xtest = cv.transform(docs_test)  # words unseen in training are simply ignored

nb = MultinomialNB()
nb = nb.fit(xtrain, ytrain)
pred_nb = nb.predict(xtest)

print(accuracy_score(ytest, pred_nb))
print('\n')
print(confusion_matrix(ytest, pred_nb))

In [None]:
# Tf-Idf and Naive Bayes
# Split the cleaned documents BEFORE vectorizing so both the vocabulary and the
# IDF weights are estimated from the training portion only; fitting on the full
# corpus lets statistics of the held-out documents leak into preprocessing.
docs_train, docs_test, ytrain, ytest = train_test_split(
    corpus, df[0], test_size=0.3, random_state=0
)

tf_idf = TfidfVectorizer()
# The matrices stay sparse: MultinomialNB accepts sparse input, and calling
# .toarray() would allocate a dense (documents x vocabulary) array.
xtrain = tf_idf.fit_transform(docs_train)
xtest = tf_idf.transform(docs_test)  # reuses the training vocabulary and IDF

nb = MultinomialNB()
nb = nb.fit(xtrain, ytrain)
pred_nb = nb.predict(xtest)

print(accuracy_score(ytest, pred_nb))
print('\n')
print(confusion_matrix(ytest, pred_nb))

In [None]:
# Word-Embedding
# Turn each cleaned document into a sequence of integer word ids and pad the
# sequences to a common length, then split them alongside 0/1 labels.

tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

sequence = tokenizer.texts_to_sequences(corpus)
# No maxlen is given, so the padded width is decided by pad_sequences itself.
padded_seq = pad_sequences(sequence)

# Column 0 holds the string labels; map them to integers for the network.
labels = df[0].map({'ham':0,'spam':1})
xtrain, xtest, ytrain, ytest = train_test_split(
    padded_seq,
    labels,
    test_size=0.3,
    random_state=0,
)

In [None]:
# LSTM apply
# Derive the embedding dimensions from the fitted tokenizer and the padded data
# instead of hard-coding them. A fixed vocabulary size smaller than the real
# one would feed out-of-range ids to the Embedding layer, and a fixed sequence
# length would disagree with the width that pad_sequences actually produced.
vocab_size = len(word_index) + 1   # ids run from 1..len(word_index); 0 is the padding id
embedding_dim = 64
max_length = padded_seq.shape[1]   # padded sequence width

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    # NOTE(review): activation='relu' is kept from the original; the LSTM
    # default is tanh - confirm that relu is intentional.
    LSTM(units=70, activation='relu'),
    # Single sigmoid unit: binary classification output.
    Dense(units=1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Display the summary of the compiled model.
model.summary()

In [None]:
# Train for 15 epochs in mini-batches of 32; the returned object's `.history`
# dict is read by the plotting cell.
history = model.fit(
    x=xtrain,
    y=ytrain,
    epochs=15,
    batch_size=32,
)

In [None]:
# Training-accuracy curve, one point per epoch, taken from the fit history.
# `plt` comes from the notebook's import cell (matplotlib.pyplot).
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history['accuracy'])
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='LSTM training accuracy', xlabel='Epoch', ylabel='Accuracy')
plt.show()

In [None]:
# Threshold the model outputs at 0.5 to obtain boolean class predictions.
pred_ann = model.predict(xtest) > 0.5

In [None]:
# Final results
# Accuracy first, then two blank lines, then the confusion matrix.
print(accuracy_score(ytest, pred_ann), end='\n\n\n')
print(confusion_matrix(ytest, pred_ann))