In [0]:
#@title 1. Understanding Data

In [0]:
import pandas as pd 

In [0]:
#Read the dataset into dataframe
#from google.colab import drive
#drive.mount('/content/drive')
df = pd.read_csv('/content/drive/My Drive/Spam_Detection/spamEmails.csv')

In [0]:
df.head()   #Read first 5 occurances

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
df.columns #understanding the data

Index(['Label', 'EmailText'], dtype='object')

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from textblob import TextBlob
from textblob import Word

In [0]:
# applying preprocessing steps e.g lowercase, stemming, lemmatization
df['EmailText'] = df['EmailText'].apply(lambda x:' '.join(x.lower() for x in x.split()))

In [0]:
df['EmailText'].head()

0    go until jurong point, crazy.. available only ...
1                        ok lar... joking wif u oni...
2    free entry in 2 a wkly comp to win fa cup fina...
3    u dun say so early hor... u c already then say...
4    nah i don't think he goes to usf, he lives aro...
Name: EmailText, dtype: object

In [0]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# create English stop words dictionary
stop_words = stopwords.words('english')

In [0]:
# remove stopwords from 'email' feature 
df['EmailText'] = df['EmailText'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))

In [0]:
df['EmailText'].head()

0    go jurong point, crazy.. available bugis n gre...
1                        ok lar... joking wif u oni...
2    free entry 2 wkly comp win fa cup final tkts 2...
3            u dun say early hor... u c already say...
4              nah think goes usf, lives around though
Name: EmailText, dtype: object

In [0]:
# next, normalize each sentence using PorterStemmer algorithm 
# (1) create PorterStemmer object
st = PorterStemmer()

In [0]:
# (2) apply stemming on each email sentence
df['EmailText'] = df['EmailText'].apply(lambda x : ' '.join(st.stem(word) for word in x.split()))

In [0]:
df['EmailText'].head()

0    go jurong point, crazy.. avail bugi n great wo...
1                          ok lar... joke wif u oni...
2    free entri 2 wkli comp win fa cup final tkt 21...
3            u dun say earli hor... u c alreadi say...
4                nah think goe usf, live around though
Name: EmailText, dtype: object

In [0]:
# apply lemmatization
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
df['EmailText'] = df['EmailText'].apply(lambda x: ' '.join(Word(word).lemmatize() for word in x.split()))

In [0]:
# check fot the last change
df.head()

Unnamed: 0,Label,EmailText
0,ham,"go jurong point, crazy.. avail bugi n great wo..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor... u c alreadi say...
4,ham,"nah think goe usf, live around though"


In [0]:
#@title 3. Feature Classification


In [0]:
import sklearn.feature_extraction.text as text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, naive_bayes, metrics, svm
import sklearn.linear_model as lm

In [0]:
# next, split dataset into training and validation using sklearn
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df['EmailText'], df['Label'])   #x - emails and Y - Labels (spam/ham)

In [0]:
# TF-IDF feature generation for a maximum of 5000 features
encoder = preprocessing.LabelEncoder()      #Label Encoding refers to converting the labels into numeric form so as to convert it into the machine-readable form.
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)      

In [0]:
train_y

array([1, 1, 0, ..., 0, 0, 0])

In [0]:
valid_y

array([0, 0, 0, ..., 0, 1, 0])

In [0]:
# create TF-IDF object which takes 5000 features
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

In [0]:
tfidf_vect.fit(df['EmailText'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=True, vocabulary=None)

In [0]:
xtrain_tfidf = tfidf_vect.transform(train_x)

In [0]:
xtrain_tfidf

<4179x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 39294 stored elements in Compressed Sparse Row format>

In [0]:
xvalid_tfidf = tfidf_vect.transform(valid_x)

In [0]:
xvalid_tfidf

<1393x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 12458 stored elements in Compressed Sparse Row format>

In [0]:
xtrain_tfidf.data

array([0.23142803, 0.20074799, 0.16990768, ..., 0.42231885, 0.2763225 ,
       0.31066976])

In [0]:
xvalid_tfidf.data

array([0.16142324, 0.20565077, 0.29880906, ..., 0.25733305, 0.33809443,
       1.        ])

In [0]:
#@title 4. Model Training and Evaluation


In [0]:
# define a function for training any given model
def train_model(classifier, feature_vector_train, label):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    return classifier

 **- Naive Bayes Classifier**






In [0]:
nb_model = train_model(naive_bayes.MultinomialNB(alpha=0.2), xtrain_tfidf, train_y)

In [0]:
# Naive Bayes accuracy score
predictions = nb_model.predict(xvalid_tfidf)
nb_accuracy = metrics.accuracy_score(predictions, valid_y)
print("Accuracy: ", nb_accuracy)

Accuracy:  0.9870782483847811


**- Linear Classifier**

In [0]:
# Linear Classifier on Word Level TF IDF Vectors
linear_model = train_model(lm.LogisticRegression(), xtrain_tfidf, train_y)



In [0]:
# Linear model accuracy score
predictions = linear_model.predict(xvalid_tfidf)
linear_model_accuracy = metrics.accuracy_score(predictions, valid_y)
print("Accuracy: ", linear_model_accuracy)

Accuracy:  0.9641062455132807
