<h1><b>Importing the necessary Libraries</b></h1>

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import spacy
import nltk
import re,string,unicodedata
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup
from nltk.corpus import stopwords   
import numpy as np
import pandas as pd

<h1><b>Dataset Exploration</b></h1>

In [2]:
data=pd.read_csv("spam.csv",encoding="ISO-8859-1")

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
plus_col = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']

In [6]:
new_data = data.drop(plus_col,axis =1)

In [15]:
new_data = new_data.rename(columns = {'v1':'target', 'v2':'email'})

In [17]:
new_data.duplicated().sum()

0

In [18]:
new_data = new_data.drop_duplicates(keep = 'first')

In [19]:
new_data.isnull().any()

target    False
email     False
dtype: bool

In [16]:
new_data.head()

Unnamed: 0,target,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
new_data.describe()

Unnamed: 0,target,email
count,5169,5169
unique,2,5169
top,ham,"Go until jurong point, crazy.. Available only ..."
freq,4516,1


<h1><b>NLP Tools Application on the text Field</b></h1>

<h3><b>Summary of The Steps Applied here</b></h3>
<p>in this section i tried to prepare the text in the v2 field in a way that it would be in a much better shape while using it for creating the model</p>
<p>the steps are very simple </p>
<ul>
<li>first removing all the html tags and special noisy characters from the text since they don't provide any useful informations for our model</li>
<li>second stemming the text which means removing the prefixes and suffixes that are added to words (like 's' in the plural form and the 's' in verbs conjugated with "he she and it", this transformation is very useful and important since we're going to apply the vectorization on this text and by applying this step we will avoid replication of words like instead of having 'like' and 'likes' we will just have 'like' in the set of features </li>
<li>third removing stopwords and and punctuations since the first one mentioned don't provide any useful informations and about the punctuation it is also recommended to remove them </li>
</ul>


In [21]:
#Removing the noisy text
def noiseremoval_text(text):
  soup = BeautifulSoup(text, "html.parser")
  text = soup.get_text()
  text = re.sub('\[[^]]*\]', '', text)
  return text

In [22]:
new_data['email']=new_data['email'].apply(noiseremoval_text)

  soup = BeautifulSoup(text, "html.parser")


In [23]:
new_data.head()

Unnamed: 0,target,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
#Stemming the text
def stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
    
new_data['email']=new_data['email'].apply(stemmer)

new_data.head()

Unnamed: 0,target,email
0,ham,"go until jurong point, crazy.. avail onli in b..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri in 2 a wkli comp to win fa cup fina...
3,ham,u dun say so earli hor... u c alreadi then say...
4,ham,"nah i don't think he goe to usf, he live aroun..."


In [25]:
stop_wr=set(stopwords.words('english'))

def remove_punc_stopword(text):

    remove_punc = [word for word in text.lower() if word not in string.punctuation]
    remove_punc = ''.join(remove_punc)
    return [word for word in remove_punc.split() if word not in stop_wr]

In [26]:
cleaned_data = new_data.copy()
cleaned_data['email'] = new_data['email'].apply(remove_punc_stopword)
cleaned_data.head()

Unnamed: 0,target,email
0,ham,"[go, jurong, point, crazy, avail, onli, bugi, ..."
1,ham,"[ok, lar, joke, wif, u, oni]"
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin..."
3,ham,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"[nah, dont, think, goe, usf, live, around, tho..."


In [27]:
cleaned_data['original_text'] = cleaned_data['email'].apply(lambda words: ' '.join(words))

y = cleaned_data['target']
x = cleaned_data['original_text']

print(cleaned_data['original_text'][0:5])

0    go jurong point crazy avail onli bugi n great ...
1                                ok lar joke wif u oni
2    free entri 2 wkli comp win fa cup final tkt 21...
3                  u dun say earli hor u c alreadi say
4            nah dont think goe usf live around though
Name: original_text, dtype: object


<h1><b>Transforming the dataset into a Sparse Matrix</b></h1>

<p>here i transformed the text found in the email field into a bag of words which means i will create a matrix that for every new distinct word found in the field of email it will count the number of its occurences across all the records but for testing i kept only 200 words or 'features' that are most frequent using the max_features parameter with  the countvectorizer object</p>

In [28]:
#Count vectorizer for bag of words

cv = CountVectorizer(ngram_range=(1,3),max_features= 200)
x_cv = cv.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(x_cv,y, test_size=0.2, random_state = 1)
print("Shape of the sparse matrix: ", x_cv.shape)

Shape of the sparse matrix:  (5169, 200)


In [29]:
df_vectorized = pd.DataFrame(x_cv.toarray(), columns=cv.get_feature_names_out())
df_vectorized

Unnamed: 0,alreadi,already,also,alway,ani,anyth,around,ask,award,babe,...,word,work,would,ya,ye,yeah,year,yet,yup,ìï
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<h1><b>Model Creation and Evaluation</b></h1>

<h5><b>here i created two different models that are largely used in the field of NLP which are Logistic Regression and Naive Bayes</b></h5>

In [30]:
mnb = MultinomialNB()

mnb.fit(x_train,y_train)
predmnb = mnb.predict(x_test)

print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))

Confusion Matrix for Multinomial Naive Bayes:
[[881  18]
 [ 31 104]]
Score: 95.26
Classification Report:               precision    recall  f1-score   support

         ham       0.97      0.98      0.97       899
        spam       0.85      0.77      0.81       135

    accuracy                           0.95      1034
   macro avg       0.91      0.88      0.89      1034
weighted avg       0.95      0.95      0.95      1034



In [31]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, y_train)

predclf = clf.predict(x_test)

print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predclf))
print("Score:",round(accuracy_score(y_test,predclf)*100,2))
print("Classification Report:",classification_report(y_test,predclf))

Confusion Matrix for Multinomial Naive Bayes:
[[893   6]
 [ 37  98]]
Score: 95.84
Classification Report:               precision    recall  f1-score   support

         ham       0.96      0.99      0.98       899
        spam       0.94      0.73      0.82       135

    accuracy                           0.96      1034
   macro avg       0.95      0.86      0.90      1034
weighted avg       0.96      0.96      0.96      1034

