TASK-3
EMAIL SPAM DETECTION WITH MACHINE LEARNING

GITHUB REPO LINK: https://github.com/zoyatabassum/OIBSIP

In [46]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
warnings.filterwarnings("ignore")

In [47]:
#READING OF THE DATASET
data = pd.read_csv('spam.csv', encoding='latin-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [48]:
#CHECKING FOR THE NULL VALUES IN THE DATASET
data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [49]:
#DESCRIBING OF THE DATASET
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [50]:
#DISPLAYING THE FIRST ROWS OF DATASET
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [51]:
#KEEPING ONLY NECESSARY COLUMNS
data = data[['v1', 'v2']] 
#RENAMING OF THE COLUMNS
data.columns = ['label', 'message'] 

In [52]:
data.columns

Index(['label', 'message'], dtype='object')

In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [54]:
#CONVERTING LABELS
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [55]:
#DATA CLEANING
data['message'] = data['message'].str.lower().str.replace(r'[^\w\s]', '')


In [56]:
#SPLITING OF DATSET
X_train, X_test, y_train, y_test = train_test_split(data['message'],data['label'], test_size=0.2)

In [57]:
#VECTORIZING THE TEXT
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)

In [58]:
#TRANSFORMING THE COUNTS INTO A TF-IDF REPRESENTATION
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [59]:
#TRAIN THE NAIVE BAYES CLASSIFIER
model = MultinomialNB().fit(X_train_tfidf, y_train)

In [60]:
#TRANSFORM THE TEST DATA AND PREDICT
X_test_counts = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [61]:
#PREDICTING THE LABELS
y_pred = model.predict(X_test_tfidf)

In [62]:
#EVALUATING THE MODELS
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.9426008968609866
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       947
           1       1.00      0.62      0.76       168

    accuracy                           0.94      1115
   macro avg       0.97      0.81      0.87      1115
weighted avg       0.95      0.94      0.94      1115



In [63]:
#PREDICTING WHETHER THE NEW EMAIL IS SPAM OR NOT
new_email=["THANKYOU OASIS FOR THIS OPPORTUNITY"]
new_email_vectorized=count_vectorizer.transform(new_email)
predicted_label=model.predict(new_email_vectorized)
if predicted_label[0]==0:
    print("predicted as not spam")
else:
    print("predicted as spam")

predicted as not spam
