In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('hamspam.tsv',sep='\t',names=['Output','Message'])

In [3]:
dataset.head()

Unnamed: 0,Output,Message
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 2 columns):
Output     5568 non-null object
Message    5568 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [5]:
dataset.describe()

Unnamed: 0,Output,Message
count,5568,5568
unique,2,5165
top,ham,"Sorry, I'll call later"
freq,4822,30


In [6]:
dataset['Length'] = dataset['Message'].apply(len)

In [7]:
dataset.head()

Unnamed: 0,Output,Message,Length
0,ham,I've been searching for the right words to tha...,196
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,ham,"Nah I don't think he goes to usf, he lives aro...",61
3,ham,Even my brother is not like to speak with me. ...,77
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,35


In [8]:
dataset.groupby('Output').count()

Unnamed: 0_level_0,Message,Length
Output,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4822,4822
spam,746,746


In [9]:
dataset['Length'].describe()

count    5568.000000
mean       80.487428
std        59.950961
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64

In [10]:
## Data Processing

In [11]:
y = dataset['Output'].values

In [12]:
y

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [13]:
## Convert value of Ham =1 & Spam =0

In [14]:
dataset.loc[dataset['Output']=="ham","Output"] = 1

In [15]:
dataset.loc[dataset['Output']=="spam","Output"] = 0

In [16]:
dataset.head()

Unnamed: 0,Output,Message,Length
0,1,I've been searching for the right words to tha...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,"Nah I don't think he goes to usf, he lives aro...",61
3,1,Even my brother is not like to speak with me. ...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL!!,35


In [17]:
##Process On Messages

In [18]:
def cleanMessage(message):
    nonPunc = [char for char in message if char not in string.punctuation]
    nonPunc = "".join(nonPunc)
    return nonPunc

In [19]:
dataset['Message'] = dataset['Message'].apply(cleanMessage)

In [20]:
dataset.head()

Unnamed: 0,Output,Message,Length
0,1,Ive been searching for the right words to than...,196
1,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
2,1,Nah I dont think he goes to usf he lives aroun...,61
3,1,Even my brother is not like to speak with me T...,77
4,1,I HAVE A DATE ON SUNDAY WITH WILL,35


In [21]:
CV = CountVectorizer(stop_words="english")

In [22]:
X = dataset['Message'].values
X

array(['Ive been searching for the right words to thank you for this breather I promise i wont take your help for granted and will fulfil my promise You have been wonderful and a blessing at all times',
       'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
       'Nah I dont think he goes to usf he lives around here though', ...,
       'Pity  was in mood for that Soany other suggestions',
       'The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free',
       'Rofl Its true to its name'], dtype=object)

In [23]:
y = dataset['Output'].values
type(y)

numpy.ndarray

In [24]:
##Train And Test Data

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [26]:
## Data Munging

In [27]:
X_train_CV = CV.fit_transform(X_train)

In [28]:
##  Naive Bayse :- Multinomial

In [29]:
NB = MultinomialNB()

In [30]:
NB.fit(X_train_CV,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
X_test_CV = CV.transform(X_test)

In [32]:
Y_predict = NB.predict(X_test_CV)

In [33]:
result = accuracy_score(y_test,Y_predict)

In [34]:
print("Accuracy Of Prediction :-",result*100)

Accuracy Of Prediction :- 98.65350089766606


In [35]:
## Realtime Application Of Spam Filtering :-

In [36]:
email  = input("Enter Email :- ")
body = input("Enter Body Of Content :- ")
bodyInput = CV.transform([body])
result = NB.predict(bodyInput)
if(result[0]==0):
    print("This Is Spam Mail")
else:
    print("Email Sent")

Enter Email :- abc
Enter Body Of Content :- hey
Email Sent
