In [1]:
# TERM PROJECT
# Spam Email Filter: classify emails as spam or not spam using TF-IDF features and Multinomial Naive Bayes

In [2]:
import numpy as np 
import pandas as pd 
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Load the labelled email dataset from disk, then preview its first five rows
# (five is the default row count for head()).
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# Report the dataset dimensions as a (rows, columns) tuple.
df.shape

(5728, 2)

In [5]:
# List the column labels of the DataFrame.
df.columns

Index(['text', 'spam'], dtype='object')

In [6]:
# Drop duplicated rows (same values in every column) so that a repeated email
# is only counted once. `df` is rebound to the de-duplicated frame rather than
# mutated with inplace=True, which keeps this cell a plain, re-runnable expression.
df = df.drop_duplicates()

In [7]:
# Re-check the dimensions: the row count is lower if duplicate rows were removed.
df.shape

(5695, 2)

In [8]:
# Count the null (missing) values in each column.
df.isnull().sum()

text    0
spam    0
dtype: int64

In [9]:
# Separate the email text (model input) from the spam column (target label).
# No vectorization happens here; the text is converted to a matrix in a later cell.
df_x, df_y = df["text"], df["spam"]

In [10]:
# Print the text column and then the label column, separated by one blank line.
print(df_x, df_y, sep="\n\n")

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5695, dtype: object

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5695, dtype: int64


In [11]:
# Hold out 20% of the emails for evaluation; the fixed random_state makes the
# split reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.20,
    random_state=0,
)

# TF-IDF vectorizer with English stop words removed. The author reports that it
# gives more accurate results; that comparison is not shown in this notebook.
cv = TfidfVectorizer(stop_words='english', min_df=1)

In [12]:
# Fit the vectorizer on the training emails only, then apply that same fitted
# vectorizer to the test emails so both matrices share one feature space.
x_traincv=cv.fit_transform(x_train)
x_testcv=cv.transform(x_test)

In [13]:
# Convert the training feature matrix to a dense NumPy array for inspection.
# NOTE(review): this materialises every (email, term) entry, and the displayed
# values are almost all zero. `a` is only used for display afterwards, so
# consider densifying a small slice instead if memory becomes a concern.
a=x_traincv.toarray()

In [14]:
# Display the dense training matrix (one row per training email).
a

array([[0.04684563, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [15]:
# Cast the training labels to integers before handing them to the classifier.
y_train = y_train.astype('int')

# Train a multinomial Naive Bayes model on the TF-IDF training matrix.
# fit() stays as the final expression so the cell echoes the fitted estimator.
mnb = MultinomialNB()
mnb.fit(x_traincv, y_train)

MultinomialNB()

In [16]:
# Predict a label for every email in the held-out test matrix and show the result.
pred=mnb.predict(x_testcv)
pred

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
# Put the true test labels in a NumPy array so they can be compared with `pred`
# position by position (the Series keeps its original, non-contiguous index).
actual_results=np.array(y_test)
actual_results

array([1, 0, 0, ..., 0, 0, 0])

In [18]:
# Count the test emails whose predicted label equals the true label.
# The element-wise comparison replaces the explicit index loop. The shape check
# fails loudly if predictions and labels are misaligned; the loop would have
# silently ignored extra labels or raised a bare IndexError.
assert pred.shape == actual_results.shape, "predictions and labels must align one-to-one"
# int() keeps `count` a plain Python integer, exactly as the loop produced,
# so the cells that display it or divide by len(pred) behave the same.
count = int(np.count_nonzero(pred == actual_results))

In [19]:
# Number of test emails whose prediction matched the true label.
count

1038

In [20]:
# Total number of test emails that were scored.
len(pred)

1139

In [21]:
# Accuracy on the test set: correct predictions divided by total predictions.
count/len(pred)

0.9113257243195786