# Spam Email Prediction

## Importing Libraries

In [3]:
import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## Importing Dataset

In [5]:
data = pd.read_csv('spam.csv', encoding="ISO-8859-1")

In [12]:
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [13]:
X = data.iloc[:,1]

In [29]:
y=data.iloc[:,0].values

In [15]:
X[:5]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

## Cleaning Dataset

#### Importing Libraries

In [19]:
from nltk.tokenize import sent_tokenize, word_tokenize
stop_word = set(stopwords.words('english'))
ps = PorterStemmer()

#### Scratch code to clean dataset

In [18]:
corp = []
for i in X:
    word_filter = []
    word = word_tokenize(i)
    for j in word:
        if j not in stop_word:
            word_filter.append(j.lower())
    corp.append(word_filter)

In [21]:
X=[]
for i in corp:
    port = []
    for j in i:
        port.append(ps.stem(j))
    X.append(' '.join(port))

In [23]:
X[:5]

['go jurong point , crazy.. avail bugi n great world la e buffet ... cine got amor wat ...',
 'ok lar ... joke wif u oni ...',
 "free entri 2 wkli comp win fa cup final tkt 21st may 2005 . text fa 87121 receiv entri question ( std txt rate ) t & c 's appli 08452810075over18 's",
 'u dun say earli hor ... u c alreadi say ...',
 "nah i n't think goe usf , live around though"]

#### Implementing CountVectorizer

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cn = CountVectorizer(max_features = 2000)
X = cn.fit_transform(X).toarray()

In [28]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [30]:
y

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

#### Data Encoding

In [31]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
y=label.fit_transform(y)

In [71]:
y

array([0, 0, 1, ..., 0, 0, 0])

## Splitting data into Train and Test set

In [32]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=0)

## Applying Naive_bayes Algorithm to dataset

In [33]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Prediction for x_test

In [35]:
y_pred = classifier.predict(x_test)

## Confusion Matrix

In [40]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test,y_pred)
cm

array([[1204,  230],
       [  16,  222]], dtype=int64)