## Spam Email

In [1]:
import pandas as pd

In [2]:
# Read the labelled messages; the path is resolved against the notebook's working directory.
csv_path = 'spam.csv'
df = pd.read_csv(csv_path)

In [3]:
# Peek at the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Per-class summary of the messages: count, distinct count, most frequent text and its frequency.
by_category = df.groupby('Category')
by_category.describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [5]:
## The columns of the spam dataset are text; we will convert them into numbers because ML models work better with numeric input

In [6]:
## So we will convert category and message into numbers

In [7]:
## Category has only two values, ham and spam, so it is easy to convert

In [8]:
# Binary target: 1 for spam, 0 for everything else. A vectorised comparison
# replaces the per-row Python lambda; int64 is requested explicitly so the
# dtype does not depend on the platform's default integer width.
df['spam'] = (df['Category'] == 'spam').astype('int64')

In [9]:
# Confirm that the new spam column lines up with Category.
df.head(n=5)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 25% of the messages for evaluation. random_state pins the shuffle so
# the split -- and every score computed from it -- is identical on each
# Restart & Run All. (Scores recorded before this change came from an
# unseeded split and will differ after a re-run.)
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.25, random_state=42
)

In [12]:
## The Message column is still text, so we have to convert it into numbers as well

In [13]:
## We will do this using the CountVectorizer technique

In [14]:
## To see how the count-vectorizer technique works, let's say we have 4 statements

In [15]:
## One way to convert these statements into vectors and matrices is as follows:

In [16]:
## We find the unique words across the documents

![14.JPG](attachment:14.JPG)

![15.JPG](attachment:15.JPG)

In [17]:
## We can see that there are nine unique words

In [18]:
## Now we can take these nine words as features

In [19]:
## And we can build a kind of matrix: one row per statement, one column per word, each cell holding that word's count

![16.png](attachment:16.png)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus used to illustrate how CountVectorizer builds its vocabulary.
# NOTE(review): 'docuent' in the second sentence looks like a typo; it is kept
# as-is because it appears as its own feature in the recorded output.
corpus = ['This is the first document', 'This is the second docuent', 'This is the third one', 'Is this the first document?']
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (get_feature_names_out() replaces it there). Ordering the learned vocabulary
# by its column index yields the same plain list of words on every version.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

['docuent', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Four short example sentences; 'document' occurs twice in the second one so
# that the resulting count matrix contains a value greater than 1.
corpus = [
    'This is the first document',
    'This document the second document',
    'This is the third one',
    'Is this the first document?',
]


In [33]:
# Fresh vectorizer with default settings for the step-by-step walkthrough.
vectorizer = CountVectorizer()

In [34]:
# Learn the vocabulary from the corpus and build the document-term count matrix in one step.
x = vectorizer.fit_transform(corpus)

In [35]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (get_feature_names_out() replaces it there). Ordering the learned vocabulary
# by its column index yields the same plain list of words on every version.
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

['document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [36]:
# Densify the sparse count matrix so every row (sentence) and column
# (vocabulary word) is visible; fine here because the toy matrix is tiny.
dense_counts = x.toarray()
print(dense_counts)

[[1 1 1 0 0 1 0 1]
 [2 0 0 0 1 1 0 1]
 [0 0 1 1 0 1 1 1]
 [1 1 1 0 0 1 0 1]]


In [37]:
## Now let's implement this in our problem

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

In [39]:
# Vectorizer for the real dataset; kept separate from the toy `vectorizer` above.
v = CountVectorizer()

In [40]:
# Fit the vocabulary on the training messages only and turn them into word counts.
# Other text is later passed through v.transform(), which reuses this vocabulary.
x_train_count = v.fit_transform(X_train.values)

In [41]:
# Slice the sparse matrix first and densify only the three rows being shown;
# calling toarray() on the whole training matrix would allocate a full dense
# (documents x vocabulary) array just to discard almost all of it.
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [42]:
from sklearn.naive_bayes import MultinomialNB

In [43]:
# Multinomial naive Bayes with default settings, trained on the word-count features.
model = MultinomialNB()

In [44]:
# Train on the vectorised training messages and their 0/1 spam labels.
model.fit(x_train_count, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [45]:
# Two hand-written messages for a quick sanity check: a casual invitation and a promotional offer.
casual_message = 'Hey mohan, can we get together to watch football match tommarow?'
promo_message = 'Upto 20% discount on parking, exclusive offer just for you . Dont miss this reward'
emails = [casual_message, promo_message]

In [46]:
# Use transform (not fit_transform) so the sample messages are encoded with the vocabulary already fitted on the training set.
emails_count = v.transform(emails)

In [47]:
# Predicted labels for the two sample messages, in order (1 = spam, 0 = ham).
model.predict(emails_count)

array([0, 1], dtype=int64)

In [51]:
# Encode the held-out messages with the training vocabulary; the vectorizer is not re-fitted here.
X_test_count = v.transform(X_test)

In [52]:
# Accuracy of the classifier on the held-out test split.
model.score(X_test_count, y_test)

0.9856424982053122

## We can also do the same thing in one step with an sklearn Pipeline

In [53]:
from sklearn.pipeline import Pipeline

In [57]:
# Chain the vectorizer and the classifier so raw text can be passed straight to fit/score/predict.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(pipeline_steps)

In [58]:
# Fit both pipeline steps on the raw training text; no manual vectorisation is needed.
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [59]:
# Accuracy on the raw held-out text; the pipeline vectorises it internally.
clf.score(X_test,y_test)

0.9856424982053122

In [61]:
# Classify the two sample messages directly from their raw text (1 = spam, 0 = ham).
clf.predict(emails)

array([0, 1], dtype=int64)