In [1]:
import pandas as pd

In [2]:
# Load the labelled messages (one 'Category' label and one 'Message' text per row)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="spam.csv")
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Summarise the messages per class. In the recorded output there are 4825 'ham'
# (legitimate) rows and 747 'spam' rows, so the two classes are imbalanced.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
# Encode the categorical 'Category' label as a new numeric 'spam' column:
# 1 when the row is labelled 'spam', 0 for anything else (i.e. 'ham').
# A vectorized comparison replaces the row-by-row apply/lambda; the explicit
# int64 cast keeps the same integer dtype regardless of platform.
df['spam'] = (df['Category'] == 'spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# Hold out 25% of the messages for evaluation.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible on Restart & Run All; stratify keeps the ham/spam ratio the
# same in the train and test parts, which matters for this imbalanced dataset.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.25,
    random_state=42,
    stratify=df.spam,
)

In [8]:
# Convert the 'Message' text into numeric word-count features with CountVectorizer.
# fit_transform learns the vocabulary from the training messages only and
# returns the counts as a sparse matrix.
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
# Peek at the first two rows. Slice the sparse matrix first and densify only
# that slice: calling toarray() on the whole matrix would allocate a full
# (n_messages x vocabulary_size) dense array just to display two rows.
X_train_count[:2].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
# Train a Multinomial Naive Bayes classifier on the word-count features.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
# X_train_count is the CountVectorizer output for the training messages.
model.fit(X=X_train_count, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Two hand-written sample messages: the first casual, the second promotional.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
]

# Vectorize them with the already-fitted CountVectorizer (transform only, so the
# vocabulary learned from the training messages is reused).
emails_count = v.transform(raw_documents=emails)
# In the recorded run the second message is predicted as spam (1) and the first as ham (0).
model.predict(X=emails_count)

array([0, 1], dtype=int64)

In [13]:
# Vectorize the held-out messages with the fitted CountVectorizer ...
X_test_count = v.transform(raw_documents=X_test)
# ... and report the model's accuracy on them.
model.score(X=X_test_count, y=y_test)

0.9849246231155779

**Sklearn Pipeline**

In [15]:
# The same vectorize-then-classify workflow can be written more simply with a scikit-learn Pipeline

In [16]:
from sklearn.pipeline import Pipeline
# Build the classifier (clf) as a two-step pipeline:
#   'vectorizer' --> turns raw text into CountVectorizer word counts
#   'nb'         --> fits/predicts with MultinomialNB on those counts
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [17]:
# Train directly on the raw X_train text and y_train labels. No manual conversion
# is needed as in the previous example, because the pipeline's 'vectorizer' step
# takes care of it.
clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [18]:
# Accuracy of the pipeline on the held-out messages (raw text goes in directly).
clf.score(X=X_test, y=y_test)

0.9849246231155779

In [19]:
# Predict the two sample messages with the pipeline; in the recorded run the
# second one is again classified as spam (1).
clf.predict(X=emails)

array([0, 1], dtype=int64)