In [1]:
# Import packages and functions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the SMS data set and preview its first five rows.
# The file has no header row, so the two column names are supplied
# explicitly through `names`; fields are separated by tabs.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Class', 'Message'],
)
messages.head()

Unnamed: 0,Class,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Hold out a quarter of the messages for testing and train on the rest.
# The fixed random_state makes the shuffle (and therefore the split) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    messages['Message'],
    messages['Class'],
    test_size=0.25,
    random_state=20220530,
)

In [4]:
# Learn the vocabulary of single words (unigrams) from the training messages.
# fit() returns the fitted vectorizer itself, so construction and fitting
# are chained into one statement.
vectorizer = CountVectorizer(ngram_range=(1, 1)).fit(X_train)
# Bare expression so the notebook still displays the fitted vectorizer.
vectorizer
# Uncomment the line below to see the words.
#vectorizer.vocabulary_

CountVectorizer()

In [5]:
# Count the words in the training set and store in a matrix.
# transform() uses the vocabulary that was fitted on X_train; the result is
# a sparse count matrix with one row per training message and one column
# per vocabulary word.
X_train_vectorized = vectorizer.transform(X_train)
# Bare expression so the notebook displays a summary of the matrix.
X_train_vectorized

<4179x7474 sparse matrix of type '<class 'numpy.int64'>'
	with 55755 stored elements in Compressed Sparse Row format>

In [6]:
# Train a multinomial naive Bayes classifier on the training word counts.
# fit() returns the fitted estimator, so it can be assigned directly.
NBmodel = MultinomialNB().fit(X_train_vectorized, Y_train)
# Bare expression so the notebook still displays the fitted model.
NBmodel

MultinomialNB()

In [7]:
# Make predictions on the training and testing sets.
# The training messages are already vectorized in X_train_vectorized, so that
# matrix is reused instead of transforming X_train a second time.
trainPredictions = NBmodel.predict(X_train_vectorized)
# The test messages are transformed with the vocabulary learned from the
# training set only; the vectorizer is never refit on test data.
testPredictions = NBmodel.predict(vectorizer.transform(X_test))

In [8]:
# How does the model work on the training set?
# Rows are the true classes and columns are the predicted classes.
# `labels` pins the order to ham first, then spam, so the layout does not
# depend on the implicit sorted order of whatever labels happen to be present.
confusion_matrix(Y_train, trainPredictions, labels=['ham', 'spam'])

array([[3610,   10],
       [  18,  541]])

In [9]:
# Display that in terms of correct proportions.
# normalize='true' divides each row by the number of true instances of that
# class, so every row sums to 1. `labels` fixes the row/column order to
# ham first, then spam.
confusion_matrix(
    Y_train, trainPredictions, labels=['ham', 'spam'], normalize='true'
)

array([[0.99723757, 0.00276243],
       [0.03220036, 0.96779964]])

On the training set, 99.7% of real (ham) messages are classified correctly.
Just over 3% of spam messages are misclassified as real.

In [10]:
# How does the model work on the test set?
# Same row-normalized layout: rows are true classes, columns are predicted
# classes, with the order pinned to ham first, then spam.
confusion_matrix(
    Y_test, testPredictions, labels=['ham', 'spam'], normalize='true'
)

array([[0.99585062, 0.00414938],
       [0.07446809, 0.92553191]])

About 7.4% of spam messages are classified as real in the test data, and only 0.4% of real messages are classified as spam.

In [11]:
# Try the classifier on a few hand-written phrases. Add your own to the list.
# Each phrase is turned into word counts with the fitted vectorizer before
# being passed to the model.
NBmodel.predict(vectorizer.transform([
    "Big sale today! Free cash.",
    "I'll be there in 5",
]))

array(['spam', 'ham'], dtype='<U4')