In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import re

In [2]:
sms_df = pd.read_csv(r'C:\Users\taewoo\Desktop\Datasets\spam_sms.csv', encoding='latin-1')
sms_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
for column in sms_df.columns:
    print(f"{column} NaN value count: {sms_df[column].isnull().sum()}")
    print(f"{column} non-NaN value count: {sms_df[column].notna().sum()}")

v1 NaN value count: 0
v1 non-NaN value count: 5572
v2 NaN value count: 0
v2 non-NaN value count: 5572
Unnamed: 2 NaN value count: 5522
Unnamed: 2 non-NaN value count: 50
Unnamed: 3 NaN value count: 5560
Unnamed: 3 non-NaN value count: 12
Unnamed: 4 NaN value count: 5566
Unnamed: 4 non-NaN value count: 6


Unnamed: 2, 3, 4 are not useful so I'm dropping those columns

In [4]:
sms_df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [5]:
sms_df.rename(columns={'v1':'class', 'v2':'text message'}, inplace=True)
sms_df['text message'] = [re.sub(r"[^a-z0-9 ]+", ' ', x.lower()) for x in sms_df['text message']]

In [6]:
X = np.array(sms_df['text message'])
y = np.array(sms_df['class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

NB_model = make_pipeline(TfidfVectorizer(), MultinomialNB())
NB_model.fit(X_train, y_train)
NB_predict = NB_model.predict(X_test)
print(f"Training Score: {NB_model.score(X_train, y_train)}")
print(f"Test Score: {NB_model.score(X_test, y_test)}")

Training Score: 0.9740725967291584
Test Score: 0.978494623655914


All I have done is creating a model with a pipeline between a text-vectorizer and a classifier so that when blocks of texts are fed as data, the model can make classification predictions by examining vectors.  

I'm surprised to see such high scores for both training and test samples.

In [7]:
NB_result_df = pd.DataFrame({'prediction':NB_predict, 'class':y_test, 'match':NB_predict==y_test, 'text message':X_test})
NB_result_df.head()

Unnamed: 0,prediction,class,match,text message
0,ham,ham,True,hmm well night night
1,ham,ham,True,in sch but neva mind u eat 1st lor
2,ham,ham,True,lol no just trying to make your day a little ...
3,ham,ham,True,meet after lunch la
4,ham,ham,True,i noe la u wana pei bf oso rite k lor other...


In [8]:
NB_wrong = NB_result_df.loc[NB_result_df['match'] == False]
print(f"Predicted correctly: {(NB_result_df.match == True).sum()}")
print(f"Predicted wrong: {(NB_result_df.match == False).sum()}")
print(f"Accuracy: {round(NB_model.score(X_test, y_test)*100, 2)}%")

Predicted correctly: 546
Predicted wrong: 12
Accuracy: 97.85%


In [9]:
print(NB_wrong)

    prediction class  match                                       text message
41         ham  spam  False  freemsg today s the day if you are ready  i m ...
98         ham  spam  False  can u get 2 phone now  i wanna chat 2 set up m...
129        ham  spam  False  what do u want for xmas  how about 100 free te...
136        ham  spam  False  someone u know has asked our dating service 2 ...
171        ham  spam  False  22 days to kick off  for euro2004 u will be ke...
221        ham  spam  False  free message  jamster get the crazy frog sound...
307        ham  spam  False   forwarded from 21870000 hi   this is your mai...
368        ham  spam  False  welcome  please reply with your age and gender...
374        ham  spam  False  for sale   arsenal dartboard  good condition b...
443        ham  spam  False  fantasy football is back on your tv  go to sky...
540        ham  spam  False                                      2 2 146tf150p
556        ham  spam  False  am new 2 club   dont fi

Although the model makes mistakes, I am happy with the result

Let's perform the same analysis with the RandomForestClassifier and compare results

In [10]:
RFC_model = make_pipeline(TfidfVectorizer(), RandomForestClassifier(100))
RFC_model.fit(X_train, y_train)
RFC_predict = RFC_model.predict(X_test)
print(f"Training Score: {RFC_model.score(X_train, y_train)}")
print(f"Test Score: {RFC_model.score(X_test, y_test)}")

Training Score: 1.0
Test Score: 0.9910394265232975


In [11]:
RFC_result_df = pd.DataFrame({'prediction':RFC_predict, 'class':y_test, 'match':RFC_predict==y_test, 'text message':X_test})
RFC_result_df.head()

Unnamed: 0,prediction,class,match,text message
0,ham,ham,True,hmm well night night
1,ham,ham,True,in sch but neva mind u eat 1st lor
2,ham,ham,True,lol no just trying to make your day a little ...
3,ham,ham,True,meet after lunch la
4,ham,ham,True,i noe la u wana pei bf oso rite k lor other...


In [12]:
RFC_wrong = RFC_result_df.loc[RFC_result_df['match'] == False]
print(f"Predicted correctly: {(RFC_result_df.match == True).sum()}")
print(f"Predicted wrong: {(RFC_result_df.match == False).sum()}")
print(f"Accuracy: {round(RFC_model.score(X_test, y_test)*100, 2)}%")

Predicted correctly: 553
Predicted wrong: 5
Accuracy: 99.1%


In [13]:
print(RFC_wrong)

    prediction class  match                                       text message
98         ham  spam  False  can u get 2 phone now  i wanna chat 2 set up m...
368        ham  spam  False  welcome  please reply with your age and gender...
374        ham  spam  False  for sale   arsenal dartboard  good condition b...
540        ham  spam  False                                      2 2 146tf150p
556        ham  spam  False  am new 2 club   dont fink we met yet will b gr...


Both RFC_model and NB_model yielded decent results, but RandomForestClassifier **outperformed** MultinomialNB every time by a slight margin (1-2%).