In [13]:
%matplotlib inline
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

## Our goal is to predict if a message is spam or ham

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.pipeline import make_union
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [5]:
path = 'sms.tsv'
sms = pd.read_table(path, header=None, names=['label', 'message'])

## Exploratory Analysis

In [24]:
sms.head()

(5572, 2)

In [7]:
sms.dtypes

label      object
message    object
dtype: object

In [10]:
sms.loc[2, 'message']

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [11]:
# examine the class distribution
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

## Text Pre-processing

In [17]:
# define X and y
X = sms.message
y = sms.label

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [21]:
vect = TfidfVectorizer()
X_dtm = vect.fit_transform(X_train)

In [34]:
count_vect = CountVectorizer()
X_dtm_count = count_vect.fit_transform(X_train)

In [22]:
print(vect.get_feature_names()[0:100])

['00', '000', '008704050406', '0121', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400', '02073162414', '02085076972', '021', '03', '04', '0430', '05', '050703', '0578', '06', '07', '07008009200', '07090201529', '07090298926', '07123456789', '07732584351', '07734396839', '07742676969', '0776xxxxxxx', '07781482378', '07786200117', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '07880867867', '0789xxxxxxx', '07946746291', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '08', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906', '08002988890', '08006344447', '0808', '08081263000', '08081560665', '0825', '083', '0844', '08448714184', '0845', '08450542832', '08452810071', '08452810073', '08452810075over18', '0870', '08700435505150p', '08700469649', '08700621170150p', '08701213186', '08701417012', '08701417012150p', '0870141701216', '087016248

In [23]:
X_dtm.shape

(4179, 7456)

## Model Selection

### Naive Bayes

In [44]:
# TF-IDF
nb = MultinomialNB()
cross_val_score(nb, X_dtm, y_train, cv=5, scoring='accuracy').mean()

0.9542949942409088

In [45]:
pipe = make_pipeline(vect, MultinomialNB())
cross_val_score(pipe, X_train, y_train,cv=5, scoring='accuracy').mean()

0.9535781483627727

In [46]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...   vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [51]:
y_test_pred = pipe.predict(X_test)

In [59]:
confusion_matrix(y_test, y_test_pred)

array([[1208,    0],
       [  49,  136]])

In [58]:
print(classification_report(y_test, y_test_pred,target_names=['spam','ham']))

              precision    recall  f1-score   support

        spam       0.96      1.00      0.98      1208
         ham       1.00      0.74      0.85       185

   micro avg       0.96      0.96      0.96      1393
   macro avg       0.98      0.87      0.91      1393
weighted avg       0.97      0.96      0.96      1393



We identified all spams, but misclassified some hams also as spams
What we said as hams are all hams

## Logistic Regression

In [62]:
pipe = make_pipeline(vect, LogisticRegression())
cross_val_score(pipe, X_train, y_train,cv=5, scoring='accuracy').mean()



0.9662618848324855

In [63]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [64]:
y_test_pred = pipe.predict(X_test)

In [65]:
print(classification_report(y_test, y_test_pred,target_names=['spam','ham']))

              precision    recall  f1-score   support

        spam       0.97      1.00      0.99      1208
         ham       0.99      0.82      0.90       185

   micro avg       0.97      0.97      0.97      1393
   macro avg       0.98      0.91      0.94      1393
weighted avg       0.98      0.97      0.97      1393



Good Improvement over Naive Bayes