### Chapter 3 Classification
#### Spam Detection Exercise
##### I choose a different dataset from Kaggle because I don't want to waste time on the complicated dataset provided in the book: https://www.kaggle.com/karthickveerakumar/spam-filter

#### Data Load

In [1]:
import pandas as pd

dataset = pd.read_csv("datasets/spam/emails.csv")
dataset.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [11]:
dataset['spam'].value_counts() / len(dataset)

0    0.761173
1    0.238827
Name: spam, dtype: float64

#### Train Test Split

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

In [8]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(dataset, dataset['spam']):
    train_data = dataset.loc[train_index]
    train_label = dataset['spam'].loc[train_index]
    test_data = dataset.loc[test_index]
    test_label = dataset['spam'].loc[test_index]

In [10]:
train_data['spam'].value_counts() / len(train_data)

0    0.76124
1    0.23876
Name: spam, dtype: float64

In [12]:
train_data.drop("spam", axis=1, inplace=True)
test_data.drop("spam", axis=1, inplace=True)

#### Data Preprocessing

In [205]:
from sklearn.base import BaseEstimator, TransformerMixin
import string
import re
from nltk.stem.porter import PorterStemmer

class EmailTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, remove_punct=False, replace_url=False, replace_num=False, stemming=False):
        self.remove_punct = remove_punct
        self.replace_url = replace_url
        self.replace_num = replace_num
        self.stemming = stemming
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.astype(np.str)
        X = np.array(X)
        if (self.remove_punct):
            stripPunct = str.maketrans('', '', string.punctuation)
            X = np.array([i.translate(stripPunct) for i in X])
        if (self.replace_url):
            X = np.array([re.sub(r'^https?:\/\/.*[\r\n]*', 'URL', i, flags=re.MULTILINE) for i in X])
        if (self.replace_num):
            X = np.array([re.sub(r'\d+', 'NUM', i, flags=re.MULTILINE) for i in X])
        if (self.stemming):
            ps = PorterStemmer()
            X = np.array([ps.stem(i) for i in X])
        return X.flatten()

#### Pipeline

In [206]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# This is IMPORTANT. This will save the fitted transformers in the model
emailTransformer = EmailTransformer(remove_punct=True)
countVect = CountVectorizer(lowercase=False)
tfidf = TfidfTransformer()

pipeline = Pipeline([
    ("preprocess",  emailTransformer),
    ("count", countVect),
    ("TFIDF", tfidf),
    ("clf", RandomForestClassifier())
])

In [207]:
pipeline.fit(train_data['text'], train_label)



Pipeline(memory=None,
         steps=[('preprocess',
                 EmailTransformer(remove_punct=True, replace_num=False,
                                  replace_url=False, stemming=False)),
                ('count',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=False, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                              

In [208]:
from sklearn.metrics import f1_score
y_pred = pipeline.predict(train_data['text'])
print("Training F1:", f1_score(train_label, y_pred))

Training F1: 0.9935602575896963


In [209]:
test_pred = pipeline.predict(test_data['text'])
print("Test F1:", f1_score(test_label, test_pred))

Test F1: 0.8809523809523809


#### Fine Tune with GridSearch

In [210]:
from sklearn.model_selection import GridSearchCV

clf = Pipeline([
    ("preprocess",  emailTransformer),
    ("count", countVect),
    ("TFIDF", tfidf),
    ("clf", RandomForestClassifier())
])

param_grid = [
    {"preprocess__remove_punct": [True, False]},
    {"preprocess__replace_url": [True, False]},
    {"preprocess__replace_num": [True, False]},
    {"preprocess__stemming": [True, False]},
    {"count__lowercase": [True, False]}
]


grid_search = GridSearchCV(clf, param_grid, cv=5, return_train_score=True)

grid_search.fit(train_data['text'], train_label)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] preprocess__remove_punct=True ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  preprocess__remove_punct=True, score=(train=0.997, test=0.950), total=   2.0s
[CV] preprocess__remove_punct=True ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s remaining:    0.0s


[CV]  preprocess__remove_punct=True, score=(train=0.997, test=0.924), total=   2.0s
[CV] preprocess__remove_punct=True ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.2s remaining:    0.0s


[CV]  preprocess__remove_punct=True, score=(train=0.998, test=0.925), total=   2.1s
[CV] preprocess__remove_punct=True ...................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.1s remaining:    0.0s


[CV]  preprocess__remove_punct=True, score=(train=0.999, test=0.939), total=   1.8s
[CV] preprocess__remove_punct=True ...................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   14.2s remaining:    0.0s


[CV]  preprocess__remove_punct=True, score=(train=0.998, test=0.934), total=   2.1s
[CV] preprocess__remove_punct=False ..................................




[CV]  preprocess__remove_punct=False, score=(train=0.999, test=0.950), total=   1.0s
[CV] preprocess__remove_punct=False ..................................




[CV]  preprocess__remove_punct=False, score=(train=0.995, test=0.936), total=   0.9s
[CV] preprocess__remove_punct=False ..................................




[CV]  preprocess__remove_punct=False, score=(train=0.998, test=0.931), total=   0.9s
[CV] preprocess__remove_punct=False ..................................




[CV]  preprocess__remove_punct=False, score=(train=0.997, test=0.932), total=   1.0s
[CV] preprocess__remove_punct=False ..................................




[CV]  preprocess__remove_punct=False, score=(train=0.997, test=0.943), total=   1.0s
[CV] preprocess__replace_url=True ....................................




[CV]  preprocess__replace_url=True, score=(train=0.998, test=0.952), total=   2.7s
[CV] preprocess__replace_url=True ....................................




[CV]  preprocess__replace_url=True, score=(train=0.996, test=0.931), total=   2.9s
[CV] preprocess__replace_url=True ....................................




[CV]  preprocess__replace_url=True, score=(train=0.998, test=0.940), total=   2.8s
[CV] preprocess__replace_url=True ....................................




[CV]  preprocess__replace_url=True, score=(train=0.998, test=0.943), total=   2.3s
[CV] preprocess__replace_url=True ....................................




[CV]  preprocess__replace_url=True, score=(train=0.998, test=0.940), total=   2.7s
[CV] preprocess__replace_url=False ...................................




[CV]  preprocess__replace_url=False, score=(train=0.999, test=0.953), total=   2.2s
[CV] preprocess__replace_url=False ...................................




[CV]  preprocess__replace_url=False, score=(train=0.996, test=0.933), total=   2.1s
[CV] preprocess__replace_url=False ...................................




[CV]  preprocess__replace_url=False, score=(train=0.999, test=0.945), total=   2.1s
[CV] preprocess__replace_url=False ...................................




[CV]  preprocess__replace_url=False, score=(train=0.998, test=0.948), total=   1.8s
[CV] preprocess__replace_url=False ...................................




[CV]  preprocess__replace_url=False, score=(train=0.998, test=0.936), total=   2.2s
[CV] preprocess__replace_num=True ....................................




[CV]  preprocess__replace_num=True, score=(train=0.998, test=0.948), total=   3.0s
[CV] preprocess__replace_num=True ....................................




[CV]  preprocess__replace_num=True, score=(train=0.997, test=0.931), total=   2.9s
[CV] preprocess__replace_num=True ....................................




[CV]  preprocess__replace_num=True, score=(train=0.997, test=0.945), total=   2.9s
[CV] preprocess__replace_num=True ....................................




[CV]  preprocess__replace_num=True, score=(train=0.998, test=0.941), total=   2.7s
[CV] preprocess__replace_num=True ....................................




[CV]  preprocess__replace_num=True, score=(train=0.998, test=0.944), total=   2.8s
[CV] preprocess__replace_num=False ...................................




[CV]  preprocess__replace_num=False, score=(train=0.997, test=0.941), total=   1.9s
[CV] preprocess__replace_num=False ...................................




[CV]  preprocess__replace_num=False, score=(train=0.997, test=0.948), total=   1.9s
[CV] preprocess__replace_num=False ...................................




[CV]  preprocess__replace_num=False, score=(train=0.999, test=0.943), total=   2.1s
[CV] preprocess__replace_num=False ...................................




[CV]  preprocess__replace_num=False, score=(train=0.996, test=0.940), total=   1.9s
[CV] preprocess__replace_num=False ...................................




[CV]  preprocess__replace_num=False, score=(train=0.998, test=0.961), total=   2.2s
[CV] preprocess__stemming=True .......................................




[CV]  preprocess__stemming=True, score=(train=0.998, test=0.948), total=   3.0s
[CV] preprocess__stemming=True .......................................




[CV]  preprocess__stemming=True, score=(train=0.997, test=0.933), total=   3.4s
[CV] preprocess__stemming=True .......................................




[CV]  preprocess__stemming=True, score=(train=0.997, test=0.936), total=   2.9s
[CV] preprocess__stemming=True .......................................




[CV]  preprocess__stemming=True, score=(train=0.997, test=0.948), total=   2.6s
[CV] preprocess__stemming=True .......................................




[CV]  preprocess__stemming=True, score=(train=0.997, test=0.933), total=   3.0s
[CV] preprocess__stemming=False ......................................




[CV]  preprocess__stemming=False, score=(train=0.998, test=0.944), total=   1.9s
[CV] preprocess__stemming=False ......................................




[CV]  preprocess__stemming=False, score=(train=0.998, test=0.936), total=   1.9s
[CV] preprocess__stemming=False ......................................




[CV]  preprocess__stemming=False, score=(train=0.999, test=0.942), total=   1.9s
[CV] preprocess__stemming=False ......................................




[CV]  preprocess__stemming=False, score=(train=0.998, test=0.941), total=   1.9s
[CV] preprocess__stemming=False ......................................




[CV]  preprocess__stemming=False, score=(train=0.998, test=0.955), total=   2.2s
[CV] count__lowercase=True ...........................................




[CV]  count__lowercase=True, score=(train=0.998, test=0.947), total=   2.1s
[CV] count__lowercase=True ...........................................




[CV]  count__lowercase=True, score=(train=0.999, test=0.940), total=   2.0s
[CV] count__lowercase=True ...........................................




[CV]  count__lowercase=True, score=(train=0.998, test=0.942), total=   2.1s
[CV] count__lowercase=True ...........................................




[CV]  count__lowercase=True, score=(train=0.998, test=0.942), total=   1.9s
[CV] count__lowercase=True ...........................................




[CV]  count__lowercase=True, score=(train=0.998, test=0.956), total=   2.4s
[CV] count__lowercase=False ..........................................




[CV]  count__lowercase=False, score=(train=0.998, test=0.937), total=   2.9s
[CV] count__lowercase=False ..........................................




[CV]  count__lowercase=False, score=(train=0.998, test=0.928), total=   2.3s
[CV] count__lowercase=False ..........................................




[CV]  count__lowercase=False, score=(train=0.999, test=0.948), total=   2.5s
[CV] count__lowercase=False ..........................................




[CV]  count__lowercase=False, score=(train=0.997, test=0.950), total=   2.9s
[CV] count__lowercase=False ..........................................




[CV]  count__lowercase=False, score=(train=0.997, test=0.943), total=   2.0s


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  3.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        EmailTransformer(remove_punct=True,
                                                         replace_num=False,
                                                         replace_url=False,
                                                         stemming=False)),
                                       ('count',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                           

In [211]:
grid_search.best_params_

{'preprocess__replace_num': False}

In [215]:
import joblib
joblib.dump(grid_search.best_estimator_, 'models/spam_filter_chapter_3.pkl')

['models/spam_filter_chapter_3.pkl']

In [214]:
y_pred = grid_search.best_estimator_.predict(train_data['text'])
print("Training F1:", f1_score(train_label, y_pred))
test_pred = grid_search.best_estimator_.predict(test_data['text'])
print("Test F1:", f1_score(test_label, test_pred))

Training F1: 0.9949471750114837
Test F1: 0.8399168399168399
