# COMP3314 - Assignment 2

## Question 2: Spam classifier (20 Points)

### Step 1: Download dataset
Download examples of spam and ham from Apache SpamAssassin’s public datasets. Split the datasets into a training set and a test set. 

In [3]:
import os
import numpy as np
import glob
from urllib.request import urlretrieve
import tarfile
import shutil
import sklearn.utils
from sklearn.model_selection import train_test_split


def download_dataset():

    def download_url(url, dataset_dir="data"):

        tar_dir = os.path.join(dataset_dir, "tar")
        if not os.path.isdir(tar_dir):
            os.makedirs(tar_dir)

        filename = url.rsplit("/", 1)[-1]
        tarpath = os.path.join(tar_dir, filename)

        try:
            tarfile.open(tarpath)
        except:
            urlretrieve(url, tarpath)

        with tarfile.open(tarpath) as tar:
            dirname = os.path.join(dataset_dir, tar.getnames()[0])
            if os.path.isdir(dirname):
                shutil.rmtree(dirname)
            tar.extractall(path=dataset_dir)

            cmds_path = os.path.join(dirname, "cmds")
            if os.path.isfile(cmds_path):
                os.remove(cmds_path)

        return dirname

    def load_dataset(dirpath):
        files = []
        filepaths = glob.glob(dirpath + "/*")
        for path in filepaths:
            with open(path, "rb") as f:
                byte_content = f.read()
                str_content = byte_content.decode("utf-8", errors="ignore")
                files.append(str_content)
        return files

    spam_url = "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
    easy_ham_url = "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2"
    hard_ham_dir = "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"

    spam = load_dataset(download_url(spam_url))
    easy_ham = load_dataset(download_url(easy_ham_url))
    hard_ham = load_dataset(download_url(hard_ham_dir))

    X = spam + easy_ham + hard_ham
    y = np.concatenate((
        np.ones(len(spam)),
        np.zeros(len(easy_ham) + len(hard_ham)),
    ))

    return X, y


# Download dataset.
X, y = download_dataset()

# Split dataset into training and testing sets.
X, y = sklearn.utils.shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

print(f"The number of training samples: {len(X_train)}")
print(f"The number of test samples: {len(X_test)}")

The number of training samples: 2436
The number of test samples: 610


### Step 2: Feature extraction (5 points)

Next, we are going to do some data cleaning and feature extraction.

1. Some data cleaning functions have been provided to you. You'll need to implement `lower_letters()`, `convert_num_to_word()`, and `remove_punctuation()`. These functions will convert email to lowercase, replace all numbers with "NUM", and remove punctuation.
2. Convert each email into a feature vector. Your preparation pipeline should transform an email into a (sparse) vector that indicates the presence or absence of each possible word. For example, if all emails only ever contain four words, "Hello," "how," "are," "you," then the email "Hello you Hello Hello you" would be converted into a vector [1, 0, 0, 1] (meaning ["Hello" is present, "how" is absent, "are" is absent, "you" is present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of each word. You may check sklearn's `CountVectorizer` class for reference.

In [13]:
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class EmailCleaner(BaseEstimator, TransformerMixin):

    def __init__(self,
                 no_header=True,
                 to_lowercase=True,
                 url_to_word=True,
                 num_to_word=True,
                 remove_punc=True,
                 ):
        self.no_header = no_header
        self.to_lowercase = to_lowercase
        self.url_to_word = url_to_word
        self.num_to_word = num_to_word
        self.remove_punc = remove_punc

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_cleaned = []
        for email in X:
            if self.no_header:
                email = EmailCleaner.remove_header(email)
            if self.to_lowercase:
                email = EmailCleaner.lower_letters(email)

            email_words = email.split()
            if self.url_to_word:
                email_words = EmailCleaner.convert_url_to_word(email_words)
            if self.num_to_word:
                email_words = EmailCleaner.convert_num_to_word(email_words)
            email = " ".join(email_words)
            if self.remove_punc:
                email = EmailCleaner.remove_punctuation(email)
            X_cleaned.append(email)
        return X_cleaned

    @staticmethod
    def remove_header(email):
        return email[email.index("\n\n"):]

    @staticmethod
    def is_url(s):
        url = re.match(
            "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|"
            "[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", s)
        return url is not None

    @staticmethod
    def convert_url_to_word(words):
        for i, word in enumerate(words):
            if EmailCleaner.is_url(word):
                words[i] = "URL"
        return words

    @staticmethod
    def lower_letters(email):
        # === Your code here ===
        email = email.lower()
        return email
        # ======================

    @staticmethod
    def convert_num_to_word(words):
        # === Your code here ===
        for i, word in enumerate(words):
            if word.isdigit():
                words[i] = "NUM"
        return words
        # ======================

    @staticmethod
    def remove_punctuation(email):
        # === Your code here ===
        email = email.translate(str.maketrans('','',string.punctuation))
        return email
        # ======================

In [5]:
# Here are some unit tests to check your code.
# Your code should at least pass the following tests.
# Feel free to add more tests if you"d like.

# Check lower_letters().
src_string = "Message-Id: <LISTMANAGERSQL-25343"
dst_string = "message-id: <listmanagersql-25343"
assert EmailCleaner.lower_letters(src_string) == dst_string

# Check convert_num_to_word().
src_string = "Date: Wed, 10 Jul 2002"
src_words = src_string.split()
dst_words = ["Date:", "Wed,", "NUM", "Jul", "NUM"]
assert EmailCleaner.convert_num_to_word(src_words) == dst_words

# Check remove_punctuation().
src_string = "superstars -- you'll find investing more fun..."
dst_string = "superstars  youll find investing more fun"
assert EmailCleaner.remove_punctuation(src_string) == dst_string

In [6]:
# Step 1 of pipeline: data cleaning.
email_cleaner = EmailCleaner()

# Step 2 of pipeline: CountVectorizer.
# === Your code here ===
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words = 'english')
# ======================

# Build pipeline.
prepare_pipeline = Pipeline([
    ("email_cleaner", email_cleaner),
    ("count_vectorizer", count_vectorizer),
])

# Run preprocessing.
X_all = X_train + X_test
prepare_pipeline.fit(X_all)
X_all = prepare_pipeline.transform(X_all)
num_train = len(X_train)
X_train = X_all[:num_train]
X_test = X_all[num_train:]

print(X_train.shape)
print(X_test.shape)

(2436, 108540)
(610, 108540)


### Step 3: Train a spam classifier (5 points)

Next, let's build a spam classifier, and train your classifier with the training set.

In [8]:
# === Your code here ===
stop = stopwords.words('english')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
tfidf.fit_transform(X_train)
np.set_printoptions(precision=4)
X_train_tfid = tfidf.transform(X_train)
X_test_tfid = tfidf.transform(X_test)

gs_lr_tfidf = GridSearchCV(estimator = LogisticRegression(random_state = 0, solver = 'liblinear'),
             param_grid={'C': [1,1e1,1e2,1e3,1e4,1e5,1e6,1e7,1e8,1e9,1e10]},
             scoring  = 'accuracy',
             cv = 5,
             verbose = 2,
             n_jobs = -1)

gs_lr_tfidf.fit(X_train_tfid,y_train)
# ======================

Fitting 5 folds for each of 11 candidates, totalling 55 fits


GridSearchCV(cv=5,
             estimator=LogisticRegression(random_state=0, solver='liblinear'),
             n_jobs=-1,
             param_grid={'C': [1, 10.0, 100.0, 1000.0, 10000.0, 100000.0,
                               1000000.0, 10000000.0, 100000000.0, 1000000000.0,
                               10000000000.0]},
             scoring='accuracy', verbose=2)

In [9]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' %gs_lr_tfidf.best_score_)

Best parameter set: {'C': 100000.0} 
CV Accuracy: 0.978


### Step 4: Eval your classifier

Test your classifier with the test set and print the precision and recall.

In [14]:
# === Your code here ===
from sklearn.metrics import *
def printPrecisionRecall(label, ys_test, ys_test_hat, ys_train = None, ys_train_hat = None,command = 1):
    print("%s:" %label)
    # command 1 : for step(3)
    if(command == 1):
        print("Precision: %.4f \t" % precision_score(ys_test,ys_test_hat), end = '')
        print("Recall: %.4f" % recall_score(ys_test, ys_test_hat))
    # command 2 : for step(5)
    elif(command == 2):
        print("Training| Precision: %.4f \t" % precision_score(ys_train,ys_train_hat), end = '')
        print("Recall: %.4f" % recall_score(ys_test, ys_test_hat))
        print("Testing | Precision: %.4f \t" % precision_score(ys_test,ys_test_hat), end = '')
        print("Recall: %.4f" % recall_score(ys_test,ys_test_hat))
    else:
        print("ERROR")
printPrecisionRecall("LogisticRegression", y_test, gs_lr_tfidf.predict(X_test_tfid))
# ======================

LogisticRegression:
Precision: 0.9857 	Recall: 0.9821


### Step 5: Ensemble of classifiers (5 points)

1. Implement 4 new classifiers (in total you have 5 claassifiers now).
2. Use hard or soft voting to ensemble thoses classifiers.
3. Train your ensemble model on the training set. Report training/testing precision and recall.

In [15]:
# === Your code here ===
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
        
    def fit(self, X, y):
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel' ; got (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers' % (len(self.weights), len(self.classifiers)))
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)       
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self
    
    def predict(self, X):
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:
            predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(
                       lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions)
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote
    
    def predict_proba(self, X):
        probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict

clf1 = LogisticRegression(penalty = 'l2', C = 1e5, solver = 'liblinear', random_state = 0)
clf2 = DecisionTreeClassifier(max_depth = 10, criterion = 'entropy',random_state = 0)
clf3 = KNeighborsClassifier(n_neighbors = 1, p =2, metric='minkowski')
clf4 = SVC()
clf5 = RandomForestClassifier()
clf_labels = ['SLR', 'DecisionTree', 'KNN','SVM','RandomForest']

clf_list = [clf1,clf2,clf3,clf4,clf5]

mv_clf = MajorityVoteClassifier(classifiers = clf_list)
clf_labels += ["Majority voting"]
clf_all  = clf_list + [mv_clf]
for clf, label in zip(clf_all, clf_labels):
    clf.fit(X_train_tfid, y_train)
    y_train_hat = clf.predict(X_train_tfid)
    y_test_hat = clf.predict(X_test_tfid)
    printPrecisionRecall(label = label, 
                         ys_test = y_test, 
                         ys_test_hat = y_test_hat,
                         ys_train = y_train, 
                         ys_train_hat = y_train_hat,
                         command = 2)

# ======================

SLR:
Training| Precision: 1.0000 	Recall: 0.9821
Testing | Precision: 0.9857 	Recall: 0.9821
DecisionTree:
Training| Precision: 0.9701 	Recall: 0.9214
Testing | Precision: 0.9416 	Recall: 0.9214
KNN:
Training| Precision: 1.0000 	Recall: 0.9464
Testing | Precision: 0.9431 	Recall: 0.9464
SVM:
Training| Precision: 0.9982 	Recall: 0.9643
Testing | Precision: 0.9747 	Recall: 0.9643
RandomForest:
Training| Precision: 1.0000 	Recall: 0.9821
Testing | Precision: 0.9786 	Recall: 0.9821
Majority voting:
Training| Precision: 1.0000 	Recall: 0.9821
Testing | Precision: 0.9821 	Recall: 0.9821
