1. Try to build a classifier for the MNIST dataset that achieves over 97%
accuracy on the test set. Hint: the KNeighborsClassifier works quite
well for this task; you just need to find good hyperparameter values (try
a grid search on the weights and n_neighbors hyperparameters).

In [1]:
#Load dataset first
from keras.datasets import mnist
# load_data() yields the train and test splits, each as an (images, labels) pair.
(train_X, train_y), (test_X, test_y) = mnist.load_data()
# Flatten every 28x28 image into a 784-long feature vector. The row count is read
# from the array itself instead of being hard-coded to 60000, matching how the
# test set is handled on the next line.
train_X = train_X.reshape(train_X.shape[0], 28 * 28)
test_X = test_X.reshape(test_X.shape[0], 28 * 28)




In [11]:
#Use KNeighborsClassifier and grid search to build classifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Search space: both neighbor-weighting schemes crossed with k = 2 and k = 3.
param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [2, 3]}]

# Exhaustive search with 5-fold cross-validation; verbose=3 logs the score and
# timing of every individual fit.
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid, cv=5, verbose=3)
grid_search.fit(train_X, train_y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END ....n_neighbors=2, weights=uniform;, score=0.964 total time=   9.6s
[CV 2/5] END ....n_neighbors=2, weights=uniform;, score=0.963 total time=   8.4s
[CV 3/5] END ....n_neighbors=2, weights=uniform;, score=0.962 total time=   9.9s
[CV 4/5] END ....n_neighbors=2, weights=uniform;, score=0.963 total time=  11.0s
[CV 5/5] END ....n_neighbors=2, weights=uniform;, score=0.964 total time=  10.3s
[CV 1/5] END ...n_neighbors=2, weights=distance;, score=0.970 total time=   9.6s
[CV 2/5] END ...n_neighbors=2, weights=distance;, score=0.971 total time=   9.6s
[CV 3/5] END ...n_neighbors=2, weights=distance;, score=0.969 total time=   9.7s
[CV 4/5] END ...n_neighbors=2, weights=distance;, score=0.969 total time=   9.7s
[CV 5/5] END ...n_neighbors=2, weights=distance;, score=0.968 total time=   9.1s
[CV 1/5] END ....n_neighbors=3, weights=uniform;, score=0.972 total time=   9.8s
[CV 2/5] END ....n_neighbors=3, weights=uniform;,

In [12]:
# Hyperparameter combination that obtained the best mean cross-validation score.
grid_search.best_params_

{'n_neighbors': 3, 'weights': 'distance'}

In [13]:
# Mean cross-validation score reached by the best hyperparameter combination.
grid_search.best_score_

0.9711166666666665

In [15]:
from sklearn.metrics import accuracy_score

# test_X was already flattened to one 784-long row per image in the data-loading
# cell, so it is passed to the classifier as is; repeating the reshape here was
# a no-op.
pred_y = grid_search.predict(test_X)
accuracy_score(test_y, pred_y)

0.9717

2. Write a function that can shift an MNIST image in any direction (left,
right, up, or down) by one pixel. Then, for each image in the training
set, create four shifted copies (one per direction) and add them to the
training set. Finally, train your best model on this expanded training set
and measure its accuracy on the test set. You should observe that your
model performs even better now! This technique of artificially growing
the training set is called data augmentation or training set expansion.

In [19]:
from scipy.ndimage import shift
import numpy as np

def shift_image(image, dx, dy):
    """Return a flattened copy of a 28x28 image translated by (dx, dy) pixels.

    Args:
        image: Array holding 784 pixel values; it is reshaped to 28x28.
        dx: Offset applied along the column axis.
        dy: Offset applied along the row axis.

    Returns:
        The shifted image flattened back to one dimension. Pixels that enter
        the frame from outside are filled with 0.
    """
    grid = image.reshape((28, 28))
    offsets = [dy, dx]  # one offset per axis: rows first, then columns
    moved = shift(grid, offsets, mode="constant", cval=0)
    return moved.reshape([-1])

# Start from the original training set, then append one shifted copy of every
# image for each of the four one-pixel offsets listed below.
X_train_augmented = list(train_X)
y_train_augmented = list(train_y)

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(train_X, train_y):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)  # a shifted digit keeps its label

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Shuffle so the shifted copies are interleaved with the originals. A fixed seed
# makes the permutation identical on every run, keeping the notebook reproducible.
shuffle_idx = np.random.default_rng(42).permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

# Refit a fresh classifier with the hyperparameters selected by the grid search.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)

knn_clf.fit(X_train_augmented, y_train_augmented)

In [21]:
pred_y = knn_clf.predict(test_X)
accuracy_score(test_y, pred_y)

0.9763

3. Tackle the Titanic dataset. A great place to start is on Kaggle.

In [5]:
# Download dataset
import os
import urllib.request

# Remote folder that serves the Titanic CSV files, and the local folder they are
# cached in.
url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"
download_path = "datasets/titanic"

os.makedirs(download_path, exist_ok=True)

for filename in ("train.csv", "test.csv"):
    filepath = os.path.join(download_path, filename)
    if os.path.isfile(filepath):
        continue  # already downloaded on a previous run
    print("Downloading", filename)
    urllib.request.urlretrieve(url + filename, filepath)

In [6]:
#Load datasets
import pandas as pd

# Build both CSV paths inside the download folder, then read each file into a
# DataFrame.
train_path, test_path = (
    os.path.join(download_path, csv_name) for csv_name in ("train.csv", "test.csv")
)
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [7]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Process data
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numerical columns: fill missing values with the column median, then
# standardize each feature.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [9]:
# Categorical columns
from sklearn.preprocessing import OneHotEncoder
# Fill missing categories with the most frequent value, then one-hot encode into
# a dense array. The encoder's `sparse` argument was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so the new name is tried first and the
# old one is used only on releases that do not accept it.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # older scikit-learn: `sparse_output` is an unknown argument
    cat_encoder = OneHotEncoder(sparse=False)

cat_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("cat_encoder", cat_encoder),
    ])


In [10]:
from sklearn.compose import ColumnTransformer

# Columns routed to each sub-pipeline.
num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

# Apply num_pipeline to the numerical columns and cat_pipeline to the
# categorical ones.
preprocess_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [12]:
# Target column first, then the feature matrix: the preprocessing is fitted on
# the selected training columns and applied to them in a single step.
y_train = train_data["Survived"]
X_train = preprocess_pipeline.fit_transform(train_data[[*num_attribs, *cat_attribs]])



In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; random_state fixes the seed so runs are repeatable.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

In [15]:
# Reuse the preprocessing already fitted on the training data (transform only,
# no refit), then predict a label for every test row.
X_test = preprocess_pipeline.transform(test_data[num_attribs + cat_attribs])
y_pred = forest_clf.predict(X_test)

In [16]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training set; the cell displays the mean of
# the fold scores.
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8137578027465668

4. Build a spam classifier.

In [18]:
# Download data
import os
import tarfile
import urllib.request

# Archive locations in the SpamAssassin public corpus and the local folder the
# archives are cached and extracted in.
root_url = "http://spamassassin.apache.org/old/publiccorpus/"
ham_url = root_url + "20030228_easy_ham.tar.bz2"
spam_url = root_url + "20030228_spam.tar.bz2"
spam_path = os.path.join("datasets", "spam")

os.makedirs(spam_path, exist_ok=True)

for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
    path = os.path.join(spam_path, filename)

    # Download only when the archive is not cached yet.
    if not os.path.isfile(path):
        urllib.request.urlretrieve(url, path)

    # The context manager closes the archive even when extraction fails.
    with tarfile.open(path) as tar_bz2_file:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: on Python versions that
            # support extraction filters, reject members that would be written
            # outside spam_path (absolute paths, "..", unsafe links).
            tar_bz2_file.extractall(path=spam_path, filter="data")
        else:
            tar_bz2_file.extractall(path=spam_path)

In [23]:
import email
import email.policy

# Folders expected under spam_path once the two archives have been extracted.
ham_dir = os.path.join(spam_path, "easy_ham")
spam_dir = os.path.join(spam_path, "spam")

# Keep only entries whose name is longer than 20 characters, in sorted order
# (presumably this skips non-message helper files -- confirm against the corpus).
ham_filenames = sorted(name for name in os.listdir(ham_dir) if len(name) > 20)
spam_filenames = sorted(name for name in os.listdir(spam_dir) if len(name) > 20)

def load_email(dir, filename):
    """Parse one message file located under ``spam_path``.

    Args:
        dir: Name of the sub-folder of ``spam_path`` that holds the file
            (for example ``"easy_ham"``).
        filename: Name of the message file inside that sub-folder.

    Returns:
        The message object produced by ``BytesParser`` using
        ``email.policy.default``.
    """
    # Import the parser explicitly: ``import email`` does not load the
    # ``email.parser`` submodule, so ``email.parser.BytesParser`` only resolved
    # when some other module happened to have imported it already.
    from email.parser import BytesParser

    with open(os.path.join(spam_path, dir, filename), "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)

# Parse every listed file; the first argument is the sub-folder name under spam_path.
ham_emails = [load_email('easy_ham', name) for name in ham_filenames]
spam_emails = [load_email('spam', name) for name in spam_filenames]

In [26]:
# Split data

import numpy as np
from sklearn.model_selection import train_test_split

# Object array of parsed emails: all ham messages first, then all spam messages.
X = np.array(ham_emails + spam_emails, dtype=object)
# Labels in the same order: 0 for ham, 1 for spam.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the emails as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
import html2text

def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list of
    sub-messages is rendered as ``multipart(<part>, <part>, ...)``, each part
    being described recursively. Any other message is described by its
    content type.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf message: its content type is the whole description.
        return email.get_content_type()

    described_parts = ", ".join(map(get_email_structure, parts))
    return "multipart({})".format(described_parts)

def convert_html_to_text(html_content):
    """Convert an HTML string to text with a default-configured html2text converter."""
    converter = html2text.HTML2Text()
    return converter.handle(html_content)

def email_to_text(email):
    """Extract the textual content of a message.

    The parts of the message are walked in order. The content of the first
    ``text/plain`` part is returned as soon as it is found. If there is none,
    the last ``text/html`` part seen is converted with ``convert_html_to_text``.

    Args:
        email: Message object exposing ``walk()``; each part must provide
            ``get_content_type()``, ``get_content()`` and ``get_payload()``.

    Returns:
        The extracted text, or ``None`` when the message has no usable
        ``text/plain`` or ``text/html`` part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Decoding problems (e.g. an unknown charset): fall back to the raw
            # payload. Only Exception is caught so that KeyboardInterrupt and
            # SystemExit still stop a long-running conversion.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return convert_html_to_text(html)
    return None


In [35]:
# NLTK is optional: when it cannot be imported, `stemmer` is set to None, so
# code that uses `stemmer` has to check for None first.
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    # Demo: print the stem produced for several related words.
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [37]:
%pip install -q -U urlextract

Note: you may need to restart the kernel to use updated packages.


In [38]:
# urlextract is optional: when it cannot be imported, `url_extractor` is set to
# None, so code that uses `url_extractor` has to check for None first.
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
    # Demo: list the URLs detected in a sample sentence.
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [68]:
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each email to a Counter of its words.

    For every email the text is extracted with ``email_to_text``, lower-cased,
    URLs are replaced by `` URL `` (when ``url_extractor`` is available),
    numbers are replaced by ``NUMBER``, runs of non-word characters become a
    single space, and the remaining words are counted. When ``stemmer`` is
    available the counts are merged per word stem.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def _count_words(self, email):
        """Return the Counter of (optionally stemmed) words for one email."""
        text = (email_to_text(email) or "").lower()

        if url_extractor is not None:
            # Longest URLs are replaced first, so that a URL which is a substring
            # of a longer one does not break up the longer match.
            found_urls = sorted(set(url_extractor.find_urls(text)), key=len, reverse=True)
            for found in found_urls:
                text = text.replace(found, " URL ")

        text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)  # numbers
        text = re.sub(r'\W+', ' ', text, flags=re.M)  # punctuation
        counts = Counter(text.split())

        if stemmer is None:
            return counts

        # Merge the counts of all words that share the same stem.
        stemmed_counts = Counter()
        for word, occurrences in counts.items():
            stemmed_counts[stemmer.stem(word)] += occurrences
        return stemmed_counts

    def transform(self, X, y=None):
        """Return an array holding one word Counter per email in ``X``."""
        return np.array([self._count_words(email) for email in X])

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Turn word Counters into a sparse matrix of per-word counts.

    ``fit`` learns a vocabulary of at most ``vocabulary_size`` words and stores
    it in ``vocabulary_`` as ``{word: column}`` with columns starting at 1.
    ``transform`` produces one row per Counter; column 0 accumulates the counts
    of every word that is not in the vocabulary.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Build ``vocabulary_`` from the most common words found in ``X``."""
        capped_totals = Counter()
        for counts in X:
            for word, occurrences in counts.items():
                # A single email contributes at most 10 per word to the ranking.
                capped_totals[word] += min(occurrences, 10)

        ranked = capped_totals.most_common()[:self.vocabulary_size]
        # Column 0 is reserved for out-of-vocabulary words, so numbering starts at 1.
        self.vocabulary_ = {word: column for column, (word, _) in enumerate(ranked, start=1)}
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``."""
        row_indices, col_indices, values = [], [], []
        column_of = self.vocabulary_.get

        for row_index, counts in enumerate(X):
            for word, occurrences in counts.items():
                row_indices.append(row_index)
                col_indices.append(column_of(word, 0))  # unknown words go to column 0
                values.append(occurrences)

        return csr_matrix(
            (values, (row_indices, col_indices)),
            shape=(len(X), self.vocabulary_size + 1),
        )

In [69]:
import re
from collections import Counter

# Sanity check: run both transformers on the first three training emails with a
# 10-word vocabulary.
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)

vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
# Column 0 counts the words outside the vocabulary; columns 1..10 follow the
# numbering stored in vocabulary_.
X_few_vectors.toarray()

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],
       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]], dtype=int32)

In [70]:
# Learned mapping from each vocabulary word to its column index in the vectors.
vocab_transformer.vocabulary_

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'url': 5,
 'all': 6,
 'in': 7,
 'christian': 8,
 'on': 9,
 'by': 10}

In [71]:
from sklearn.pipeline import Pipeline

# Full email preprocessing: word counting followed by vectorization with the
# default vocabulary size. NOTE: this rebinds `preprocess_pipeline`, which the
# Titanic exercise used for a different object.
preprocess_pipeline = Pipeline(steps=[
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

# Fit the vocabulary on the training emails and vectorize them in one step.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation of logistic regression on the vectorized training
# emails; the cell displays the mean of the three fold scores.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.984) total time=   0.2s
[CV] END ................................ score: (test=0.980) total time=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] END ................................ score: (test=0.991) total time=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s finished


0.985

In [73]:
from sklearn.metrics import precision_score, recall_score

# Reuse the preprocessing fitted on the training emails: the test emails are only
# transformed, never used for fitting.
X_test_transformed = preprocess_pipeline.transform(X_test)

# Fresh classifier with the same settings as in the cross-validation cell, now
# fitted on the whole training set.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

# Precision and recall on the held-out emails; labels were built with spam = 1.
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 96.88%
Recall: 97.89%
