In [432]:
from math import sqrt

from sklearn.datasets import fetch_mldata
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
import os
import pandas as pd
import random
import re

In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

def read_csv(path, file_name):
    csv_path = os.path.join(path, file_name)
    return pd.read_csv(csv_path)

def plot_number(digit):
    image = digit.reshape(28, 28)
    plt.imshow(
        image,
        cmap=matplotlib.cm.binary,
        interpolation='nearest',
    )
    plt.axis('off')
    plt.show()
    
def triple_digit_number(n):
    if n < 10:
        return '00{}'.format(n)
    elif n < 100:
        return '0{}'.format(n)
    else:
        return str(n)

def print_image(image, dimension=None):
    if dimension:
        width, height = dimension
    else:
        width = int(sqrt(len(image)))
        height = width
    for h in range(0, height):
        row = image[(h * width):((h + 1) * width)]
        print(''.join([triple_digit_number(n) for n in row]))

In [3]:
def empty_pixels(n):
    return [0 for i in range(0, n)]

def shift_image_horizontal(image, n_pixels, opts={}):
    dimension = opts.get('dimension', None)
    direction = opts.get('direction', 'left')
    
    if dimension:
        width, height = dimension
    else:
        width = int(sqrt(len(image)))
        height = width
    new_image = image.copy()
    
    for h in range(0, height):
        diff = width - n_pixels
        
        cut_idx_start = h * width
        if direction == 'left':
            cut_idx_start += n_pixels
        cut_idx_end = cut_idx_start + diff
        
        paste_idx_start = h * width
        if direction != 'left':
            paste_idx_start += n_pixels
        paste_idx_end = paste_idx_start + diff
        
        if direction == 'left':
            fill_idx_start = paste_idx_end
            fill_idx_end = paste_idx_end + n_pixels
        else:
            fill_idx_start = paste_idx_start - n_pixels
            fill_idx_end = paste_idx_start
        
        new_image[paste_idx_start:paste_idx_end] = new_image[cut_idx_start:cut_idx_end]
        new_image[fill_idx_start:fill_idx_end] = empty_pixels(n_pixels)
    return new_image

def shift_image_vertical(image, n_pixels, opts={}):
    dimension = opts.get('dimension', None)
    direction = opts.get('direction', 'down')
    
    if dimension:
        width, height = dimension
    else:
        width = int(sqrt(len(image)))
        height = width
    new_image = image.copy()
    
    size = (height - n_pixels) * width
    
    if direction == 'down':
        cut_idx_start = 0
    else:
        cut_idx_start = n_pixels * width
    cut_idx_end = cut_idx_start + size
    
    if direction == 'down':
        paste_idx_start = n_pixels * width
    else:
        paste_idx_start = 0
    paste_idx_end = paste_idx_start + size
    
    if direction == 'down':
        fill_idx_start = cut_idx_start
    else:
        fill_idx_start = paste_idx_end
    fill_idx_end = fill_idx_start + (n_pixels * width)
    
    new_image[paste_idx_start:paste_idx_end] = new_image[cut_idx_start:cut_idx_end]
    new_image[fill_idx_start:fill_idx_end] = empty_pixels(fill_idx_end - fill_idx_start)
    
    return new_image

def shift_image(image, n_pixels, opts={}):
    direction = opts.get('direction', 'left')
    if direction in ['left', 'right']:
        return shift_image_horizontal(image, n_pixels, opts)
    else:
        return shift_image_vertical(image, n_pixels, opts)

In [1132]:
from sklearn.model_selection import StratifiedShuffleSplit

def stratify(data, column, opts={}):
    stratified_split = StratifiedShuffleSplit(
        n_splits=opts.get('n_splits', 1),
        test_size=opts.get('test_size', 0.2),
        random_state=42,
    )

    gen = stratified_split.split(
        data,
        data[column],
    )
    
    training_sets = []
    test_sets = []

    for training_indices, test_indices in gen:
        training_sets.append(data.iloc[training_indices])
        test_sets.append(data.iloc[test_indices])
        
    return training_sets, test_sets

In [4]:
mnist = fetch_mldata('MNIST original', data_home='datasets')

In [5]:
X, y = mnist['data'], mnist['target']
training_idx = 60000
X_train, X_test = X[:training_idx], X[training_idx:]
y_train, y_test = y[:training_idx], y[training_idx:]

shuffle_index_train = np.random.permutation(len(X_train))
shuffle_index_test = np.random.permutation(len(X_test))

X_train, y_train = X_train[shuffle_index_train], y_train[shuffle_index_train]
X_test, y_test = X_test[shuffle_index_test], y_test[shuffle_index_test]

# 1.

Try to build a classifier for the MNIST dataset that achieves over
97% accuracy on the test set.

Hint: the `KNeighborsClassifier` works quite
well for this task; you just need to find good hyperparameter values
(try a grid search on the weights and n_neighbors hyperparameters).

## Results
- Random: 10% accuracy
- 5 n_neighbors, uniform weights: 96.88% accuracy
- 7, uniform: 96.94%
- 5, distance: 96.90999% accuracy
- 7, distance: 96.999999999999997%
- 9, distance: 96.730000000000005%
- **KNN 8, distance: 97.060000000000002%** (Winner)

### Using Augmented Data
- KNN 8 n_neighbors, distance weights = 97.28%

In [6]:
knn_clf = KNeighborsClassifier(n_neighbors=8, weights='distance')

In [None]:
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='distance')

In [None]:
predictions = knn_clf.predict(X_test)

In [None]:
accuracy_score(predictions, y_test)

In [None]:
param_grid = [
    {
        'n_neighbors': [10],
        'weights': ['distance'],
    },
]

knn_model = KNeighborsClassifier()
knn_grid_search = GridSearchCV(
    knn_model,
    param_grid,
    cv=2,
    scoring='accuracy',
    verbose=10,
)

In [None]:
knn_grid_search.fit(X_train, y_train)

In [None]:
knn_grid_search.best_params_
knn_grid_search.best_estimator_
knn_grid_search.cv_results_
best_model = knn_grid_search.best_estimator_
best_predictions = best_model.predict(X_test)

In [None]:
accuracy_score(best_predictions, y_test)

# 2.

Write a function that can shift an MNIST image in any direction
(left, right, up, or down) by one pixel.

Then, for each image in the training set, create four shifted
copies (one per direction) and add them to the training set.

Finally, train your best model on this expanded training set and
measure its accuracy on the test set.

You should observe that your model performs even better now!
This technique of artificially growing the training set is called
**data augmentation** or **training set expansion**.

In [None]:
X_train_augmented = []
directions = ['down', 'left', 'right', 'up']

for image in X_train:
    direction = random.choice(directions)
    new_image = shift_image(image, 1, { 'direction': direction })
    image_array = np.array([new_image])
    if len(X_train_augmented) == 0:
        X_train_augmented = image_array
    else:
        X_train_augmented = np.append(X_train_augmented, image_array, axis=0)

In [None]:
X_train_combined = np.concatenate((X_train, X_train_augmented))
y_train_combined = np.concatenate((y_train, y_train))

In [None]:
knn_clf.fit(X_train_combined, y_train_combined)

In [None]:
predictions = knn_clf.predict(X_test)

In [None]:
accuracy_score(predictions, y_test)

# 3.
Tackle the *Titanic* dataset.
A great place to start is on
[Kaggle](https://www.kaggle.com/c/titanic).

[Feature engineering](https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/).

The sinking of the RMS Titanic is one of the most infamous shipwrecks in history.  On April 15, 1912, during her
maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew.
This sensational tragedy shocked the international community and led to better safety regulations for ships.

One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the
passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of
people were more likely to survive than others, such as women, children, and the upper-class.

In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In
particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy.

In [1065]:
TITANIC_PATH = 'datasets/titanic'
gender_submission = read_csv(TITANIC_PATH, 'gender_submission.csv')
train_original = read_csv(TITANIC_PATH, 'train.csv')
test_original = read_csv(TITANIC_PATH, 'test.csv')
train = train_original.copy()
test = test_original.copy()

In [1066]:
test['Survived'] = gender_submission['Survived']
combined = pd.concat([train, test])

In [1067]:
def extract_title(x):
    return re.findall('\w+\.', x)[0]

combined['Title'] = list(map(lambda x: extract_title(x), combined['Name']))
all_titles = sorted(list(set(combined['Title'])))

# train['Title'] = list(map(lambda x: extract_title(x), train['Name']))
# test['Title'] = list(map(lambda x: extract_title(x), test['Name']))

# train_title_set = set(train['Title'])
# test_title_set = set(test['Title'])
# missing_titles = [x for x in train_title_set if x not in test_title_set]
# all_titles = list(train_title_set.union(test_title_set))

In [1068]:
from collections import Counter

def extract_strings(x):
    return re.findall('[A-Z]+', x)
    
# cabin_values = list(set(train['Cabin'][train['Cabin'].notnull()].values))
# cabin_values_letters = [extract_strings(x) for x in cabin_values]
# cabin_letters = sorted(list(set([item for sublist in cabin_values_letters for item in sublist])))

import math

def deck_letter(x):
    if type(x) != str and math.isnan(x):
        return ''.join(cabin_letters)
    letters = extract_strings(x)
    most_common, num_most_common = Counter(letters).most_common(1)[0]
    return most_common

combined['Deck'] = list(map(lambda x: deck_letter(x), combined['Cabin']))
all_decks = sorted(list(set(combined['Deck'])))

# train['Deck'] = list(map(lambda x: deck_letter(x), train['Cabin']))
# test['Deck'] = list(map(lambda x: deck_letter(x), test['Cabin']))

# train_deck_set = set(train['Deck'])
# test_deck_set = set(test['Deck'])
# missing_decks = [x for x in train_deck_set if x not in test_deck_set]

In [1069]:
from sklearn.model_selection import StratifiedShuffleSplit

stratified_split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)

gen = stratified_split.split(
    combined,
    combined['Sex'],
)

for tr, te in gen:
    training_indices = tr
    test_indices = te
    training_set = combined.iloc[training_indices]
    test_set = combined.iloc[test_indices]

In [1070]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler

class DateFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.attribute_names].values
    
class CabinTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        copy = X['Cabin'].copy()
        copy = copy.fillna('Z')
        return copy.apply(self.transform_cabin).values
    
    def transform_cabin(self, x):
        return ''.join(re.findall('[A-Z]+', x))
    
class CategoryEmptyFiller(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        empty = 'EMPTY'
        copy = X.copy()
        copy[self.attribute_names] = copy[self.attribute_names].fillna(empty)
        return copy.append(
            pd.DataFrame([['EMPTY' for i in self.attribute_names]], columns=self.attribute_names)
        )

class CategoryMissingTypeFiller(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_name, missing_types):
        self.attribute_name = attribute_name
        self.missing_types = missing_types
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        copy = X.copy()
        return copy.append(
            pd.DataFrame(
                [[t] for t in self.missing_types],
                columns=[self.attribute_name],
            )
        )
    
class CategoryEmptyFillerComplete(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[:-1]
    
class CategoryMissingTypeFillerComplete(BaseEstimator, TransformerMixin):
    def __init__(self, missing_types):
        self.missing_types = missing_types
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        last_index = X.shape[0] - len(self.missing_types)
        return X[:last_index]

def cabin_pipeline():
    return Pipeline([
        ('cabin_transformer', CabinTransformer()),
        ('label_binarizer', LabelBinarizer(sparse_output=True)),
    ])

def category_pipeline(attribute):
    return Pipeline([
        ('selector', DateFrameSelector(attribute)),
        ('label_binarizer', LabelBinarizer(sparse_output=True)),
    ])

def category_pipeline_with_empty(attribute):
    return Pipeline([
        ('filler', CategoryEmptyFiller(attribute)),
        ('selector', DateFrameSelector(attribute)),
        ('label_binarizer', LabelBinarizer(sparse_output=True)),
        ('filler_complete', CategoryEmptyFillerComplete()),
    ])

def category_pipeline_with_missing_types(attribute, missing_types):
    return Pipeline([
        ('filler', CategoryMissingTypeFiller(attribute, missing_types)),
        ('selector', DateFrameSelector(attribute)),
        ('label_binarizer', LabelBinarizer(sparse_output=True)),
        ('filler_complete', CategoryMissingTypeFillerComplete(missing_types)),
    ])

def numerical_pipeline(attributes, strategy='mean'):
    return Pipeline([
        ('selector', DateFrameSelector(attributes)),
        ('mean', Imputer(strategy=strategy)),
        ('standard_scalar', StandardScaler()),
    ])

def encoder_pipelines(attributes=[]):
    return [('cat_pipe_{}'.format(attr), category_pipeline(attr)) for attr in attributes]

In [1089]:
# Name, Ticket (need text transformation)
# Deck
# Family size
# Age class
# Fare per person

categorical_attributes = ['Sex']
numerical_attributes = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

transformer_list = [
#     ('cabin_pipeline', cabin_pipeline()),
    ('numerical_attributes', numerical_pipeline(numerical_attributes)),
    ('embarked', category_pipeline_with_empty(['Embarked'])),
    ('missing_types_title', category_pipeline_with_missing_types('Title', all_titles)),
    ('missing_types_deck', category_pipeline_with_missing_types('Deck', missing_decks)),
] + encoder_pipelines(categorical_attributes)
fp = FeatureUnion(transformer_list=transformer_list)
train_prepared = fp.fit_transform(training_set)

# category_pipeline_with_missing_types('Title', all_titles).fit_transform(train)
# category_pipeline_with_missing_types('Title', all_titles).fit_transform(test)
# LabelBinarizer(sparse_output=True).fit_transform(CategoryMissingTypeFiller('Title', missing_titles).fit_transform(test)['Title'])

In [1090]:
# Benchmark
# Guessing everyone died: 61.616% accuracy
# Guessing everyone survived: 38.3838% accuracy

from sklearn.linear_model import SGDClassifier

# train_labels = train['Survived']
train_labels = training_set['Survived']
# print(list(train_labels == 0).count(True) / len(train_labels))
# print(list(train_labels == 1).count(True) / len(train_labels))

sgdc = SGDClassifier(random_state=42)
sgdc.fit(train_prepared, train_labels)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [1091]:
# SGD Classifier: 77.51%
# SGD Classifier: 85.17% w/ Deck
# SGD Classifier: 71.05% w/ Title
# --
# Stratify
# 83.21% w/ sex alone
# 80.15% w/ numerical attributes
# 77.48% w/ embarked
# 80.91% w/ title

test_prepared = fp.fit_transform(test_set)
# test_labels = gender_submission['Survived']
test_labels = test_set['Survived']

predictions = sgdc.predict(test_prepared)

from sklearn.metrics import accuracy_score

print(train_prepared.shape[0], train_prepared.shape[1])
print(test_prepared.shape[0], test_prepared.shape[1])
print(accuracy_score(test_labels, predictions))

1047 37
262 37
0.782442748092


# 4.

Build a spam classifier (a more challenging exercise):

- Download examples of spam and ham from
[public datasets](https://spamassassin.apache.org/publiccorpus/)

- Unzip the datasets and familiarize yourself with the data format

- Split the datasets into a training set and a test set

- Write a data preparation pipeline to convert each email into a
feature vector. Your preparation pipeline should transform an email
into a (sparse) vector indicating the presence or absence of each
possible word. For example, if all emails only ever contain four
words, “Hello,” “how,” “are,” “you,” then the email “Hello you Hello
Hello you” would be converted into a vector [1, 0, 0, 1] (meaning
[“Hello” is present, “how” is absent, “are” is absent, “you” is
present]), or [3, 0, 0, 2] if you prefer to count the number of
occurrences of each word.

- You may want to add hyperparameters to your preparation pipeline to
control whether or not to strip off email headers, convert each email
to lowercase, remove punctuation, replace all URLs with “URL,” replace
all numbers with “NUMBER,” or even perform stemming (i.e., trim off
word endings; there are Python libraries available to do this).

- Then try out several classifiers and see if you can build a great
spam classifier, with both high recall and high precision

In [1269]:
# Helper methods

def split_punctuation(word):
    arr = re.split('\W|\d', word)
    return list(filter(lambda x: len(x) >= 1, arr))

def split_line(line):
    arr = line.split('\t')
    if len(arr) != 2:
        return arr
    text = arr[1]
    
    [all_words.add(w.lower()) for w in split_punctuation(text)]
    return [text, arr[0]]

def flatten(arr):
    return [item for sublist in arr for item in sublist]

def read_table(path, file_name):
    file_path = os.path.join(path, file_name)
    return pd.read_table(file_path, delimiter='\n')

In [1270]:
# Load data

data_file = open('datasets/smsspam/SMSSpamCollection', 'r')
text = data_file.read()
lines = text.split('\n')
data = pd.DataFrame(data=[split_line(line) for line in lines], columns=['Text', 'Label'])[:-1]

stop_words_file = open('datasets/smsspam/terrier-stop.txt', 'r')
stop_words = stop_words_file.read().split('\n')
stop_words_dict = { word: True for word in stop_words }

In [1381]:
labels = set(data['Label'])

def valid_word_to_use(word):
    return word not in stop_words_dict and word not in labels

def clean_line(line):
    return [x.lower() for x in split_punctuation(line) if valid_word_to_use(x.lower())]

# 0. Create word dictionary
word_dictionary = set(sorted(
    [x.lower() for x in set(split_punctuation(text)) if valid_word_to_use(x.lower())]
))
print(len(word_dictionary))

# Pipeline
# 1. Lowercase
# 2. Remove punctuation
# 3. Remove stop words
# 4. Create feature vector for all words in word dictionary with count

class LineToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, word_list, total_word_count=True):
        self.total_word_count = total_word_count
        self.word_list = word_list
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        copy = X.copy()
        rows = list(map(lambda x: self.transform_row(x), copy['Text'].values))
        return rows
    
    def transform_row(self, line):
        clean = clean_line(line)
        line_dict = { word: clean.count(word) for word in clean }
        vector = [self.check_word(word, line_dict) for word in self.word_list]
        return vector
    
    def check_word(self, word, mapping):
        count = mapping.get(word, 0)
        if self.total_word_count:
            return count
        elif count >= 1:
            return 1
        return count

7464


In [1386]:
pipeline = LineToVectorTransformer(word_dictionary, total_word_count=False)

training_sets, test_sets = stratify(data, 'Label')
train = training_sets[0]
test = test_sets[0]
print(len(train), len(test))
train_prepared = pipeline.fit_transform(train)

4459 1115


In [1387]:
train_labels = train['Label']
sgdc = SGDClassifier(random_state=42)
sgdc.fit(train_prepared, train_labels)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [1388]:
test_prepared = pipeline.fit_transform(test)
predictions = sgdc.predict(test_prepared)

In [1389]:
# Total word count
# spam
# Precision: 0.9507042253521126, Recall: 0.9060402684563759
# ham
# Precision: 0.9856115107913669, Recall: 0.9927536231884058

# Only count once for each word existence
# spam
# Precision: 0.951048951048951, Recall: 0.912751677852349
# ham
# Precision: 0.9866255144032922, Recall: 0.9927536231884058

from sklearn.metrics import precision_score, recall_score

label_list = list(labels)

precision_scores = precision_score(test['Label'], predictions, average=None, labels=label_list)
recall_scores = recall_score(test['Label'], predictions, average=None, labels=label_list)

for index, label in enumerate(list(labels)):
    print(label)
    print('Precision: {}, Recall: {}'.format(precision_scores[index], recall_scores[index]))

spam
Precision: 0.951048951048951, Recall: 0.912751677852349
ham
Precision: 0.9866255144032922, Recall: 0.9927536231884058
