In [None]:
#: the usual imports
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

plt.style.use('fivethirtyeight')

# Classification

* Binary
* Supervised
* Classification

### Last Time

* Nearest Neighbor
* Decision boundaries
* **Train / Test split**
* KNN

# Implementing KNN

### Banknote authentication

Question: "Is a $20 bill counterfeit or legitimate?"

Features: four numerical scores from scans of bills.

Note: more than two attributes!  (we did two for easy visualization)

In [None]:
# Load the banknote data from a local CSV file; the bare name on the
# last line makes the notebook display the resulting table.
banknotes = Table.read_table('banknote.csv')
banknotes

In [None]:
# Lookup table pairing each Class value with a plotting color:
# Class 1 -> 'darkblue', Class 0 -> 'gold'.
color_table = Table().with_columns(
    'Class', make_array(1, 0),
    'Color', make_array('darkblue', 'gold')
)

In [None]:
# Attach a 'Color' column by matching each row's Class against color_table.
# This cell reassigns `banknotes` from itself, so it is guarded: re-running
# it would otherwise join color_table again onto the already-augmented table.
if 'Color' not in banknotes.labels:
    banknotes = banknotes.join('Class', color_table)

In [None]:
# Scatter plot of the WaveletVar and WaveletCurt columns, with each point
# colored according to the 'Color' column.
banknotes.scatter('WaveletVar', 'WaveletCurt', colors='Color')

### Discussion Question

Imagine we use a nearest neighbor (NN) classifier with these two features.  What sort of error rate might you expect on the test data?

A) 0%

B) 1%

C) 10%

D) 100%

In [None]:
# Same kind of plot for a different pair of features: WaveletSkew and
# Entropy, again colored by the 'Color' column.
banknotes.scatter('WaveletSkew', 'Entropy', colors='Color')

## Multiple attributes

N-Dimensional space is no problem for the computer.

![nn8.png](nn8.png)

Look how much the cluster overlap has been reduced!

### Approach for classification

1. Identify some attributes that you think might help you predict the answer to the question.
2. Gather a training set of examples where you know the values of the attributes as well as the correct prediction.
3. To make predictions in the future, measure the value of the attributes and then use k-nearest neighbor classification to predict the answer to the question.


### Distance in multiple dimensions

$$D=\sqrt{(x_0-x_1)^2+(y_0-y_1)^2}$$

becomes

$$D=\sqrt{(x_0-x_1)^2+(y_0-y_1)^2+(z_0-z_1)^2}$$

In [None]:
# No change needed for our distance formula

def distance(point1, point2):
    """Returns the straight-line (Euclidean) distance
    between point1 and point2, each given as an array
    of coordinates; works for any number of coordinates"""
    difference = point1 - point2
    squared_total = np.sum(difference ** 2)
    return np.sqrt(squared_total)

# N=3 is boring.

In [None]:
# Load the wine data from a local CSV file.
wine = Table.read_table('wine.csv')

# Helper for turning the Class label into a binary (1 / 0) value

def is_one(x):
    """Returns 1 if x equals 1, and 0 for any other value"""
    return 1 if x == 1 else 0
    
# Replace 'Class' with its binary version (1 stays 1, everything else
# becomes 0). The column is looked up by its label rather than by position,
# so the conversion does not depend on 'Class' being the first column.
wine = wine.with_column('Class', wine.apply(is_one, 'Class'))

In [None]:
# Display the wine table.
wine

The first two wines are both in Class 1. To find the distance between them, we first need a table of just the attributes:

In [None]:
# Attribute columns only: the wine table without its 'Class' label column.
wine_attributes = wine.drop('Class')

In [None]:
# Distance between the attribute rows of the first two wines (rows 0 and 1).
# Each row is converted to an array because distance subtracts its arguments.
distance(np.array(wine_attributes.row(0)), np.array(wine_attributes.row(1)))

The last wine in the table is of Class 0. Its distance from the first wine is:

In [None]:
# Distance between the first wine and the last wine in the table.
# The last row's index is computed from the table size instead of being
# hard-coded, so this still picks the last wine if the row count changes.
distance(
    np.array(wine_attributes.row(0)),
    np.array(wine_attributes.row(wine_attributes.num_rows - 1))
)

In [None]:
# Add a plotting color for each wine by joining on Class with color_table.
# The result gets its own name, so `wine` itself is left without the
# 'Color' column.
wine_with_colors = wine.join('Class', color_table)

In [None]:
# Scatter plot of the Flavanoids and Alcohol columns, colored by 'Color'.
wine_with_colors.scatter('Flavanoids', 'Alcohol', colors='Color')

In [None]:
# Scatter plot of the 'Alcalinity of ash' and 'Ash' columns, colored by 'Color'.
wine_with_colors.scatter('Alcalinity of ash', 'Ash', colors='Color')

In [None]:
# Scatter plot of the 'Magnesium' and 'Total phenols' columns, colored by 'Color'.
wine_with_colors.scatter('Magnesium', 'Total phenols', colors='Color')

# Plan the implementation

For big projects it is helpful to sketch out a plan and make the code match that.

1. Find the closest k neighbors of the point `p`, i.e., the k wines from the training set that are most similar to `p`.
2. Look at the classes of those k neighbors, and take the majority vote to find the most-common class of wine. Use that as our predicted class for `p`.

In [None]:
def closest(training, p, k):
    """Plan step 1: find the k rows of training that are closest to p.
    Placeholder only; the body is intentionally left unwritten here"""
    ...

def majority(topkclasses):
    """Plan step 2: take the majority vote over the classes of the
    k closest neighbors. Placeholder only; the body is intentionally
    left unwritten here"""
    ...

def classify(training, p, k):
    """Returns the predicted class of point p: the majority class
    among the k rows of training that are closest to p"""
    kclosest = closest(training, p, k)
    # majority is meant to receive the classes of the neighbors, so keep
    # the 'Class' column in a local variable and pass that on, rather
    # than storing it as an attribute on the neighbors table.
    kclosest_classes = kclosest.select('Class')
    return majority(kclosest_classes)

## Implementation Step 1

In [None]:
def distance(point1, point2):
    """Returns the Euclidean distance between two points,
    where point1 and point2 are arrays holding
    the coordinates of each point"""
    gaps = point1 - point2
    return np.sqrt(np.sum(gaps ** 2))

def all_distances(training, new_point):
    """Returns an array of distances
    between each point in the training set
    and the new point (which is a row of attributes)"""
    # The 'Class' label is not a coordinate, so leave it out of the distance.
    attributes = training.drop('Class')
    # Convert the new point to an array once, up front, instead of
    # repeating the same conversion for every training row.
    new_point_array = np.array(new_point)
    def distance_from_point(row):
        return distance(new_point_array, np.array(row))
    return attributes.apply(distance_from_point)

def table_with_distances(training, new_point):
    """Returns the training table with one extra column, 'Distance',
    holding each row's distance from new_point"""
    distances = all_distances(training, new_point)
    return training.with_column('Distance', distances)

def closest(training, new_point, k):
    """Returns the k rows of training nearest to new_point,
    as a table that also carries the 'Distance' column
    and is ordered from smallest distance upward"""
    return (
        table_with_distances(training, new_point)
        .sort('Distance')
        .take(np.arange(k))
    )

In [None]:
# Attributes of the first wine (row 0), with the 'Class' label removed,
# to use as the point being classified.
special_wine = wine.drop('Class').row(0)

In [None]:
# The 5 rows of wine nearest to special_wine. Because special_wine was
# taken from wine itself, its own row is among them at distance 0.
closest(wine, special_wine, 5)

## Implementation Steps 2 and 3

In [None]:
def majority(topkclasses):
    """Returns the winning class (1 or 0) among the rows of topkclasses,
    judged by its 'Class' column; a tie goes to class 0"""
    votes_for_one, votes_for_zero = (
        topkclasses.where('Class', are.equal_to(label)).num_rows
        for label in (1, 0)
    )
    return 1 if votes_for_one > votes_for_zero else 0

def classify(training, new_point, k):
    """Returns the predicted class (1 or 0) for new_point:
    the majority class among its k nearest rows in training"""
    neighbor_classes = closest(training, new_point, k).select('Class')
    return majority(neighbor_classes)

In [None]:
# Predict the class of special_wine from its 5 nearest rows in wine.
classify(wine, special_wine, 5)

In [None]:
# Repeat the classification for the wine in row 177 instead of row 0.
# Note that this rebinds special_wine to a different wine.
special_wine = wine.drop('Class').row(177)
classify(wine, special_wine, 5)

# Accuracy of the Classifier

- Split into training set and test set (sometimes called holdout)
- We will do 50/50 today
- Must be careful!  It is easy to contaminate the evaluation: the classifier must never see the test set while it is being built.

In [None]:
# Shuffle all rows (sampling without replacement), then split 50/50.
# The split point comes from the table size instead of hard-coded row
# counts, so the cell keeps working if the number of wines changes; with an
# odd number of rows the test set gets the extra row.
# No random seed is set in this cell, so the split can differ between runs.
shuffled_wine = wine.sample(with_replacement=False)
num_training = shuffled_wine.num_rows // 2
training_set = shuffled_wine.take(np.arange(num_training))
test_set = shuffled_wine.take(np.arange(num_training, shuffled_wine.num_rows))

In [None]:
def count_zero(array):
    """Returns how many entries of array are equal to 0"""
    nonzero_entries = np.count_nonzero(array)
    return len(array) - nonzero_entries

def count_equal(array1, array2):
    """Returns the number of positions at which two numerical arrays
    of the same length hold equal values"""
    # Equal entries are exactly the ones whose difference is 0.
    differences = array1 - array2
    return count_zero(differences)

def evaluate_accuracy(training, test, k):
    """Returns the proportion of rows in test whose 'Class' matches
    the prediction of the k-nearest-neighbor classifier
    that uses training as its training set"""
    actual_classes = test.column('Class')
    # Classify each test row using only its attributes, never its label.
    predicted_classes = test.drop('Class').apply(
        lambda test_row: classify(training, test_row, k)
    )
    num_correct = count_equal(predicted_classes, actual_classes)
    return num_correct / test.num_rows

In [None]:
# Proportion of test_set rows classified correctly when each one is
# predicted from its 5 nearest neighbors in training_set.
evaluate_accuracy(training_set, test_set, 5)

# Throwback Monday

Eleven-year-old talk!

https://medium.com/seismic-data-science/how-classification-works-51d61c675b6e