## Setup: Run Before Starting

In [None]:
from datascience import *
import numpy as np
## Normal Distribution
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from IPython.display import Image
from IPython.core.display import HTML 
from mpl_toolkits.mplot3d import Axes3D


In [None]:
def distance(pt1, pt2):
    """Compute the straight-line (Euclidean) distance between two points.

    pt1 and pt2 are arrays of numbers; they are subtracted element-wise,
    squared, summed, and the square root of that total is returned.
    """
    difference = pt1 - pt2
    squared_total = np.sum(np.square(difference))
    return np.sqrt(squared_total)

def distance_from_individual(attribute_table, i, p):
    """Euclidean distance from the point p to row i of attribute_table.

    Row i is converted to a NumPy array before it is compared with p
    (an array of numbers).
    """
    row_values = np.array(attribute_table.row(i))
    return distance(row_values, p)

def table_with_dists(training, p):
    """Return a copy of training with an added 'Distance' column.

    The 'Class' column is dropped before measuring, so only the attribute
    columns take part in the distance from each row to the array p.
    """
    attributes = training.drop('Class')
    # Collect the distances in a list and convert once: calling np.append
    # inside the loop copies the whole array on every iteration.
    dists = np.array(
        [distance_from_individual(attributes, i, p)
         for i in np.arange(training.num_rows)],
        dtype=float,  # keeps an empty table's result a float array
    )
    return training.with_column('Distance', dists)

def closest(training, p, k):
    """Return the k rows of training that lie nearest to the array p.

    The returned table also carries the 'Distance' column added by
    table_with_dists and is ordered by that column.
    """
    ranked = table_with_dists(training, p).sort('Distance')
    return ranked.take(np.arange(k))

def majority(topkclasses):
    """Return 1 when 1s outnumber 0s in the "Class" column, otherwise 0.

    A tie between the two counts therefore yields 0.
    """
    count_of_ones = topkclasses.where('Class', are.equal_to(1)).num_rows
    count_of_zeros = topkclasses.where('Class', are.equal_to(0)).num_rows
    return 1 if count_of_ones > count_of_zeros else 0

def classify(training, p, k):
    """Predict the class (1 or 0) of an example with attributes p.

    The k rows of training nearest to p are found, and the majority of
    their 'Class' values is returned.
    """
    neighbor_classes = closest(training, p, k).select('Class')
    return majority(neighbor_classes)

In [None]:
# helper methods:

def standard_units(any_numbers):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its deviation from the mean of the
    array, divided by the array's standard deviation (np.std).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def standardize(t):
    """Return a new table holding every column of t in standard units.

    Each output column is labelled with the original label followed by ' (su)'.
    """
    standardized = Table()
    for column_label in t.labels:
        su_values = standard_units(t.column(column_label))
        standardized = standardized.with_column(column_label + ' (su)', su_values)
    return standardized

def correlation(t, label_x, label_y):
    """Return the correlation coefficient between two columns of t.

    It is the mean of the element-wise products of the two columns after
    each has been converted to standard units.
    """
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """Slope of the regression line for predicting y from x (original units).

    Computed as r * SD(y) / SD(x), where r is the correlation between
    columns x and y of t.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Intercept of the regression line for predicting y from x (original units).

    Computed as mean(y) - slope * mean(x).
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def fit(t, x, y):
    """Return the regression-line prediction of y for every value in column x of t."""
    return slope(t, x, y) * t.column(x) + intercept(t, x, y)


In [None]:
# You can ignore this cell (helper functions).

def distance(point1, point2):
    """Euclidean distance between two points given as arrays of numbers."""
    gap = point1 - point2
    return np.sqrt(np.sum(gap * gap))

def all_distances(training, point):
    """Return the distance from point to every row of training.

    The 'Class' column is dropped first, so only the attribute values of
    each row enter the distance calculation.
    """
    def distance_from_point(row):
        return distance(point, np.array(row))
    return training.drop('Class').apply(distance_from_point)


def table_with_distances(training, point):
    """Return training extended with a 'Distance' column: each row's distance to point."""
    distances = all_distances(training, point)
    return training.with_column('Distance', distances)


def closest(training, point, k):
    """Return the k rows of training nearest to point.

    The result includes the 'Distance' column and is ordered by it.
    """
    ranked = table_with_distances(training, point).sort('Distance')
    return ranked.take(np.arange(k))

    
def show_closest(point):
    """Plot the ckd patients and join `point` to its nearest neighbor.

    point = array([x,y]) gives the (Hemoglobin, Glucose) coordinates of a
    new point, which is shown in red. A black segment connects it to the
    closest row of the global `ckd` table.
    """
    # NOTE(review): no cell shown here creates a 'Color' column in ckd --
    # confirm that dropping it is still intended.
    HemoGl = ckd.drop('White Blood Cell Count', 'Color')
    t = closest(HemoGl, point, 1)
    # Read the neighbor's coordinates by column label rather than by
    # position, so the result does not depend on the column order of ckd.
    x_closest = t.column('Hemoglobin').item(0)
    y_closest = t.column('Glucose').item(0)
    ckd.scatter('Hemoglobin', 'Glucose', colors='Class')
    plt.scatter(point.item(0), point.item(1), color='red', s=30)
    plt.plot(make_array(point.item(0), x_closest), make_array(point.item(1), y_closest), color='k', lw=2);

In [None]:
# HIDDEN
def classify_grid(training, test, k):
    """Classify every row of test with the k-nearest-neighbor classifier.

    Returns a float array with one predicted class (0.0 or 1.0) per row
    of test, in row order.
    """
    # Each row is passed as a plain 1-D array of attribute values.
    # Predictions are gathered in a list and converted once, because
    # np.append inside a loop copies the whole array on every iteration.
    predictions = [
        classify(training, np.array(test.row(i)), k)
        for i in range(test.num_rows)
    ]
    # dtype=float matches what appending onto an empty array produced.
    return np.array(predictions, dtype=float)

In [None]:
# ignore

# Every (x, y) pair on a square grid from -2 to roughly 2 in steps of 0.1.
# np.repeat / np.tile give the same values, in the same order, as a nested
# loop with x outermost and y innermost, without re-copying the arrays on
# every np.append call.
grid_values = np.arange(-2, 2.1, 0.1)
x_array = np.repeat(grid_values, len(grid_values))
y_array = np.tile(grid_values, len(grid_values))

test_grid = Table().with_columns(
    'Hemoglobin', x_array,
    'Glucose', y_array
)

In [None]:
#wine = Table.read_table('http://inferentialthinking.com/notebooks/wine.csv')
wine = Table.read_table('wine.csv')

# For converting Class to binary

def is_one(x):
    """Return 1 if x equals 1, and 0 for any other value."""
    return 1 if x == 1 else 0

# Replace 'Class' with a binary label: 1 stays 1, every other value becomes 0.
# NOTE(review): is_one is applied to column index 0 -- presumably the original
# 'Class' column; confirm against wine.csv.
wine = wine.with_column('Class', wine.apply(is_one, 0))

## Classification

In [None]:
# Class 1: patient has Chronic kidney disease
# Class 0: patient does not have Chronic kidney disease

ckd_full = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd_full

In [None]:
# Pull out the three relevant attributes and
# standardize them to get them on the same scale.
# The 'Class' column is copied over unchanged.

ckd = Table().with_columns(
    'Hemoglobin', standard_units(ckd_full.column('Hemoglobin')),
    'Glucose', standard_units(ckd_full.column('Glucose')),
    'White Blood Cell Count', standard_units(ckd_full.column('White Blood Cell Count')),
    'Class', ckd_full.column('Class')
)

ckd 

In [None]:
# Visualize:

ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

#regression: care about shape
#classification: care about separation
#is there a separation?

In [None]:
# Now let's try another pair of attributes
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')


In [None]:
# want to work with numbers only

ckd_attributes = ckd.select('Hemoglobin', 'Glucose') #, 'White Blood Cell Count')
ckd_attributes

In [None]:
ckd_attributes.row(3)

In [None]:
patient3 = np.array(ckd_attributes.row(3))
patient3

In [None]:
alice = make_array(0, 1.1) #, 0.7)
alice

In [None]:
# define a function

def distance(point1, point2):
    """Compute the Euclidean distance between point1 and point2.

    Both arguments are arrays holding the coordinates of a point.
    """
    coordinate_gaps = point1 - point2
    return np.sqrt(np.sum(np.square(coordinate_gaps)))

In [None]:
# higher-order function

def distance_from_alice(row):
    """Return the distance between a table row and the global point `alice`."""
    row_as_array = np.array(row)
    return distance(alice, row_as_array)

In [None]:
# test out our functions

distance_from_alice(patient3)

In [None]:
#we want to use this distance function on ALL rows of the table - how to do this?
ckd_attributes

In [None]:
# distance from alice to the entire table

ckd_with_distances = ckd.with_column ('Distance from Alice', ckd_attributes.apply(distance_from_alice))
ckd_with_distances

In [None]:
ckd_with_distances = ckd_with_distances.sort('Distance from Alice')
ckd_with_distances

In [None]:
alice_5_nearest_neighbors = ckd_with_distances.take(np.arange(5))
alice_5_nearest_neighbors

In [None]:
alice_neighbor_classes = alice_5_nearest_neighbors.select('Class')
alice_neighbor_classes

In [None]:
Image("circle.png", width=400, height=200)

In [None]:
def majority(topkclasses):
    """Return 1 if the 'Class' column holds more 1s than 0s, and 0 otherwise (ties give 0)."""
    num_ones = topkclasses.where('Class', are.equal_to(1)).num_rows
    num_zeros = topkclasses.where('Class', are.equal_to(0)).num_rows
    return 1 if num_ones > num_zeros else 0

In [None]:
majority(alice_neighbor_classes)

In [None]:
Image("image7.png", width=800, height=200)

## Banknotes

In [None]:
#banknote = Table.read_table("http://inferentialthinking.com/notebooks/banknote.csv")
banknotes = Table.read_table("banknote.csv")
banknotes

In [None]:
banknotes.scatter('WaveletSkew', 'WaveletCurt', colors='Class')


In [None]:
banknotes.scatter('WaveletVar', 'Entropy', colors='Class')


In [None]:
# 3-D scatter plot of three banknote attributes (WaveletSkew, WaveletVar,
# WaveletCurt), with each point colored by its 'Class' value.
ax = plt.figure(figsize=(8,8)).add_subplot(111, projection='3d')
ax.scatter(banknotes.column('WaveletSkew'), 
           banknotes.column('WaveletVar'), 
           banknotes.column('WaveletCurt'), 
           c=banknotes.column('Class'));

In [None]:
# Recall:

def distance(point1, point2):
    """Return the Euclidean distance between point1 and point2.

    Each argument is an array of coordinates; the arrays may hold any
    number of coordinates as long as they can be subtracted element-wise.
    """
    offsets = point1 - point2
    sum_of_squares = np.sum(offsets * offsets)
    return np.sqrt(sum_of_squares)

Image("dist3d.png", width=800, height=200)

In [None]:
# go back and re-classify Alice using 3 features (include white blood cell count)

## Wine

In [None]:
wine

#class is type of grape (cultivar)

In [None]:
wine.scatter('Flavanoids', 'Alcohol') #, colors='Class')

#then with colors

In [None]:
wine.scatter('Alcalinity of Ash', 'Ash')#, colors='Class')


In [None]:
# extract attributes
wine_attributes = wine.drop("Class")
wine_attributes.show(3)

In [None]:
distance(np.array(wine_attributes.row(0)), np.array(wine_attributes.row(1))) 
# distance takes in two arrays

In [None]:
#see how the distance compares to something in another class
wine.take(100)

In [None]:
wine.take(0)

In [None]:
distance(np.array(wine_attributes.row(0)), np.array(wine_attributes.row(100)))

In [None]:
def distance(point1, point2):
    """Return the Euclidean distance between point1 and point2,
    where each argument is an array
    consisting of the coordinates of the point."""
    delta = point1 - point2
    return np.sqrt(np.sum(np.square(delta)))

def all_distances(training, new_point):
    """Return the distances between new_point (a row of attributes)
    and each row of the training set.

    The 'Class' column of training is dropped before measuring."""
    target = np.array(new_point)

    def distance_from_point(row):
        return distance(target, np.array(row))

    return training.drop('Class').apply(distance_from_point)

def table_with_distances(training, new_point):
    """Return the training table augmented with a 'Distance' column
    holding each row's distance from new_point."""
    distances = all_distances(training, new_point)
    return training.with_column('Distance', distances)

def closest(training, new_point, k):
    """Return the first k rows of the distance-augmented training table
    after it has been sorted by 'Distance'."""
    ranked = table_with_distances(training, new_point).sort('Distance')
    return ranked.take(np.arange(k))

In [None]:
# wine_attributes already excludes 'Class' (it is built with
# wine.drop("Class")), so its first row can be taken directly.
favorite_wine = wine_attributes.row(0)
favorite_wine

In [None]:
closest(wine, favorite_wine, 5)

In [None]:
def majority(topkclasses):
    """Return 1 if 1s outnumber 0s in the 'Class' column of topkclasses, else 0."""
    tally_ones = topkclasses.where('Class', are.equal_to(1)).num_rows
    tally_zeros = topkclasses.where('Class', are.equal_to(0)).num_rows
    return 1 if tally_ones > tally_zeros else 0

def classify(training, new_point, k):
    """Classify new_point as 1 or 0 by majority vote among its k nearest rows of training."""
    neighbor_classes = closest(training, new_point, k).select('Class')
    return majority(neighbor_classes)

In [None]:
classify(wine, favorite_wine, 3)

This is cheating. We used this wine to help us make a prediction about this wine.  
Consider a 1-NN classifier - how would our favorite wine be classified?  
How would any other wine in this table be classified?  
Does this mean that we have the perfect model to classify wines?  

back to slides

## Accuracy of Classifier

In [None]:
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

In [None]:
# creating training and testing data sets

# Shuffle the rows (sampling without replacement), then use the first 79
# shuffled rows for training and the next 79 for testing.
# NOTE(review): 79 and 158 are hard-coded -- presumably ckd has 158 rows; confirm.
# No random seed is set in the cells shown, so the split differs on each run.
shuffled_ckd = ckd.sample(with_replacement=False)
training = shuffled_ckd.take(np.arange(79))
testing = shuffled_ckd.take(np.arange(79, 158))

In [None]:
training.scatter('White Blood Cell Count', 'Glucose', colors='Class')
plt.xlim(-2, 6)
plt.ylim(-2, 6);

# training set looks like the population, test set probably does too

In [None]:
# ignore this code, just look at output

# Every (x, y) pair on a square grid from -2 to 6 in steps of 0.25.
# np.repeat / np.tile give the same values, in the same order, as a nested
# loop with x outermost and y innermost, without re-copying the arrays on
# every np.append call.
grid_values = np.arange(-2, 6.1, 0.25)
x_array = np.repeat(grid_values, len(grid_values))
y_array = np.tile(grid_values, len(grid_values))

test_grid = Table().with_columns(
    'Glucose', x_array,
    'White Blood Cell Count', y_array
)

# Classify every grid point with a 1-nearest-neighbor classifier, using the
# training table with its 'Hemoglobin' column dropped.
c = classify_grid(training.drop('Hemoglobin'), test_grid, 1)

# Attach the predictions to the grid and plot it, colored by predicted class.
test_grid = test_grid.with_column('Class', c)#.join('Class', color_table)
test_grid.scatter('White Blood Cell Count', 'Glucose', colors='Class', alpha=0.4, s=30)

plt.xlim(-2, 6)
plt.ylim(-2, 6);

In [None]:
# ignore this code, just look at output

# Redraw the classified grid (the 'Class' column is set from c again).
test_grid = test_grid.with_column('Class', c)#.join('Class', color_table)
test_grid.scatter('White Blood Cell Count', 'Glucose', colors='Class', alpha=0.4, s=30)

# Overlay the test-set patients, colored by their 'Class' value and outlined in black.
plt.scatter(testing.column('White Blood Cell Count'), testing.column('Glucose'), c=testing.column('Class'), edgecolor='k')

plt.xlim(-2, 6)
plt.ylim(-2, 6);


In [None]:
#functions to help us calculate how many were classified correctly

def count_zero(array):
    """Return how many entries of array are equal to 0."""
    nonzero_entries = np.count_nonzero(array)
    total_entries = len(array)
    return total_entries - nonzero_entries

def count_equal(array1, array2):
    """Count the positions at which two equal-length numerical arrays agree.

    The arrays are subtracted element-wise; every 0 in the difference
    marks a position where the two arrays hold the same value.
    """
    differences = array1 - array2
    return count_zero(differences)

In [None]:
# Discussion Question

actual =    make_array(1, 1, 0, 0, 1)
predicted = make_array(1, 0, 1, 0, 1)

count_equal(actual, predicted)

# What is the output? Trace through functions above to find the answer. 
# A: (1, 0, 0, 1, 1)
# B: (0, 1, -1, 0, 0)
# C: 3
# D: 2
# E: (4, 4, 5, 5, 4)


back to slides for discussion questions

## Evaluating Accuracy

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of rows in test that are classified correctly.

    Every row of test (without its 'Class' column) is classified with the
    k-nearest-neighbor classifier built on training; the predictions are
    then compared with the actual 'Class' values of test.
    """
    def classify_testrow(row):
        return classify(training, row, k)

    predicted = test.drop('Class').apply(classify_testrow)
    actual = test.column('Class')
    return count_equal(predicted, actual) / test.num_rows

In [None]:
wine.num_rows

In [None]:
# Shuffle the wines, then split: first 89 shuffled rows for training, next 89 for testing.
# NOTE(review): 89 and 178 are hard-coded -- presumably wine has 178 rows; confirm with wine.num_rows.
shuffled_wine = wine.sample(with_replacement=False) 
training_wine = shuffled_wine.take(np.arange(89))
test_wine = shuffled_wine.take(np.arange(89, 178))

# Proportion of test wines classified correctly using the 11 nearest neighbors.
evaluate_accuracy(training_wine, test_wine, 11)

In [None]:
# Keep only the class label and two attributes.
# NOTE: this rebinds `ckd`, so the 'White Blood Cell Count' column is no longer
# available afterwards; cells that use that column must run before this one.
ckd = ckd.select("Class", "Hemoglobin", "Glucose")
ckd.num_rows

In [None]:
# Shuffle the patients, then split: first 79 shuffled rows for training, next 79 for testing.
# NOTE(review): 79 and 158 are hard-coded -- presumably ckd has 158 rows; confirm with ckd.num_rows.
shuffled_ckd = ckd.sample(with_replacement=False) 
training_ckd = shuffled_ckd.take(np.arange(79))
test_ckd = shuffled_ckd.take(np.arange(79, 158))

# Proportion of test patients classified correctly using the 5 nearest neighbors.
evaluate_accuracy(training_ckd, test_ckd, 5)

If I run the previous cell again, will I get the same result?

In [None]:
# http://inferentialthinking.com/notebooks/breast-cancer.csv
patients = Table.read_table('breast-cancer.csv').drop('ID')
patients

In [None]:
# Training and test sets: draw 683 rows without replacement, then use the
# first 342 for training and the remaining 341 for testing.
# NOTE(review): 683 is hard-coded -- presumably the number of rows in patients; confirm.
shuffled_patients = patients.sample(683, with_replacement=False) 
training_patients = shuffled_patients.take(np.arange(342))
test_patients = shuffled_patients.take(np.arange(342, 683))

# Proportion of test patients classified correctly using the 5 nearest neighbors.
evaluate_accuracy(training_patients, test_patients, 5)