## Setup: Run Before Starting

In [None]:
from datascience import *
import numpy as np
# Plotting setup
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from IPython.display import Image
from IPython.core.display import HTML 
from mpl_toolkits.mplot3d import Axes3D


In [None]:
def distance(pt1, pt2):
    """The Euclidean distance between two points.

    Each argument is a sequence of numbers (NumPy array, list, tuple or a
    table row). Both must have the same length (or be broadcastable).
    """
    # Coerce to arrays so plain sequences subtract element-wise instead of
    # raising a TypeError; arrays pass through unchanged.
    difference = np.asarray(pt1) - np.asarray(pt2)
    return np.sqrt(np.sum(difference ** 2))

def distance_from_individual(attribute_table, i, p):
    """Euclidean distance from array p to the values in row i of attribute_table."""
    row_values = np.array(attribute_table.row(i))
    return distance(row_values, p)

def table_with_dists(training, p):
    """A copy of the training table with the Euclidean distance from each row to array p.

    The 'Class' column is left out of the distance computation. The returned
    table has all of training's columns plus a new 'Distance' column.
    """
    attributes = training.drop('Class')
    # Collect the distances in a list and convert once: np.append copies the
    # whole array on every call, so appending inside the loop was quadratic.
    dists = np.array([
        distance_from_individual(attributes, i, p)
        for i in range(training.num_rows)
    ])
    return training.with_column('Distance', dists)

def closest(training, p, k):
    """The k rows of the training table nearest to array p, ordered by distance."""
    ranked = table_with_dists(training, p).sort('Distance')
    return ranked.take(np.arange(k))

def majority(topkclasses):
    """Majority vote over the "Class" column: 1 if 1s outnumber 0s, otherwise 0 (a tie gives 0)."""
    def votes_for(label):
        # Number of rows whose 'Class' equals the given label
        return topkclasses.where('Class', are.equal_to(label)).num_rows

    return 1 if votes_for(1) > votes_for(0) else 0

def classify(training, p, k):
    """Predict the class (1 or 0) of an example with attributes p by a majority vote
    among its k nearest neighbors in the training table."""
    neighbor_classes = closest(training, p, k).select('Class')
    return majority(neighbor_classes)

In [None]:
# helper methods:

def standard_units(any_numbers):
    """Convert an array of numbers to standard units:
    subtract the mean, then divide by the standard deviation."""
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def standardize(t):
    """Return a new table with every column of t converted to standard units.

    Each output column is labeled with the original label followed by ' (su)'.
    """
    standardized = Table()
    for column_label in t.labels:
        su_values = standard_units(t.column(column_label))
        standardized = standardized.with_column(column_label + ' (su)', su_values)
    return standardized

def correlation(t, label_x, label_y):
    """The correlation coefficient of columns label_x and label_y of table t:
    the mean of the products of the two columns in standard units."""
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """The slope of the regression line (original units):
    r times the SD of column y, divided by the SD of column x."""
    r = correlation(t, x, y)
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return r * sd_y / sd_x

def intercept(t, x, y):
    """The intercept of the regression line (original units):
    mean of column y minus slope times mean of column x."""
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def fit(t, x, y):
    """Return the regression-line prediction of y for each value in column x of t."""
    return slope(t, x, y) * t.column(x) + intercept(t, x, y)


In [None]:
# You can ignore this cell: it defines helper functions used for the plots below

def distance(point1, point2):
    """Euclidean distance between two arrays of numbers."""
    offset = point1 - point2
    return np.sqrt(np.sum(offset ** 2))

def all_distances(training, point):
    """The distances from point (an array of numbers) to every row of the
    training table, computed on all columns except 'Class'."""
    attributes = training.drop('Class')
    # apply with no column labels hands each whole row to the function
    return attributes.apply(lambda row: distance(point, np.array(row)))


def table_with_distances(training, point):
    """A copy of the training table with an added 'Distance' column holding
    the distance from each row to the array point."""
    distances = all_distances(training, point)
    return training.with_column('Distance', distances)


def closest(training, point, k):
    """A table of the k rows of training closest to the array point, nearest first."""
    by_distance = table_with_distances(training, point).sort('Distance')
    first_k = np.arange(k)
    return by_distance.take(first_k)

    
def show_closest(point):
    """Plot a new point in red on the Hemoglobin/Glucose scatter plot and
    draw a black line from it to its nearest neighbor.

    point = array([x, y]) gives the coordinates of the new point.
    Reads the global table `ckd`.
    """
    # Leave only the plotted attributes (and 'Class') for the neighbor search
    two_attribute_ckd = ckd.drop('White Blood Cell Count', 'Color')
    nearest = closest(two_attribute_ckd, point, 1).row(0)
    x_new, y_new = point.item(0), point.item(1)
    x_closest, y_closest = nearest.item(0), nearest.item(1)

    ckd.scatter('Hemoglobin', 'Glucose', colors='Class')
    plt.scatter(x_new, y_new, color='red', s=30)
    plt.plot(make_array(x_new, x_closest), make_array(y_new, y_closest), color='k', lw=2)

In [None]:
# HIDDEN
def classify_grid(training, test, k):
    """Classify every row of the test table with k-nearest neighbors.

    training: table of attributes plus a 'Class' column.
    test: table whose columns are the attributes only; each whole row is
        used as the point to classify.
    k: number of neighbors that vote.

    Returns a float array with one predicted class per row of test, in row order.
    """
    # np.array(row) yields a flat 1-D point. make_array(row) wrapped the row
    # as a single element, giving a nested point that only worked through
    # broadcasting. Building the result once also avoids re-copying the
    # array on every np.append.
    predictions = [
        classify(training, np.array(test.row(i)), k)
        for i in range(test.num_rows)
    ]
    return np.array(predictions, dtype=float)

In [None]:
# ignore

# Coordinates of every (x, y) combination on a 0.1-spaced grid starting at -2
# (stop value 2.1 is exclusive). x varies slowest and y fastest, matching a
# nested "for x: for y:" loop, but without growing arrays element by element.
grid_values = np.arange(-2, 2.1, 0.1)
x_array = np.repeat(grid_values, len(grid_values))
y_array = np.tile(grid_values, len(grid_values))
        
# Put the grid coordinates in a table, one row per (x, y) grid point
test_grid = Table().with_columns(
    'Hemoglobin', x_array,
    'Glucose', y_array
)

## Classification

In [None]:
# Class 1: patient has Chronic kidney disease
# Class 0: patient does not have Chronic kidney disease

ckd_full = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')
ckd_full

A new patient comes along and we give them a blood test. Based on this data, how can we tell whether they have CKD or not?

In [None]:
# Simplification: use only Hemoglobin, Glucose and White Blood Cell Count
# Scale to use?

ckd_full.select('Hemoglobin', 'Glucose', 'White Blood Cell Count', 'Class')

In [None]:
ckd = Table().with_columns(
    'Hemoglobin', standard_units(ckd_full.column('Hemoglobin')),
    'Glucose', standard_units(ckd_full.column('Glucose')),
    'White Blood Cell Count', standard_units(ckd_full.column('White Blood Cell Count')),
    'Class', ckd_full.column('Class')
)

ckd 

In [None]:
# Visualize:

ckd.scatter('Hemoglobin', 'Glucose', colors='Class')

#regression: care about shape
#classification: care about separation
#is there a separation?

In [None]:
# Discussion question

Image("img_CKD.png", width=800, height=300)

In [None]:
alice =  make_array(-1, 0.9) #try with other values
show_closest(alice)

In [None]:
#ignore for now, just start running it because it takes some time
c = classify_grid(ckd.drop('White Blood Cell Count'), test_grid, 1)

In [None]:
test_grid.scatter('Hemoglobin', 'Glucose', color='red', alpha=0.4, s=30)

plt.scatter(ckd.column('Hemoglobin'), ckd.column('Glucose'), c=ckd.column('Class'), edgecolor='k')

plt.xlim(-2, 2)
plt.ylim(-2, 2);

In [None]:
# By eye, which red points will you be able to classify easily and where will you have trouble?

In [None]:
# Attach the predicted class of each grid point and color the grid by it.
# NOTE: this rebinds test_grid to a table that includes 'Class'. classify_grid
# uses every column of each test row as the point, so re-running the
# classify_grid cell after this one would pass the class label in as an
# extra coordinate. Re-create test_grid first if you need to re-run it.
test_grid = test_grid.with_column('Class', c)
test_grid.scatter('Hemoglobin', 'Glucose', colors='Class', alpha=0.4, s=30)

# Overlay the training points, outlined in black, on top of the grid
plt.scatter(ckd.column('Hemoglobin'), ckd.column('Glucose'), c=ckd.column('Class'), edgecolor='k')

# Restrict the view to the region covered by the grid
plt.xlim(-2, 2)
plt.ylim(-2, 2);

Decision Regions

In [None]:
# Now let's try another pair of attributes
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')


How should we classify a point in the blue cloud that happens to be right next to a gold point?  
A nearest-neighbor classifier might predict gold, ignoring the fact that the point sits in a blue cloud.

back to slides

## Implementing the Classifier

In [None]:
ckd_full.row(0)

In [None]:
ckd_full.row(0).item(1)

In [None]:
# want to work with numbers

ckd_attributes = ckd.select('Hemoglobin', 'Glucose')
ckd_attributes

In [None]:
ckd_attributes.row(3)

In [None]:
patient3 = np.array(ckd_attributes.row(3))
patient3

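# Don't use make_array(row) here: it would wrap the whole row as a single element
# instead of producing an array of the row's values. Use np.array(row).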

In [None]:
alice = make_array(0, 1.1)
alice

How do we find the distance between Alice and this other patient?

back to slides

## Distance

In [None]:
# Use a name other than `distance`: the helper cells at the top of the
# notebook define a function called `distance`, and rebinding that name to a
# number would break show_closest / classify if they were re-run before the
# function is defined again in the next cell.
distance_patient3_alice = np.sqrt(np.sum((patient3 - alice)**2))
distance_patient3_alice

In [None]:
# define a function

def distance(point1, point2):
    """Return the Euclidean distance between point1 and point2.

    Each argument is an array holding the coordinates of one point.
    """
    sum_of_squares = np.sum((point1 - point2) ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# A one-argument function (it reads the global `alice`) that we can pass to Table.apply

def distance_from_alice(row):
    """Distance from the global point `alice` to the values in row."""
    row_values = np.array(row)
    return distance(alice, row_values)

In [None]:
# test out our functions

distance_from_alice(patient3)

In [None]:
#we want to use this distance function on ALL rows of the table - how to do this?
ckd_attributes.show(5)

In [None]:
# the Table method apply lets us apply a function to each entry in a column of a table, but that is not exactly what we want
# look at documentation to see how we can use apply
ckd_attributes.apply?

In [None]:
# distance from alice to the entire table

ckd_with_distances = ckd.with_column ('Distance from Alice', ckd_attributes.apply(distance_from_alice))
ckd_with_distances

Reminder: what is the goal?

In [None]:
ckd_with_distances = ckd_with_distances.sort('Distance from Alice')
ckd_with_distances

In [None]:
alice_5_nearest_neighbors = ckd_with_distances.take(np.arange(5))
alice_5_nearest_neighbors

In [None]:
alice_neighbor_classes = alice_5_nearest_neighbors.select('Class')
alice_neighbor_classes

In [None]:
def majority(topkclasses):
    """Return 1 if the 'Class' column has more 1s than 0s, and 0 otherwise (ties included)."""
    count_of_ones = topkclasses.where('Class', are.equal_to(1)).num_rows
    count_of_zeros = topkclasses.where('Class', are.equal_to(0)).num_rows
    if count_of_ones > count_of_zeros:
        return 1
    return 0

In [None]:
majority(alice_neighbor_classes)

In [None]:
# visualization of Alice's 5 closest neighbors

Image("circle.png", width=400, height=150)

back to slides