## Setup: Run Before Starting

In [None]:
from datascience import *
import numpy as np
## Normal Distribution
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from IPython.display import Image
from IPython.core.display import HTML 
from mpl_toolkits.mplot3d import Axes3D


In [None]:
def distance(pt1, pt2):
    """Return the straight-line (Euclidean) distance between two numeric arrays.

    The arrays are subtracted element-wise, the differences are squared and
    summed, and the square root of that total is returned.
    """
    squared_gaps = (pt1 - pt2) ** 2
    total = np.sum(squared_gaps)
    return np.sqrt(total)

def distance_from_individual(attribute_table, i, p):
    """Return the Euclidean distance from array p to row i of attribute_table.

    Row i is converted to a NumPy array before being compared with p, so the
    table should contain only the numeric attribute columns.
    """
    row_values = np.array(attribute_table.row(i))
    return distance(row_values, p)

def table_with_dists(training, p):
    """Return the training table extended with a 'Distance' column.

    The 'Class' column is left out when measuring, so each distance is the
    Euclidean distance from array p to that row's attribute values only.
    """
    features = training.drop('Class')
    row_distances = [
        distance_from_individual(features, idx, p)
        for idx in np.arange(training.num_rows)
    ]
    # dtype=float keeps an empty result as a float array, like make_array().
    return training.with_column('Distance', np.array(row_distances, dtype=float))

def closest(training, p, k):
    """Return a table of the k training rows nearest to array p.

    The rows carry the added 'Distance' column and are ordered from the
    smallest distance to the largest.
    """
    nearest_first = table_with_dists(training, p).sort('Distance')
    return nearest_first.take(np.arange(k))

def majority(topkclasses):
    """Return 1 if the 'Class' column holds more 1s than 0s, otherwise 0.

    A tie between the two counts also yields 0.
    """
    class_one_rows = topkclasses.where('Class', are.equal_to(1))
    class_zero_rows = topkclasses.where('Class', are.equal_to(0))
    return 1 if class_one_rows.num_rows > class_zero_rows.num_rows else 0

def classify(training, p, k):
    """Predict the class (1 or 0) of attributes p by k-nearest-neighbor vote.

    The k rows of the training table closest to p are found and the majority
    value of their 'Class' column is returned.
    """
    neighbor_labels = closest(training, p, k).select('Class')
    return majority(neighbor_labels)

In [None]:
# helper methods:

def standard_units(any_numbers):
    """Convert an array of numbers to standard units.

    Each value has the array's mean subtracted and is then divided by the
    array's standard deviation (np.std).
    """
    centered = any_numbers - np.mean(any_numbers)
    return centered / np.std(any_numbers)

def standardize(t):
    """Return a new table holding every column of t in standard units.

    Each output column is named after its source column with ' (su)' appended.
    """
    standardized = Table()
    for name in t.labels:
        su_values = standard_units(t.column(name))
        standardized = standardized.with_column(name + ' (su)', su_values)
    return standardized

def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of table t.

    r is the mean of the products of the two columns after each has been
    converted to standard units.
    """
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """Return the slope of the regression line of y on x, in original units.

    The slope is r times the standard deviation of y, divided by the standard
    deviation of x.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Return the intercept of the regression line of y on x, in original units.

    The intercept is the mean of y minus the slope times the mean of x.
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def fit(t, x, y):
    """Return the regression-line prediction of y for every x-value in t.

    The result is an array with one fitted value per row of column x.
    """
    gradient, offset = slope(t, x, y), intercept(t, x, y)
    return gradient * t.column(x) + offset


In [None]:
# you can ignore that 

def distance(point1, point2):
    """Return the Euclidean distance between two arrays of numbers."""
    offsets = point1 - point2
    return np.sqrt(np.sum(offsets ** 2))

def all_distances(training, point):
    """Return the distance from point to every row of the training table.

    The 'Class' column is dropped first, so only the attribute columns are
    compared with point. One distance is produced per row, in row order.
    """
    features = training.drop('Class')
    return features.apply(lambda row: distance(point, np.array(row)))


def table_with_distances(training, point):
    """Return the training table with an added 'Distance' column.

    Each entry of 'Distance' is the distance from that row's attributes to
    the array point.
    """
    distances = all_distances(training, point)
    return training.with_column('Distance', distances)


def closest(training, point, k):
    """Return a table of the k training rows nearest to the array point.

    Rows include the added 'Distance' column and are ordered from nearest
    to farthest.
    """
    nearest_first = table_with_distances(training, point).sort('Distance')
    first_k = np.arange(k)
    return nearest_first.take(first_k)

    
def show_closest(point):
    """Plot the ckd data and join point to its nearest neighbor.

    point = array([x, y]) gives the (Hemoglobin, Glucose) coordinates of a new
    point, which is drawn in red. A black segment connects it to the closest
    row of the global ckd table.
    """
    new_x, new_y = point.item(0), point.item(1)
    # Remove 'White Blood Cell Count' (and 'Color') so they do not take part
    # in the nearest-neighbor search.
    plotted_attributes = ckd.drop('White Blood Cell Count', 'Color')
    nearest = closest(plotted_attributes, point, 1).row(0)
    near_x, near_y = nearest.item(0), nearest.item(1)
    ckd.scatter('Hemoglobin', 'Glucose', colors='Class')
    plt.scatter(new_x, new_y, color='red', s=30)
    plt.plot(make_array(new_x, near_x), make_array(new_y, near_y), color='k', lw=2)

In [None]:
# HIDDEN
def classify_grid(training, test, k):
    """Classify every row of test with the k-nearest-neighbor classifier.

    Returns a float array holding one predicted class per row of test, in
    row order.
    """
    predictions = [
        # Run the classifier on the idx-th row of the test table
        classify(training, make_array(test.row(idx)), k)
        for idx in range(test.num_rows)
    ]
    return np.array(predictions, dtype=float)

In [None]:
# ignore

# Build every (x, y) pair on a regular grid from -2 to 2 in steps of 0.1.
# x changes slowest (each value repeated for a full sweep of y values).
grid_ticks = np.arange(-2, 2.1, 0.1)
x_array = np.repeat(grid_ticks, len(grid_ticks))
y_array = np.tile(grid_ticks, len(grid_ticks))

test_grid = Table().with_columns(
    'Hemoglobin', x_array,
    'Glucose', y_array
)

In [None]:
# Load the wine data from a local CSV file; the commented-out line reads the
# same file name from the textbook website instead.
#wine = Table.read_table('http://inferentialthinking.com/notebooks/wine.csv')
wine = Table.read_table('wine.csv')

# For converting Class to binary

def is_one(x):
    """Return 1 when x equals 1, and 0 for any other value."""
    return 1 if x == 1 else 0

# Apply is_one to column 0 and store the result as 'Class' (replacing that
# column if it already exists), so the label is binary: 1 vs. everything else.
wine = wine.with_column('Class', wine.apply(is_one, 0))

In [None]:
# Read the raw data, shortening the glucose column's label to 'Glucose'.
ckd_full = Table.read_table('ckd.csv').relabeled('Blood Glucose Random', 'Glucose')

# Keep three attributes, each converted to standard units, plus the class label.
ckd = Table()
for attribute in ['Hemoglobin', 'Glucose', 'White Blood Cell Count']:
    ckd = ckd.with_column(attribute, standard_units(ckd_full.column(attribute)))
ckd = ckd.with_column('Class', ckd_full.column('Class'))

## Accuracy of Classifier

In [None]:
# Glucose (y) against White Blood Cell Count (x), both in standard units,
# with points colored by 'Class'.
ckd.scatter('White Blood Cell Count', 'Glucose', colors='Class')

In [None]:
# Half the number of rows in ckd, truncated to an integer.
int(ckd.num_rows/2)

In [None]:
# Create the training and testing sets. Table.split(k) puts k randomly chosen
# rows in the first table and the remaining rows in the second.
# The size is computed as half of the rows (rather than a hard-coded 79) so
# the split stays a 50/50 split if the data changes.
training, testing = ckd.split(int(ckd.num_rows / 2))

In [None]:
# Same scatter plot as for the full data, but for the training rows only.
training.scatter('White Blood Cell Count', 'Glucose', colors='Class')
# Fix both axes to [-2, 6] so this plot can be compared with the grid plots below.
plt.xlim(-2, 6)
plt.ylim(-2, 6);

# training set looks like the population, test set probably does too

In [None]:
# ignore this code, just look at output

# Every (x, y) pair on a regular grid from -2 to 6 in steps of 0.25;
# x changes slowest, y sweeps the full range for each x.
axis_values = np.arange(-2, 6.1, 0.25)
x_array = np.repeat(axis_values, len(axis_values))
y_array = np.tile(axis_values, len(axis_values))

# Column order matches training.drop('Hemoglobin'): Glucose, then White Blood Cell Count.
test_grid = Table().with_columns(
    'Glucose', x_array,
    'White Blood Cell Count', y_array
)

# Classify every grid point by its single nearest neighbor in the training set.
c = classify_grid(training.drop('Hemoglobin'), test_grid, 1)

# Color the grid by predicted class to show the classifier's decision regions.
test_grid = test_grid.with_column('Class', c)
test_grid.scatter('White Blood Cell Count', 'Glucose', colors='Class', alpha=0.4, s=30)

plt.xlim(-2, 6)
plt.ylim(-2, 6);

In [None]:
# ignore this code, just look at output

# Redraw the grid colored by predicted class (the decision regions).
test_grid = test_grid.with_column('Class', c)#.join('Class', color_table)
test_grid.scatter('White Blood Cell Count', 'Glucose', colors='Class', alpha=0.4, s=30)

# Overlay the testing rows, colored by their actual class and outlined in black.
plt.scatter(testing.column('White Blood Cell Count'), testing.column('Glucose'), c=testing.column('Class'), edgecolor='k')

plt.xlim(-2, 6)
plt.ylim(-2, 6);


In [None]:
#functions to help us calculate how many were classified correctly

def count_zero(array):
    """Return how many entries of array are equal to zero."""
    nonzero_entries = np.count_nonzero(array)
    return len(array) - nonzero_entries

def count_equal(array1, array2):
    """Return the number of positions at which two numerical arrays agree.

    The arrays must have equal length. They are subtracted element-wise, and
    a position counts as equal when its difference is zero.
    """
    differences = array1 - array2
    return count_zero(differences)

In [None]:
# Discussion Question

# Two label arrays of equal length, compared position by position.
actual =    make_array(1, 1, 0, 0, 1)
predicted = make_array(1, 0, 1, 0, 1)

count_equal(actual, predicted)

# What is the output? Trace through functions above to find the answer. 
# A: (1, 0, 0, 1, 1)
# B: (0, 1, -1, 0, 0)
# C: 3
# D: 2
# E: (4, 4, 5, 5, 4)


Back to the slides for the discussion questions.

## Evaluating Accuracy

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of test rows the k-NN classifier labels correctly.

    Every row of test (without its 'Class' column) is classified against the
    training table using k nearest neighbors. The predictions are compared
    with the actual 'Class' values, and the number of matches is divided by
    the number of test rows.
    """
    actual_classes = test.column('Class')
    predicted_classes = test.drop('Class').apply(
        lambda row: classify(training, row, k)
    )
    return count_equal(predicted_classes, actual_classes) / test.num_rows

In [None]:
# CKD accuracy: proportion of the testing rows classified correctly by a
# 5-nearest-neighbor classifier built from the training rows.

evaluate_accuracy(training, testing, 5)

In [None]:
# Split the wine table into two halves (the first has int(num_rows/2) rows),
# then measure the accuracy of an 11-nearest-neighbor classifier.
training_wine, test_wine = wine.split(int(wine.num_rows/2))

evaluate_accuracy(training_wine, test_wine, 11)

If I run the previous cell again, will I get the same result?

In [None]:
# Load the breast-cancer data from a local CSV and discard the 'ID' column
# so it is not used as an attribute. Online copy of the file:
# http://inferentialthinking.com/notebooks/breast-cancer.csv
patients = Table.read_table('breast-cancer.csv').drop('ID')
patients

In [None]:
# Split the patients table into two halves, then measure the accuracy of a
# 5-nearest-neighbor classifier on the held-out half.
training_patients, test_patients = patients.split(int(patients.num_rows/2))

evaluate_accuracy(training_patients, test_patients, 5)

# Housing

In [None]:
# Load the house sales, keep rows whose 'Bldg Type' is '1Fam' and whose
# 'Sale Condition' is 'Normal', and select the price plus nine numeric
# attributes. 'SalePrice' is deliberately first, so it is column 0 below.
all_sales = Table.read_table('house.csv')
sales = all_sales.where('Bldg Type', '1Fam').where('Sale Condition', 'Normal').select(
    'SalePrice', '1st Flr SF', '2nd Flr SF', 
    'Total Bsmt SF', 'Garage Area', 
    'Wood Deck SF', 'Open Porch SF', 'Lot Area', 
    'Year Built', 'Yr Sold')
# Display the selected sales ordered by increasing sale price.
sales.sort('SalePrice')

In [None]:
# Split the sales into a training table of 1001 rows and a test table holding
# the remaining rows, then print both sizes as a check.
train, test = sales.split(1001)
print(train.num_rows)
print(test.num_rows)

Training phase: finding the best slopes to get the equation of the regression line  

Testing phase: using the regression line to make predictions on the test set

In [None]:
# Histogram of column 0 ('SalePrice') of the training set, using 32 bins.
train.hist(0, bins=32, unit='$')

In [None]:
# Column 0 ('SalePrice') on the y-axis against column 1 ('1st Flr SF') on the
# x-axis, with a fitted line drawn through the points.
train.scatter(1, 0, fit_line=True)

In [None]:
# Correlation between column 0 ('SalePrice') and column 1 ('1st Flr SF')
# in the training set.
correlation(train, 0, 1)

Correlation of sales price with each other attribute:

In [None]:
# Correlation of every training-set column with column 0 ('SalePrice').
# The labels are taken from train itself, the same table that supplies the
# 'Column' values below, so each r is guaranteed to sit next to its own label.
rs = [correlation(train, label, 0) for label in train.labels]
Table().with_columns('Column', train.labels, 'r', rs)

In [None]:
# Sum of columns 1 and 2 ('1st Flr SF' + '2nd Flr SF') for every sale, and the
# correlation of that combined attribute with the sale price.
# NOTE(review): this uses the full sales table rather than train — confirm that is intended.
both_floors = sales.column(1) + sales.column(2)
correlation(sales.with_column('Both Floors', both_floors), 'SalePrice', 'Both Floors')

## Multiple Regression

In [None]:
# Attribute values (everything except 'SalePrice') of the first test row.
example_row = test.drop('SalePrice').row(0)

# One random slope per attribute, drawn from a normal distribution with
# mean 10 and standard deviation 2; used only as a starting guess.
random_slopes = np.random.normal(10, 2, len(example_row))
random_slopes

In [None]:
def predict(slopes, row):
    """Return the predicted value for one row of attribute values.

    Each attribute in row is multiplied by the slope at the same position and
    the products are added together; there is no intercept term.
    """
    attribute_values = np.array(row)
    weighted = slopes * attribute_values
    return sum(weighted)

# Prediction for the example row using the random starting slopes.
predict(random_slopes, example_row)

In [None]:
# Compare to actual price: item 0 of the first test row is its 'SalePrice'.

test.row(0).item(0)

In [None]:
# Separate attributes from labels: column 0 ('SalePrice') is the value being
# predicted, and every other column is an attribute.

train_prices = train.column(0)
train_attributes = train.drop(0)

In [None]:
def rmse(slopes, attributes, prices):
    """Return the root mean squared error of the predictions made with slopes.

    slopes: array with one slope per attribute column.
    attributes: table of attribute values, one row per sale.
    prices: array of actual prices, aligned with the rows of attributes.
    """
    squared_errors = [
        (predict(slopes, attributes.row(idx)) - prices.item(idx)) ** 2
        for idx in np.arange(len(prices))
    ]
    return np.mean(squared_errors) ** 0.5

# Single-argument wrapper around rmse, so it can be passed to an optimizer
# that varies only the slopes.
def rmse_train(slopes):
    """Return the RMSE of slopes measured on the training set."""
    return rmse(slopes, attributes=train_attributes, prices=train_prices)

# Training-set RMSE of the random starting slopes, as a baseline to improve on.
rmse_train(random_slopes)

In [None]:
# Search for the slopes that minimize the training-set RMSE, starting from the
# random slopes. array=True presumably passes the slopes to rmse_train as a
# single array argument — see the datascience `minimize` documentation.
best_slopes = minimize(rmse_train, start=random_slopes, smooth=True, array=True)
best_slopes

In [None]:
# Display a one-row table that places each fitted slope under its attribute label.
Table(train_attributes.labels).with_row(list(best_slopes)).show()

In [None]:
# Training-set RMSE of the fitted slopes.
rmse_train(best_slopes)

In [None]:
# Separate the test set the same way as the training set: column 0 is the
# price, the remaining columns are the attributes.
test_prices = test.column(0)
test_attributes = test.drop(0)

def rmse_test(slopes):
    """Return the RMSE of slopes measured on the test set."""
    return rmse(slopes, test_attributes, test_prices)

# Evaluate the slopes fitted on the training set against the held-out test set.
rmse_linear = rmse_test(best_slopes)
print('Test set RMSE for multiple linear regression:', rmse_linear)

In [None]:
def fit(row):
    """Return the fitted sale price for one row of attribute values.

    Each attribute is multiplied by the matching entry of the global
    best_slopes and the products are summed; there is no intercept term.
    Note: this redefines the earlier regression helper fit(t, x, y).
    """
    return sum(best_slopes * np.array(row))

# Actual price (column 0) against fitted price for every row of the test set.
test.with_column('Fitted', test.drop(0).apply(fit)).scatter('Fitted', 0)
# Reference line y = x: points on it are predicted exactly.
# matplotlib.pyplot is imported as plt in this notebook; the name `plots`
# is not defined, so the original call raised a NameError.
plt.plot([0, 5e5], [0, 5e5]);

In [None]:
# Residual = actual price minus fitted price, plotted against the actual
# price (column 0) for every row of the test set.
test.with_column('Residual', test_prices-test.drop(0).apply(fit)).scatter(0, 'Residual')
# Horizontal reference line at residual 0. matplotlib.pyplot is imported as
# plt in this notebook; the name `plots` is not defined, so the original call
# raised a NameError.
plt.plot([0, 7e5], [0, 0]);