In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Each point is an array-like of numeric coordinates (array, list or
    tuple); both points must have the same number of coordinates.
    """
    # Convert up front so plain sequences also subtract element-wise.
    diff = np.asarray(pt1) - np.asarray(pt2)
    # np.sum reduces the whole array in compiled code, whereas the built-in
    # sum would walk the array one element at a time in the interpreter.
    return np.sqrt(np.sum(diff ** 2))

In [None]:
def row_distance(row1, row2):
    """Return the distance between two table rows that hold only numbers.

    Both rows are converted to arrays and handed to distance().
    """
    first_point, second_point = np.array(row1), np.array(row2)
    return distance(first_point, second_point)

In [None]:
def standard_units(x):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the average of the array.
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

# Wine Quality

In [None]:
# Load the red-wine data and discard the 'Quality' column in one chained
# expression. Chaining keeps this cell idempotent and independent of `wine`,
# which is only built in a later cell.
wine_raw = Table.read_table('winequality-red.csv').drop('Quality')
wine_raw.show(5)

In [None]:
# Attributes to rescale; the order here is the column order of `wine`.
feature_labels = [
    'Fixed Acidity',
    'Volatile Acidity',
    'Citric Acid',
    'Residual Sugar',
    'Chlorides',
    'Free Sulfur Dioxide',
    'Total Sulfur Dioxide',
    'Density',
    'pH',
    'Sulphates',
    'Alcohol',
]

# Build the alternating label, values, label, values, ... argument list,
# converting each attribute column to standard units along the way.
standardized_columns = []
for label in feature_labels:
    standardized_columns.append(label)
    standardized_columns.append(standard_units(wine_raw.column(label)))

# Keep the untouched 'Class' column first, followed by the rescaled attributes.
wine = wine_raw.select('Class').with_columns(*standardized_columns)

In [None]:
# Preview the first five rows of the standardized table.
wine.show(5)

In [None]:
# Count how many wines fall into each class.
wine.group('Class')

In [None]:
# Plot pH against residual sugar (both in standard units), with the points
# grouped by class.
wine.scatter('Residual Sugar', 'pH', group='Class')

In [None]:
# Keep only the attribute columns; the row distances below are computed on
# these, so the 'Class' label must not be part of the rows.
attributes = wine.drop('Class')
attributes.show(3)

In [None]:
# Distance between the first two wines across all standardized attributes.
row_distance(attributes.row(0), attributes.row(1))

In [None]:
# Distance between the first and third wines.
row_distance(attributes.row(0), attributes.row(2))

In [None]:
# Sanity check: the distance from a row to itself should be 0.
row_distance(attributes.row(2), attributes.row(2))

# The Classifier

In [None]:
def distances(training, example):
    """
    Compute distance between example and every row in training.

    training: table with a 'Class' column plus the attribute columns.
    example: row (or array) of attribute values, in the same order as the
        attribute columns of training.

    Return training augmented with a 'Distance_to_ex' column.
    """
    attributes_only = training.drop('Class')

    # Collect every distance first and build the array once: calling
    # np.append inside a loop copies the whole array on each iteration.
    #
    # Iterating over attributes_only.rows is the same as doing:
    #
    #   for i in np.arange(attributes_only.num_rows):
    #       row = attributes_only.row(i)
    #       ... row_distance(row, example) ...
    dists = np.array(
        [row_distance(row, example) for row in attributes_only.rows],
        dtype=float,  # keeps an empty result a float array
    )

    return training.with_column('Distance_to_ex', dists)

In [None]:
# Use the attributes of the wine at row index 6 as the example to classify.
example = attributes.row(6)
example

In [None]:
# Distance from the example to every other wine, sorted by distance; the
# example's own row is excluded so it cannot be matched with itself.
distances(wine.exclude(6), example).sort('Distance_to_ex')

In [None]:
def closest(training, example, k):
    """
    Return the first k rows of training after sorting by distance to
    example; the result keeps the 'Distance_to_ex' column
    """
    with_distances = distances(training, example)
    by_distance = with_distances.sort('Distance_to_ex')
    first_k = np.arange(k)
    return by_distance.take(first_k)

In [None]:
# The five wines nearest to the example.
closest(wine.exclude(6), example, 5)

In [None]:
# Tally the classes of those five neighbors, most common class first.
closest(wine.exclude(6), example, 5).group('Class').sort('count', descending=True)

In [None]:
def majority_class(topk):
    """
    Return the 'Class' value that occurs most often among the rows of topk
    """
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    # Column 0 of the grouped table holds the class labels.
    labels = most_common_first.column(0)
    return labels.item(0)

In [None]:
def classify(training, example, k):
    """
    Predict the class of example: find its k nearest
    neighbors in training and return their majority class
    """
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

In [None]:
# Predict the example's class from its five nearest neighbors.
classify(wine.exclude(6), example, 5)

In [None]:
# Show the example's own row, including its actual class, for comparison
# with the prediction.
wine.take(6)

In [None]:
# Repeat for the wine at row index 16, again leaving its own row out of the
# training table.
new_example = attributes.row(16)
classify(wine.exclude(16), new_example, 5)

In [None]:
# Actual row 16, to compare its class with the prediction.
wine.take(16)

In [None]:
# And once more for the wine at row index 15, leaving its own row out of the
# training table.
another_example = attributes.row(15)
classify(wine.exclude(15), another_example, 5)

In [None]:
# Actual row 15, to compare its class with the prediction.
wine.take(15)

## Accuracy of a Classifier ##

In [None]:
# Total number of wines available for the train/test split below.
wine.num_rows

In [None]:
# Number of shuffled rows used for training; every remaining row is held out
# for testing.
num_training = 1300

# NOTE: no random seed is set in this cell, so the split changes between runs.
shuffled = wine.sample(with_replacement=False) # Randomly permute the rows
training_set = shuffled.take(np.arange(num_training))
# Use the table's own row count rather than a hard-coded total so the test set
# always covers exactly the rows that were not used for training.
test_set  = shuffled.take(np.arange(num_training, shuffled.num_rows))

In [None]:
def evaluate_accuracy(training, test, k):
    """Classify every row of test using training and k neighbors, and
    return the fraction whose predicted class equals its 'Class' value"""
    features = test.drop('Class')
    true_labels = test.column('Class')
    # Each comparison is a boolean, so summing them counts the correct ones.
    hits = sum(
        classify(training, features.row(idx), k) == true_labels.item(idx)
        for idx in np.arange(test.num_rows)
    )
    return hits / test.num_rows

In [None]:
# Proportion of test-set wines classified correctly using 5 nearest neighbors.
evaluate_accuracy(training_set, test_set, 5)