In [None]:
from datascience import *
import numpy as np
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
import matplotlib.pyplot as plots
# Apply matplotlib's 'fivethirtyeight' style sheet to every plot drawn below.
plots.style.use('fivethirtyeight')

## Classification ##

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Parameters:
        pt1, pt2: points given as equal-length numeric sequences
            (NumPy arrays, lists, tuples, ...).

    Returns:
        The square root of the sum of the squared coordinate differences.
    """
    # Coerce to float arrays first: plain lists/tuples can then be passed
    # directly, and unsigned-integer inputs no longer wrap around when one
    # coordinate is subtracted from a larger one.
    diff = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    # axis=0 reduces over the first axis, which is what the built-in sum did.
    return np.sqrt(np.sum(diff ** 2, axis=0))

def row_distance(row1, row2):
    """Distance between two table rows whose entries are all numeric.

    Each row is converted to a NumPy array and the pair is handed to
    ``distance``.
    """
    first_point = np.array(row1)
    second_point = np.array(row2)
    return distance(first_point, second_point)

def distances(training, example):
    """Compute the distance between example and every row in training.

    Parameters:
        training: table with a 'Class' column; all of its other columns
            are used as the coordinates of each row.
        example: row of attribute values to measure against.

    Returns:
        training augmented with a 'Distance' column holding, for each row,
        its distance to example.
    """
    attributes = training.drop('Class')
    # Gather the distances in a list and convert once. np.append returns a
    # fresh copy on every call, so growing the array inside the loop costs
    # quadratic time. dtype=float keeps the column numeric even when the
    # table has no rows.
    row_distances = np.array(
        [row_distance(row, example) for row in attributes.rows],
        dtype=float,
    )
    return training.with_column('Distance', row_distances)

def closest(training, example, k):
    """Return a table holding the k rows of training nearest to example."""
    by_distance = distances(training, example).sort('Distance')
    first_k = np.arange(k)
    return by_distance.take(first_k)

def majority_class(topk):
    """Return the 'Class' value that occurs most often in topk."""
    class_counts = topk.group('Class')
    most_common_first = class_counts.sort('count', descending=True)
    labels = most_common_first.column(0)
    return labels.item(0)

def classify(training, example, k):
    """Classify example by majority vote among its k nearest neighbors in training."""
    neighbors = closest(training, example, k)
    return majority_class(neighbors)

In [None]:
# Read the credit data from credit.csv (path is relative to the working
# directory) and preview its first 3 rows.
credit = Table.read_table('credit.csv')
credit.show(3)

In [None]:
# Keep every column of credit except LIMIT_BAL.
credit_payments = credit.drop('LIMIT_BAL')

In [None]:
# Attribute values (every column but 'Class') of the row at index 123;
# this is the example that gets classified in the next cell.
example123 = credit_payments.drop('Class').row(123)
example123

In [None]:
# Predict the class of row 123 from its 5 nearest neighbors. Row 123 itself
# is excluded from the training table so it cannot be its own neighbor.
classify(credit_payments.exclude(123), example123, 5)

In [None]:
# Show the full row at index 123, 'Class' included.
credit_payments.row(123)

## Evaluation ##

In [None]:
# Total number of rows in credit_payments.
credit_payments.num_rows

In [None]:
# Split by position, keeping the table's existing order (no shuffling):
# rows 0-499 become the training set, rows 500-999 the test set.
training_set = credit_payments.take(np.arange(500))
test_set = credit_payments.take(np.arange(500, 1000))

In [None]:
def evaluate_accuracy(training, test, k):
    """Return the proportion of correctly classified examples
    in the test set.

    Parameters:
        training: table of labelled examples handed to ``classify``.
        test: table of examples to classify; its 'Class' column holds the
            true labels. It must contain at least one row, because the
            result is divided by ``test.num_rows``.
        k: number of nearest neighbors that vote on each prediction.

    Returns:
        The number of correct predictions divided by the number of test rows.
    """
    test_attributes = test.drop('Class')
    # The true labels do not change inside the loop, so fetch the column once
    # instead of on every iteration.
    true_classes = test.column('Class')
    num_correct = 0
    for i in np.arange(test.num_rows):
        predicted = classify(training, test_attributes.row(i), k)
        # The comparison yields a boolean, which adds as 0 or 1.
        num_correct = num_correct + (predicted == true_classes.item(i))
    return num_correct / test.num_rows

In [None]:
# Accuracy of the 3-nearest-neighbor classifier on the test set.
evaluate_accuracy(training_set, test_set, 3)

In [None]:
# Display the 'Class' column in table order.
# NOTE(review): presumably shown to check how the labels are ordered before
# the positional split above is replaced by a shuffled one — confirm.
credit_payments.column('Class')

In [None]:
# Shuffle the rows (sampling without replacement) before splitting, so the
# training and test sets no longer depend on the table's original order.
# NOTE(review): no random seed is set in the visible cells, so the accuracies
# below will presumably differ from run to run — confirm whether that is intended.
shuffled = credit_payments.sample(with_replacement=False)
training_set = shuffled.take(np.arange(500))
test_set = shuffled.take(np.arange(500, 1000))

In [None]:
# Accuracy with k = 3 on the shuffled split.
evaluate_accuracy(training_set, test_set, 3)

In [None]:
# Accuracy with k = 5 on the same shuffled split.
evaluate_accuracy(training_set, test_set, 5)

In [None]:
# Here the training set doubles as the test set, so every example being
# classified is also present in the training table (at distance 0 from itself).
evaluate_accuracy(training_set, training_set, 3)

## Decisions ##

In [None]:
# np.array(list) builds an array out of a Python list;
# every element of the list should have the same type.

# A class of n students: 60% second years, 40% third years.
n = 100
second = round(n * 0.6)
third = round(n * 0.4)

year = np.array(['Second'] * second + ['Third'] * third)

# Half of the second years have declared a major; 80% of the third years have.
# The order of the pieces lines up with the order of `year`.
major = np.array(
    ['Declared'] * round(second * 0.5)
    + ['Undeclared'] * round(second * 0.5)
    + ['Declared'] * round(third * 0.8)
    + ['Undeclared'] * round(third * 0.2)
)

students = Table().with_columns('Year', year, 'Major', major)
students.show(3)

In [None]:
# Cross-tabulate the students: one row per Year, one column per Major value,
# each cell counting the students in that combination.
students.pivot('Major', 'Year')

In [None]:
# Chance that a student is a junior (third year), given that they have declared
# P(third year | declared)
# 32 declared third years out of 30 + 32 = 62 declared students.

32 / 62

In [None]:
# P(second year | declared)
# 30 declared second years out of 30 + 32 = 62 declared students.

30 / 62

In [None]:
# P(second year | declared) again, this time from the proportions directly
# (Bayes' rule): P(second) * P(declared | second) divided by the total
# P(declared) = P(second) * P(declared | second) + P(third) * P(declared | third).
(0.6 * 0.5) / ((0.6 * 0.5) + (0.4 * 0.8))

In [None]:
# P(disease | Test +) by Bayes' rule, with P(disease) = 0.001,
# P(Test + | disease) = 1 and P(Test + | no disease) = 0.05.
(0.001 * 1) / ((0.001 * 1) + (0.999 * 0.05))

In [None]:
# A population of n people in which 0.1% have the disease.
n = 10000
disease = round(n * 0.001)
no_disease = round(n * 0.999)

status = np.array(['Disease'] * disease + ['No disease'] * no_disease)

# Everyone with the disease tests positive; of those without it,
# 5% test positive and 95% test negative.
# The order of the pieces lines up with the order of `status`.
result = np.array(
    ['Test +'] * disease
    + ['Test +'] * round(no_disease * 0.05)
    + ['Test -'] * round(no_disease * 0.95)
)

persons = Table().with_columns('Status', status, 'Test Result', result)
persons.show(3)

In [None]:
# Cross-tabulate the population: one row per Status, one column per
# Test Result value, each cell counting the people in that combination.
persons.pivot('Test Result', 'Status')

In [None]:
# NOTE(review): presumably the 'Test +' and 'Test -' counts of the
# 'No disease' row in the pivot above, adding up to everyone without the
# disease — confirm against the displayed table.
500 + 9490

## Using Subjective Prior Probabilities ##

In [None]:
# P(disease | Test +) with a subjective prior P(disease) = 0.1.
# The complementary prior must be 1 - 0.1 = 0.9 so the two priors sum to 1.
(0.1 * 1) / ((0.1 * 1) + (0.9 * 0.05))

In [None]:
# P(disease | Test +) with a 50/50 prior: P(disease) = P(no disease) = 0.5.
(0.5 * 1) / ((0.5 * 1) + (0.5 * 0.05))

In [None]:
def create_population(prior_disease_prob, n, false_positive_rate=0.05):
    """Build a population and cross-tabulate disease status against test result.

    Everyone with the disease tests positive. Among those without it, a
    share of ``false_positive_rate`` tests positive and the rest test negative.

    Parameters:
        prior_disease_prob: proportion of the population that has the disease.
        n: population size.
        false_positive_rate: proportion of disease-free people who test
            positive (default 0.05).

    Returns:
        A pivot table with one row per Status and one column per Test Result
        value, each cell counting the people in that combination.
    """
    disease = round(n * prior_disease_prob)
    no_disease = round(n * (1 - prior_disease_prob))

    # Derive the true negatives by subtraction so the two disease-free groups
    # always add up to no_disease. Rounding both shares independently can miss
    # the total by one (e.g. a rate of 0.5 with an odd count), which would
    # leave the two columns with different lengths.
    false_positives = round(no_disease * false_positive_rate)
    true_negatives = no_disease - false_positives

    status = np.array(['Disease'] * disease + ['No disease'] * no_disease)
    # The order of the pieces lines up with the order of `status`.
    result = np.array(
        ['Test +'] * disease
        + ['Test +'] * false_positives
        + ['Test -'] * true_negatives
    )

    t = Table().with_columns(
        'Status', status,
        'Test Result', result
    )
    return t.pivot('Test Result', 'Status')

In [None]:
# Status-by-result counts for 10000 people when half of them have the disease.
create_population(0.5, 10000)

In [None]:
# P(disease | Test +) from counts: 5000 people with the disease (all of whom
# test positive) out of 5000 + 250 positive tests, where 250 is 5% of the
# 5000 people without the disease.
5000 / (5000 + 250)