In [None]:
from datascience import *
import numpy as np
## Normal Distribution
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
from IPython.display import Image
from IPython.core.display import HTML 

In [None]:
# helper methods:

def standard_units(any_numbers):
    """Convert a collection of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of the collection. The
    population standard deviation (np.std with its default ddof=0) is used.

    Parameters:
        any_numbers: a NumPy array, or a plain list/tuple of numbers.

    Returns:
        The values in standard units, with the same length as the input.

    Note: if every value is identical the standard deviation is 0, so the
    division yields nan values (NumPy emits a RuntimeWarning, not an error).
    """
    # Plain Python sequences do not support `sequence - scalar`, so coerce
    # them to an array first. Array-like inputs are passed through untouched
    # so existing callers see exactly the same behaviour as before.
    if isinstance(any_numbers, (list, tuple)):
        any_numbers = np.asarray(any_numbers)
    return (any_numbers - np.mean(any_numbers)) / np.std(any_numbers)

def standardize(t):
    """Build a new table holding every column of t in standard units.

    Each output column is labelled with the source column's label followed
    by ' (su)'. The input table t is only read, never modified.
    """
    standardized = Table()
    for name in t.labels:
        su_label = name + ' (su)'
        su_values = standard_units(t.column(name))
        standardized = standardized.with_column(su_label, su_values)
    return standardized

def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of t.

    r is the mean of the element-wise products of the two columns after each
    has been converted to standard units.
    """
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """Slope of the regression line for predicting column y from column x.

    Computed in original units as r * SD(y) / SD(x), where r is the
    correlation between the two columns.
    """
    sd_y = np.std(t.column(y))
    sd_x = np.std(t.column(x))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Intercept of the regression line for predicting column y from column x.

    Computed in original units as mean(y) - slope * mean(x), which is the
    value that makes the line pass through the point of averages.
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def fit(t, x, y):
    """Return the regression estimate of y for every value in column x.

    The result has one entry per row of t: slope * x + intercept.
    """
    x_values = t.column(x)
    return slope(t, x, y) * x_values + intercept(t, x, y)


## Residuals

In [None]:
# Display the local image file resid.png (requested display size: width 700, height 150).
Image("resid.png", width=700, height=150)

In [None]:
# Load the Galton data from the local file galton.csv.
galton = Table.read_table('galton.csv')

# Two-column working table: 'midparentHeight' is relabelled 'MidParent'
# and 'childHeight' is relabelled 'Child'.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )


In [None]:
# function to calculate residuals

def residual(table, x, y):
    """Return the regression residuals: observed y minus the fitted y, one per row."""
    observed = table.column(y)
    predicted = fit(table, x, y)
    return observed - predicted


In [None]:
# Extend heights with the regression estimate of Child from MidParent and
# the matching residual (Child minus that estimate).
# Note: heights is rebound here; later cells read its 'Residual' column.
heights = heights.with_columns(
        'Fitted Value', fit(heights, 'MidParent', 'Child'),
        'Residual', residual(heights, 'MidParent', 'Child')
    )
heights

In [None]:
def scatter_fit(table, x, y):
    """Draw a scatter plot of y against x with the regression line on top.

    The line is drawn in gold through the fitted value at each x, and both
    axes are labelled with the column labels passed in.
    """
    table.scatter(x, y, s=15)
    x_values = table.column(x)
    fitted_values = fit(table, x, y)
    plots.plot(x_values, fitted_values, lw=4, color='gold')
    plots.xlabel(x)
    plots.ylabel(y)
    
# Child height against midparent height, with the regression line overlaid.
scatter_fit(heights, 'MidParent', 'Child')    

In [None]:
# A residual plot: plotting the residuals against the predictor variable (midparent height)

def residual_plot(table, x, y):
    """Scatter the residuals of the regression of y on x against column x.

    A horizontal reference line at residual 0 is drawn from the smallest to
    the largest observed x value.
    """
    predictor = table.column(x)
    residual_table = Table().with_columns(
        x, predictor,
        'residuals', residual(table, x, y),
    )
    residual_table.scatter(x, 'residuals', color='r')

    # Endpoints of the zero line span the observed range of the predictor.
    x_range = make_array(min(predictor), max(predictor))
    zero_line = make_array(0, 0)
    plots.plot(x_range, zero_line, color='darkblue', lw=4)
    plots.title('Residual Plot')


In [None]:
# Residuals of the Child-on-MidParent regression, plotted against MidParent.
residual_plot(heights, 'MidParent', 'Child')


In [None]:
# Display the local image file resid_plot.png (requested display size: width 700, height 150).
Image("resid_plot.png", width=700, height=150)

## What issues can residual plots detect?

In [None]:
# Discussion question
# Display the local image file dugong.png (requested display size: width 700, height 150).
Image("dugong.png", width=700, height=150)

In [None]:
# ages are estimates
# The data file is fetched over HTTP at run time, so this cell needs network access.
dugong = Table.read_table('http://www.statsci.org/data/oz/dugongs.txt')
# Make 'Length' (the predictor used in the cells below) the first column.
dugong = dugong.move_to_start('Length')
dugong.show()

In [None]:
# Scatter plot of the dugong data: Length on the horizontal axis, Age on the vertical axis.
dugong.scatter('Length', 'Age')

In [None]:
# Length is easy to measure. You know the length, predict the age

# Correlation coefficient between Length and Age.
correlation(dugong, 'Length', 'Age')

In [None]:
#helper method

def regression_diagnostic_plots(table, x, y):
    """Draw both regression diagnostics for predicting y from x.

    First the scatter plot with the regression line, then the residual plot.
    """
    for draw in (scatter_fit, residual_plot):
        draw(table, x, y)


# Fit a straight line for predicting Age from Length and draw both diagnostic plots.
regression_diagnostic_plots(dugong, 'Length', 'Age')



In [None]:
# Display the local image file 17_1.png (requested display size: width 800, height 400).
Image("17_1.png", width=800, height=400)

In [None]:
# How to fit a different shape curve

def dugong_mse(a, b, c):
    """Mean squared error of a quadratic curve for predicting dugong Age from Length.

    The candidate curve is a*x**2 + b*x + c, evaluated at every Length in the
    module-level dugong table; the result is the mean of the squared
    differences between the observed Age values and that curve.
    """
    lengths = dugong.column('Length')
    ages = dugong.column('Age')
    errors = ages - (a*lengths**2 + b*lengths + c)
    return np.mean(errors ** 2)

In [None]:
# Search for the (a, b, c) that make dugong_mse as small as possible.
# NOTE(review): assumes minimize returns the minimising arguments as an
# array in (a, b, c) order, which is how fit_quadratic indexes it — confirm.
coefficients = minimize(dugong_mse)
coefficients

In [None]:
def fit_quadratic(x):
    """Evaluate the fitted quadratic at x using the module-level coefficients.

    coefficients[0], [1] and [2] are used as the x**2, x and constant terms.
    """
    quadratic_term = coefficients[0] * x**2
    linear_term = coefficients[1] * x
    return quadratic_term + linear_term + coefficients[2]

# Keep only the predictor and the response, then add the quadratic prediction
# for each Length. Note: dugong is rebound here, so later cells see this
# three-column version of the table.
dugong = dugong.select('Length', 'Age')
dugong = dugong.with_column('Quadratic Fit', dugong.apply(fit_quadratic, 'Length'))
# presumably plots every other column against column 0 ('Length') on one set of axes — confirm
dugong.scatter(0)

In [None]:
# Load the local file hybrid.csv and display the resulting table.
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
# Fit a straight line for predicting mpg from acceleration and draw both diagnostic plots.
regression_diagnostic_plots(hybrid, 'acceleration', 'mpg')


In [None]:
# Display the local image file 17_2.png (requested display size: width 400, height 200).
Image("17_2.png", width=400, height=200)

In [None]:
# What does it mean? Predictions are not equally accurate for different values of acceleration 

In [None]:
# Residual plots are flat overall

# Draw the residual plot, then evaluate the correlation between the predictor
# (MidParent) and the 'Residual' column of heights.
residual_plot(heights, 'MidParent', 'Child')
correlation(heights, 'MidParent', 'Residual')

In [None]:
# The Average of the Residuals

# Mean of the 'Residual' column, rounded to 10 decimal places
# (presumably to hide floating-point noise in the result).
round(np.mean(heights.column('Residual')), 10)

In [None]:
# Add the straight-line regression estimate of Age from Length, and the
# matching residual (Age minus that estimate), to the dugong table.
# Note: dugong is rebound here; the cell that follows reads its 'Residual' column.
dugong = dugong.with_columns(
        'Fitted Value', fit(dugong, 'Length', 'Age'),
        'Residual', residual(dugong, 'Length', 'Age')
    )
dugong


In [None]:
# Draw the residual plot for the straight-line fit, then evaluate the
# correlation between the predictor (Length) and the 'Residual' column.
residual_plot(dugong, 'Length', 'Age')
correlation(dugong, 'Length', 'Residual')

Back to the slides for "Residual Plots are Flat Overall" and the Discussion Question.

## Regression Model: Signal and Noise

In [None]:
def draw_and_compare(true_slope, true_int, sample_size,
                     x_mean=50, x_sd=5, noise_sd=6, seed=None):
    """Simulate points around a true line and compare it with the regression line.

    x values are drawn from a normal distribution with mean x_mean and SD
    x_sd; each y is the point on the true line plus normally distributed
    noise with mean 0 and SD noise_sd. Four scatter plots are drawn in order:
    the points with the true line, the points alone, the points with the
    regression line, and the points with both lines.

    Parameters:
        true_slope: slope of the true line.
        true_int: intercept of the true line.
        sample_size: number of points to simulate; must be at least 1.
        x_mean, x_sd: mean and SD of the simulated x values (defaults 50, 5).
        noise_sd: SD of the noise added to the true line (default 6).
        seed: optional seed. When given, the same points are generated on
            every call; when None, NumPy's global random state is used.

    Raises:
        ValueError: if sample_size is less than 1.
    """
    if sample_size < 1:
        # Without this, the failure surfaces later as an opaque NumPy error.
        raise ValueError('sample_size must be at least 1')

    # A private RandomState keeps seeding local to this call, so it does not
    # alter NumPy's global random state for the rest of the notebook.
    rng = np.random if seed is None else np.random.RandomState(seed)

    x = rng.normal(x_mean, x_sd, sample_size)
    xlims = np.array([np.min(x), np.max(x)])
    errors = rng.normal(0, noise_sd, sample_size)
    y = (true_slope * x + true_int) + errors
    sample = Table().with_columns('x', x, 'y', y)

    def _plot_true_line():
        # The true line only needs its two endpoints over the observed x range.
        plots.plot(xlims, true_slope*xlims + true_int, lw=2, color='green')

    sample.scatter(0, 1)
    _plot_true_line()
    plots.title('True Line, and Points Created')

    sample.scatter(0, 1)
    plots.title('What We Get to See')

    sample.scatter(0, 1, fit_line=True)
    plots.title('Regression Line: Estimate of True Line')

    sample.scatter(0, 1, fit_line=True)
    _plot_true_line()
    plots.title("Regression Line and True Line")
    
# Demo: true line with slope 2 and intercept -5, simulated sample of 10 points.
draw_and_compare(2, -5, 10)