In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 19 - Residuals

In [None]:
# Read Galton's family-height data, then keep only the two columns used in
# this lecture, under shorter labels.
galton = Table.read_table('galton.csv')

heights = (
    galton
    .select('midparentHeight', 'childHeight')
    .relabeled('midparentHeight', 'MidParent')
    .relabeled('childHeight', 'Child')
)

In [None]:
heights

In [None]:
def standard_units(any_numbers):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of the array. np.std is
    called with its defaults, so the SD is the population SD.
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

# In the functions below, t is a table; x and y are column labels (or column indices)

def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    r is the mean of the products of the two columns after each has been
    converted to standard units.
    """
    x_in_su = standard_units(t.column(x))
    y_in_su = standard_units(t.column(y))
    return np.mean(x_in_su * y_in_su)

def slope(t, x, y):
    """Return the slope of the regression line for estimating y from x.

    slope = r * (SD of y) / (SD of x), where r is the correlation between
    columns x and y of table t.
    """
    sd_y = np.std(t.column(y))
    sd_x = np.std(t.column(x))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Return the intercept of the regression line for estimating y from x.

    intercept = (mean of y) - slope * (mean of x)
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

def fitted_values(t, x, y):
    """Return the regression estimate of y for every value in column x of t.

    The result is an array with one entry per row of t: the height of the
    regression line (slope * x + intercept) at that row's x value.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return line_slope * x_values + line_intercept

def residuals(t, x, y):
    """Return the residuals of the regression of y on x in table t.

    A residual is the observed y value minus the fitted (regression) value
    for the same row.
    """
    observed = t.column(y)
    estimated = fitted_values(t, x, y)
    return observed - estimated

In [None]:
# Attach the regression estimates and the residuals as two new columns.
# with_columns (plural) is the Table method that accepts several
# label/values pairs; with_column (singular) takes a single label and values.
# Re-running this cell is safe: columns with an existing label are replaced.
heights = heights.with_columns(
    'Fitted Value', fitted_values(heights, 'MidParent', 'Child'),
    'Residual', residuals(heights, 'MidParent', 'Child')
)
heights

In [None]:
# Scatter plot with column 0 ('MidParent') on the horizontal axis. No y columns
# are selected, so the other columns should all be overlaid against it
# (per the datascience Table.scatter default — confirm for your version).
heights.scatter(0)

In [None]:
def plot_residuals(t, x, y):
    """Draw two scatter plots for the regression of y on x in table t.

    The first plot shows y and the fitted values against x; the second
    shows the residuals against x.
    """
    fitted = fitted_values(t, x, y)
    errors = residuals(t, x, y)
    augmented = t.with_columns('Fitted', fitted, 'Residual', errors)

    # Plot 1: after the select, x is column 0, so y and 'Fitted' are drawn against it.
    with_line = augmented.select(x, y, 'Fitted')
    with_line.scatter(0)

    # Plot 2: residuals against x.
    augmented.scatter(x, 'Residual')

In [None]:
plot_residuals(heights, 'MidParent', 'Child')

## Nonlinearity ##

In [None]:
dugong = Table.read_table('dugong.csv')

In [None]:
dugong.show()

In [None]:
correlation(dugong, 'Length', 'Age')

In [None]:
plot_residuals(dugong, 'Length', 'Age')

In [None]:
data = Table.read_table('us_women.csv')

In [None]:
data

In [None]:
correlation(data, 0, 1)

In [None]:
plot_residuals(data, 0, 1)

## Average of Residuals ##

In [None]:
# Average residual for the dugong regression, rounded to 6 decimal places.
# Because intercept() is defined as mean(y) - slope * mean(x), the residuals
# average to zero up to floating-point error.
round(np.average(residuals(dugong, 'Length', 'Age')), 6)

In [None]:
# Average residual for the regression of Child on MidParent, rounded to
# 6 decimal places; zero up to floating-point error for the same reason
# (the intercept is mean(y) - slope * mean(x)).
round(np.average(residuals(heights, 'MidParent', 'Child')), 6)

## A Measure of Clustering ##

In [None]:
def plot_fitted(t, x, y):
    """Scatter y and the regression fitted values against x for table t.

    Only columns x and y are kept, a 'Fitted Value' column is added, and
    everything is plotted against the first column (x).
    """
    xy_only = t.select(x, y)
    fitted = fitted_values(t, x, y)
    xy_only.with_columns('Fitted Value', fitted).scatter(0)

In [None]:
plot_fitted(heights, 0, 1 )

In [None]:
# SD of the fitted values next to the SD of the observed values in column 1
# ('Child'); the cell displays the pair as a tuple.
np.std(fitted_values(heights, 0, 1)), np.std(heights.column(1))

In [None]:
# Ratio (SD of fitted values) / (SD of observed Child heights); compare it
# with the correlation between columns 0 and 1 of heights.
np.std(fitted_values(heights, 0, 1)) / np.std(heights.column(1))

In [None]:
correlation(heights, 0, 1)

In [None]:
correlation(dugong, 0, 1)

In [None]:
# Same SD ratio for the dugong table, using columns 0 and 1
# (presumably 'Length' and 'Age' — verify against the column order of dugong.csv).
np.std(fitted_values(dugong, 0, 1)) / np.std(dugong.column(1))

In [None]:
hybrid = Table.read_table('hybrid.csv')

In [None]:
hybrid

In [None]:
plot_fitted(hybrid, 3, 4)

In [None]:
correlation(hybrid, 'acceleration', 'mpg')

In [None]:
# Ratio (SD of fitted values) / (SD of observed mpg) for the regression of
# mpg on acceleration; compare it with the absolute value of the correlation.
np.std(fitted_values(hybrid, 'acceleration', 'mpg'))/np.std(hybrid.column('mpg'))

No matter what the shape of the scatter plot, the SD of the fitted values is a fraction of the SD of the observed values of $y$. The fraction is $|r|$.

$$
\frac{\mbox{SD of fitted values}}{\mbox{SD of }y} ~=~ |r| ~~~~~~~~~~ \mbox{That is,} ~~ \mbox{SD of fitted values} = |r|\cdot \mbox{SD of }y
$$

## SD of the Residuals ##
No matter what the shape of the scatter plot, the SD of the residuals is a fraction of the SD of the observed values of $y$. The fraction is $\sqrt{1-r^2}$.

$$
\mbox{SD of residuals} ~=~ \sqrt{1 - r^2} \cdot \mbox{SD of }y
$$

In [None]:
# Regression of Child on MidParent, with a flat line at the average child
# height drawn from x = 64 to x = 76 on the same axes for comparison.
ave_child = np.mean(heights.column('Child'))
plot_fitted(heights, 'MidParent', 'Child')
plots.plot([64, 76], [ave_child] * 2);

In [None]:
# SD of the residuals for the regression of Child on MidParent, computed
# directly from the residuals.
np.std(residuals(heights, 'MidParent', 'Child'))

In [None]:
r = correlation(heights, 'MidParent', 'Child')
r

In [None]:
# SD of the residuals from the formula sqrt(1 - r^2) * (SD of y).
# r must hold the MidParent/Child correlation here; the name r is reassigned
# later for the hybrid data, so run the heights cell that sets r first.
np.sqrt(1 - r**2) * np.std(heights.column('Child'))

In [None]:
# SD of the residuals for the regression of mpg on acceleration, computed
# directly from the residuals.
np.std(residuals(hybrid, 'acceleration', 'mpg'))

In [None]:
r = correlation(hybrid, 'acceleration', 'mpg')
r

In [None]:
# SD of the residuals from the formula sqrt(1 - r^2) * (SD of y).
# r must hold the acceleration/mpg correlation here (the same name held the
# heights correlation earlier), so run the hybrid cell that sets r first.
np.sqrt(1 - r**2)*np.std(hybrid.column('mpg'))