In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points on a new 5x5 figure with both axes fixed to [-4, 4].
    ``x`` and a second sample ``z`` are drawn from a standard normal, and
    ``y = r*x + sqrt(1 - r**2)*z``.

    Parameters
    ----------
    r : float
        Target correlation coefficient; must lie in [-1, 1].

    Raises
    ------
    ValueError
        If ``r`` is outside [-1, 1], where ``sqrt(1 - r**2)`` is not a
        real number.
    """
    # Validate before touching the figure so a bad r fails loudly instead of
    # silently producing NaN y-values.
    if not -1 <= r <= 1:
        raise ValueError("r must be between -1 and 1, got {!r}".format(r))
    plots.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y, color='darkblue', s=20)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

## Prediction ##

In [None]:
# Read the Galton heights table from CSV; the bare name on the last line
# displays it as the cell's output.
galton = Table.read_table('data/galton.csv')
galton

In [None]:
(78.5+67) / 2
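# The line above is the simple average of two parent heights (presumably one
# family's father and mother -- confirm against the table).
# midparentHeight is a midpoint between the mother's and father's heights,
# but it leans more heavily toward the taller parent's height than this
# simple average does.
# NOTE(review): presumably Galton's definition, (father + 1.08 * mother) / 2 --
# confirm against the data.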


In [None]:
# Build a two-column table from galton: midparent height and child height,
# relabeled 'MidParent' and 'Child'.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )

In [None]:
heights

In [None]:
heights.scatter('MidParent')

In [None]:
def predict_child(h):
    """Predict a child's height from a midparent height of h.

    Selects the rows of the global ``heights`` table whose 'MidParent'
    value lies within 0.5 inches of h and returns the mean of their
    'Child' values.

    NOTE(review): ``are.between`` presumably includes the lower bound and
    excludes the upper bound -- confirm against the datascience docs.
    """
    lower, upper = h-0.5, h + 0.5
    in_window = are.between(lower, upper)
    nearby = heights.where('MidParent', in_window)
    child_heights = nearby.column('Child')
    return child_heights.mean()

In [None]:
# Apply predict_child to every 'MidParent' value and attach the results
# as a new 'Prediction' column; the last line displays the new table.
heights_with_predictions = heights.with_column(
    'Prediction', heights.apply(predict_child, 'MidParent')
    )
heights_with_predictions

In [None]:
heights_with_predictions.scatter('MidParent')

In [None]:
# how would you describe the trend? positive/negative association?
# and pattern? clustered? sparse? linear/nonlinear?



## Association ##

In [None]:
hybrid = Table.read_table('data/hybrid.csv')

In [None]:
hybrid.show()

In [None]:
hybrid.sort('msrp', descending=True)

In [None]:
hybrid.sort('mpg', descending=True)

In [None]:
hybrid.scatter('mpg', 'msrp')

In [None]:
hybrid.scatter('acceleration', 'mpg')

# what is the trend you see? (negative/positive)
# what is the pattern? (shape, linearity)

In [None]:
# Keep only the rows of hybrid whose 'class' is 'SUV', then display them.
suv = hybrid.where('class', 'SUV')
#suv.num_rows
suv

In [None]:
suv.scatter('acceleration', 'msrp')

In [None]:
suv.scatter('mpg', 'msrp')

In [None]:
def standard_units(x):
    """Return x rescaled to standard units.

    Each value becomes its deviation from the average of x, divided by
    the standard deviation of x (``np.std`` with its default settings).
    """
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
# Scatter SUV mpg (column 0) against msrp (column 1), both converted to
# standard units, with both axes fixed to [-3, 3].
Table().with_columns(
    'mpg (standard units)',  standard_units(suv.column('mpg')), 
    'msrp (standard units)', standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

In [None]:
# Scatter SUV acceleration (column 0) against msrp (column 1), both converted
# to standard units, with both axes fixed to [-3, 3].
Table().with_columns(
    'acceleration (standard units)', standard_units(suv.column('acceleration')), 
    'msrp (standard units)',         standard_units(suv.column('msrp'))
).scatter(0, 1)
plots.xlim(-3, 3)
plots.ylim(-3, 3);

## Correlation ##

In [None]:
r_scatter(0)

## Calculating $r$ ##

In [None]:
# Small hand-made example: x is the integers 1 through 6, y is six
# hand-picked values. The last line displays the table.
x = np.arange(1, 7, 1)
y = make_array(2, 3, 1, 5, 2, 7)
t = Table().with_columns(
        'x', x,
        'y', y
    )
t

In [None]:
t.scatter('x', 'y', s=30, color='red')

In [None]:
# Add x and y converted to standard units; later cells refer to these
# two columns by index as columns 2 and 3.
t = t.with_columns(
        'x (standard units)', standard_units(x),
        'y (standard units)', standard_units(y)
    )
t

In [None]:
# Add the row-by-row product of the two standard-unit columns (2 and 3).
t = t.with_columns('product of standard units', t.column(2) * t.column(3))
t

In [None]:
# r is the average of the products of the standard units

r = np.average(t.column(2) * t.column(3))
# Equivalent computation: the 'product of standard units' column (index 4)
# already stores those products, so averaging it gives the same r.
# This second assignment simply overwrites the first.
r = np.average(t.column(4))
r

In [None]:
def correlation(t, x, y):
    """Return the correlation coefficient r of two columns of a table.

    t is a table; x and y are column labels. Each column is converted to
    standard units, and r is the average of the element-wise products.
    """
    x_su, y_su = (standard_units(t.column(label)) for label in (x, y))
    return np.average(x_su * y_su)

In [None]:
correlation(t, 'x', 'y')

In [None]:
correlation(suv, 'mpg', 'msrp')

In [None]:
correlation(suv, 'acceleration', 'msrp')

In [None]:
correlation(hybrid,'acceleration','mpg')

### Switching Axes ###

In [None]:
correlation(t, 'x', 'y')

In [None]:
t.scatter('x', 'y', s=30, color='red')

In [None]:
t.scatter('y', 'x', s=30, color='red')

In [None]:
correlation(t, 'y', 'x')

### Nonlinearity ###

In [None]:
# y is an exact but nonlinear function of x (y = x**2), sampled on a grid
# from -4 to 4 in steps of 0.5 (the 4.1 stop value keeps 4 in the grid).
new_x = np.arange(-4, 4.1, 0.5)
nonlinear = Table().with_columns(
        'x', new_x,
        'y', new_x**2
    )
nonlinear.scatter('x', 'y', s=30, color='r')
#nonlinear

In [None]:
correlation(nonlinear, 'x', 'y')

### Outliers ###

In [None]:
# Four points lying exactly on the line y = x; plot them, then display the table.
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4),
        'y', make_array(1, 2, 3, 4)
    )
line.scatter('x', 'y', s=30, color='r')
line

In [None]:
correlation(line, 'x', 'y')

In [None]:
# The same four points on the line y = x, plus one extra point (3, 0)
# that lies off the line.
outlier = Table().with_columns(
        'x', make_array(1, 2, 3,3, 4 ),
        'y', make_array(1, 2, 3,0, 4 )
    )
outlier.scatter('x', 'y', s=30, color='r')


In [None]:
correlation(outlier, 'x', 'y')

### Ecological Correlations ###

In [None]:
# Load the table in this cell: this is the first use of sat2014 in the
# notebook, so a bare reference here would raise NameError on a fresh
# kernel (Restart & Run All).
sat2014 = Table.read_table('data/sat2014.csv').sort('State')
sat2014

In [None]:
sat2014 = Table.read_table('data/sat2014.csv').sort('State')
# NOTE: sort returns a new table; in the middle of a cell its result is
# neither stored nor displayed.
sat2014.sort('Participation Rate')

# LOW participation rate in requiring SAT scores
# tell me what is the correlation between critical reading and math
# in the states with a participation rate of 50% and under?
# compute r (correlation coefficient)
# Keep the filtered table itself: scatter only draws the plot and returns
# None, so its result must not be what pr_50_under is bound to.
pr_50_under = sat2014.where('Participation Rate', are.below_or_equal_to(50))
pr_50_under.scatter('Critical Reading', 'Math')
# print the value: it is not the cell's last expression, so it would
# otherwise not be shown.
print('r (participation rate <= 50%):', correlation(pr_50_under, 'Critical Reading', 'Math'))
# is there some relationship between the states with a low participation rate?

# HIGH participation rate in requiring SAT scores
# tell me what is the correlation between critical reading and math
# in the states with a participation rate of 50% and higher?
# compute r (correlation coefficient)

# is there some relationship between the states with a high participation rate?
pr_50_under.show()

In [None]:
sat2014.scatter('Critical Reading', 'Math')

In [None]:
correlation(sat2014, 'Critical Reading', 'Math')

In [None]:
# does this correlation coefficient tell the whole story?

# how did your SAT or standardized testing scores compare?

# is critical reading that good of a predictor of math scores?
# was it for you?
