In [None]:
from datascience import *
import numpy as np
## Normal Distribution
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
from IPython.display import Image
from IPython.core.display import HTML 

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points on a new 5x5 figure. x and z are independent
    standard-normal samples and y = r*x + sqrt(1 - r**2)*z, so the
    correlation between x and y is approximately r. Both axes are
    fixed to the range [-4, 4].

    r must lie in [-1, 1]; otherwise sqrt(1 - r**2) is not a real number
    and every y value would be NaN.

    Raises:
        ValueError: if r is outside [-1, 1].
    """
    # Validate before touching the figure so a bad r leaves no empty plot behind.
    if np.any(np.abs(r) > 1):
        raise ValueError("r must be between -1 and 1, got {!r}".format(r))
    plots.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mixing x with independent noise z in these proportions keeps SD(y) = 1.
    y = r*x + (np.sqrt(1-r**2))*z
    plots.scatter(x, y)
    plots.xlim(-4, 4)
    plots.ylim(-4, 4)

# Prediction

In [None]:
# Reminder: Galton's height data, used for the prediction example below.

# Source: http://inferentialthinking.com/notebooks/galton.csv
galton = Table.read_table('galton.csv')



# Keep only the two columns needed here, relabeled 'MidParent' and 'Child'.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )

# Column 0 is 'MidParent'; it goes on the horizontal axis.
heights.scatter(0)


In [None]:
#prediction

def predict_child(parent):
    """Predict the height of a child from a midparent height of `parent`.

    Looks up the rows of the notebook-level `heights` table whose
    'MidParent' value is within 0.5 inches of `parent` and returns the
    mean of their 'Child' heights.
    """
    lower, upper = parent - 0.5, parent + 0.5
    # NOTE(review): are.between is documented as including the lower bound
    # and excluding the upper one - confirm against the datascience docs.
    nearby = heights.where('MidParent', are.between(lower, upper))
    child_heights = nearby.column('Child')
    return child_heights.mean()

# Add a 'Prediction' column: predict_child applied to every 'MidParent' value.
heights_and_predict = heights.with_column(
    'Prediction', heights.apply(predict_child, 'MidParent')
)

# Plot the remaining columns ('Child' and 'Prediction') against column 0 ('MidParent').
heights_and_predict.scatter(0)

Goal: a perfectly straight line instead of the slightly wiggly yellow curve.

Variables need to be related to be able to predict one from the other.


## Association

In [None]:
Image("relation.png", width=600, height=300)

In [None]:
# Source: http://inferentialthinking.com/notebooks/hybrid.csv
# Columns used later in this notebook: 'acceleration', 'msrp', 'mpg'.
hybrid = Table.read_table('hybrid.csv')
hybrid

In [None]:
#Do you expect to see an association between acceleration and price? What kind of association? 

hybrid.scatter('acceleration', 'msrp')

In [None]:
Image("image1.png", width=800, height=400)

In [None]:
#Do you expect to see an association between mpg and price? What kind of association? 

hybrid.scatter('mpg', 'msrp')

This one looks less like a line, more curved.  
When observing whether there is an association,  
and whether that association is linear,   
we only look at shape of points, not the units on the axes.  
To be more general, use standard units.

In [None]:
# helper methods:

def standard_units(any_numbers):
    """Convert any array of numbers to standard units.

    Each value is replaced by its distance from the array's mean,
    measured in multiples of the array's standard deviation (np.std).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

def standardize(t):
    """Return a new table with every column of t converted to standard units.

    Each output column keeps its original label with ' (su)' appended.
    """
    result = Table()
    for name in t.labels:
        values_su = standard_units(t.column(name))
        result = result.with_column(name + ' (su)', values_su)
    return result

# Standard-unit versions of the three columns plotted in this section;
# the new labels are 'msrp (su)', 'acceleration (su)' and 'mpg (su)'.
hybrid_su = standardize(hybrid.select('msrp', 'acceleration','mpg'))
hybrid_su

In [None]:
hybrid_su.scatter('acceleration (su)', 'msrp (su)')

In [None]:
hybrid_su.scatter('mpg (su)', 'msrp (su)')

Notice that the shapes are the same, so we can do all of our analysis in standard units and keep it fully general.

One of these still looks more like a line than the other - can we make that more precise? 

In [None]:
Image("image2.png", width=600, height=300)

In [None]:
#try out different values of r

r_scatter(0)

In [None]:
Image("image5.png", width=800, height=200)

In [None]:
Image("image6.png", width=600, height=300)

Positive association: below-average values of x usually go with below-average  
values of y (and above-average with above-average), so when both are measured in standard units the sign of their product is usually positive.

In [None]:
def correlation(t, label_x, label_y):
    """Return the correlation coefficient r between two columns of table t.

    r is the mean of the products of the two columns' values after each
    column has been converted to standard units.
    """
    x_su = standard_units(t.column(label_x))
    y_su = standard_units(t.column(label_y))
    return np.mean(x_su * y_su)

In [None]:
# Scatter plot of the pair, then their correlation coefficient
# (the cell's last expression, so it is shown as the cell output).
hybrid.scatter('acceleration', 'msrp')
correlation(hybrid, 'acceleration', 'msrp')

In [None]:
# Scatter plot of the pair, then their correlation coefficient
# (the cell's last expression, so it is shown as the cell output).
hybrid.scatter('mpg', 'msrp')
correlation(hybrid, 'mpg', 'msrp')

In [None]:
Image("image4.png", width=600, height=300)

In [None]:






# Important: Not correlated does not mean not related. 
# First visualize, then quantify 

We know how to make predictions regardless of correlation (as we did with Galton's data).  

Correlation tells you something about accuracy of your predictions:  
higher magnitude (absolute value) of correlation means more accurate predictions

## Regression Line

In [None]:
#Back to Galton's data

#Trying to discover equation for yellow curve - would like a perfect line

heights_and_predict.scatter(0)

In [None]:
# How strong is the linear trend?

correlation(heights_and_predict, 'MidParent', 'Child')

In [None]:
# We take our heights and convert them to standard units:

standardize(heights).scatter(0)
# Use the same [-4, 4] range on both axes.
plots.xlim(-4, 4)
plots.ylim(-4, 4)

When converting to standard units, only the axes change; the shape of the scatter stays the same.

In [None]:
# Plot a natural line of symmetry, 45 degree angle (y=x). Is this a good prediction line?

standardize(heights).scatter(0)
plots.xlim(-4, 4)
plots.ylim(-4, 4)
# Red line from (-4, -4) to (4, 4), i.e. y = x across the visible range.
plots.plot([-4, 4], [-4, 4], color='r', lw=2)


In [None]:
# Green line: drawn at the particular x value that I want to predict from

standardize(heights).scatter(0)
plots.xlim(-4, 4)
plots.ylim(-4, 4)
plots.plot([-4, 4], [-4, 4], color='r', lw=2)  # red: y = x
# Green: vertical line at x = 2.5 (standard units) spanning the full y range.
plots.plot([2.5, 2.5], [-4, 4], color='g', lw=2)

In [None]:
# Blue line goes through the centers of the vertical strips - it is flatter
# than the red y = x line.

standardize(heights).scatter(0)
plots.xlim(-4, 4)
plots.ylim(-4, 4)
plots.plot([-4, 4], [-4, 4], color='r', lw=2)  # red: y = x
plots.plot([2.5, 2.5], [-4, 4], color='g', lw=2)  # green: vertical line at x = 2.5

# Columns 0 and 1 of heights are 'MidParent' and 'Child'.
# r is a notebook-level name; later cells assign and read it as well.
r = correlation(heights, 0, 1)
# Line through the origin with slope r (standard units).
plots.plot([-4, 4], [-4*r, 4*r], color='dodgerblue', lw=2)

Notice that blue and red lines both go through (0,0). Why is this reasonable?

What is the slope of the blue line?

Positive or negative? 

More or less than one?

The blue line is called the regression line; it goes through the centers of the vertical strips of points.

Can you find equation of a line that goes through origin with a given slope?

In [None]:
Image("image3.png", width=700, height=150)

back to slides - graph of averages

## Regression Line for Prediction

In [None]:
# Question: How does the regression line compare to our original prediction?

heights_and_predict.scatter(0)

In [None]:
# Notebook-level summary statistics read by predict_with_r:
# the correlation r, plus the mean and SD of each variable in original units.
r = correlation(heights_and_predict, 'MidParent', 'Child')
parent_mean = np.mean(heights.column('MidParent'))
parent_sd = np.std(heights.column('MidParent'))
child_mean = np.mean(heights.column('Child'))
child_sd = np.std(heights.column('Child'))

In [None]:
def predict_with_r(parent):
    """Predict a child's height from a midparent height of `parent`
    using linear regression.

    Relies on the notebook-level values r, parent_mean, parent_sd,
    child_mean and child_sd.
    """
    # Express the midparent height in standard units.
    x_su = (parent - parent_mean) / parent_sd
    # In standard units the regression prediction is r times x.
    y_su = r * x_su
    # Convert the prediction back to the child's original units.
    return y_su * child_sd + child_mean

In [None]:
# Parent -> child 

predict_with_r(68)

In [None]:
# Parent -> child 

predict_with_r(74)

In [None]:
# Parent -> child 

predict_with_r(60)

In [None]:
# Now with all the midparent heights: add the regression prediction as a new
# column so it is plotted alongside 'Child' and 'Prediction'.

heights_and_predict.with_column(
    'Prediction with r', 
    heights_and_predict.apply(predict_with_r, 'MidParent')).scatter(0)


back to slides - slope and intercept

## Slope and Intercept

In [None]:
# How to work in original units?

def correlation(t, x, y):
    """Return the correlation coefficient r between columns x and y of table t.

    r is the mean of the products of the two columns in standard units.
    Note: this repeats the `correlation` helper defined earlier in this
    notebook and replaces it once this cell runs.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """The slope of the regression line (original units).

    Computed as r * SD(y) / SD(x), where r is the correlation between
    columns x and y of table t.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """The intercept of the regression line (original units).

    Computed as mean(y) - slope * mean(x) for columns x and y of table t.
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x


In [None]:
hybrid.scatter('acceleration', 'msrp', fit_line = True)

# Make predictions with the line y = m*x + b
# (m is the regression slope and b the intercept, both in original units)

m = slope(hybrid, 'acceleration', 'msrp')
b = intercept(hybrid, 'acceleration', 'msrp')

print(m, b)

In [None]:
# Predict the retail price of a hybrid with acceleration of 12.5
# by evaluating the regression line y = m*x + b at x = 12.5.

m*12.5+b

For each additional unit of acceleration (km per hour per sec), how does the price change?