In [None]:
#: the usual imports
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

plt.style.use('fivethirtyeight')

from notebook.services.config import ConfigManager

# Slideshow display settings stored under the "livereveal" config section
# (presumably read by the RISE slideshow extension -- TODO confirm).
cm = ConfigManager()
cm.update(
   "livereveal", {
       'width': 1200,
       'height': 700,
       "scroll": True,
})

# Lecture 17

### Prediction, Correlation, Regression

## Announcements

* Project is due Monday night.
* Thursday is a holiday, so no class or Programming Basics.
* I will still hold office hours on Sunday from 2 to 4pm.
* Lab 8 is due next Thursday, December 5. This will be the last assignment.

# Prediction

## Prediction Problems

* Predicting one characteristic based on another:
    - Given my height, how tall will my kid be as an adult?
    - Given my education level, what is my income?
    - Given my income, how much does my car cost?
    
* Two characteristics: one is known; one is unknown
* Have data for which we know both characteristics
* To predict, need an association

### Predict child height from height of parents

* Use mid-parent height as before.
* Observation: children of short parents tend to be short

In [None]:
# Load the galton.csv dataset (one row per child).
galton = Table.read_table('galton.csv')

# Keep only the two columns used for prediction, under shorter labels.
heights = Table().with_columns(
    'MidParent', galton.column('midparentHeight'),
    'Child', galton.column('childHeight')
    )

# Scatter plot with column 0 ('MidParent') on the x-axis.
heights.scatter(0)

### Predict child height from height of parents
* For a given child, predict their height by:
    - restricting ourselves to all midparents within $\pm 0.5$ inches of the child's midparent height;
    - averaging the height of all children of those midparents.

In [None]:
def predict_child(parent):
    """Predict a child's height from a midparent height.

    Selects the rows of the notebook-level `heights` table whose 'MidParent'
    value falls in the window from parent - 0.5 to parent + 0.5 and returns
    the mean of their 'Child' values.
    """
    lower, upper = parent - 0.5, parent + 0.5
    nearby = heights.where('MidParent', are.between(lower, upper))
    child_heights = nearby.column('Child')
    return child_heights.mean()

# Add a 'Prediction' column: predict_child applied to every row's MidParent value.
heights_and_predict = heights.with_column(
    'Prediction', heights.apply(predict_child, 'MidParent')
)

# Plot against column 0 ('MidParent') on the x-axis.
heights_and_predict.scatter(0)

## Graph of Averages
A visualization of x and y pairs
* Group each x value with other nearby x values
* Average the corresponding y values for each group
* For each x value, produce one predicted y value

## Relations between two variables
* Association
* Trend
    - Positive association
    - Negative association
* Pattern
    - Any discernible "shape"
    - Linear
    - Non-Linear

## Variable relationships: hybrid cars

In [None]:
hybrid = Table.read_table('hybrid.csv')
hybrid

### Acceleration and price
* Is there an association?
* What kind of association?

In [None]:
hybrid.scatter('acceleration', 'msrp')

### Discussion Question

This scatter plot shows that:

|Option|Answer|
|---|---|
|A.|People pay more for cars that accelerate faster|
|B.|People pay more for certain cars because they accelerate faster|
|C.|People do not pay more for cars that accelerate faster|
|D.|More than one of the above|

In [None]:
hybrid.scatter('acceleration', 'msrp')

### Fuel economy and price

* Is there an association?
* What kind of association?

In [None]:
hybrid.scatter('mpg', 'msrp')

### Observations
* There is an association:
    - Are people paying more for certain cars because they want poor fuel economy?
* The association looks more curved than linear, like $\sim\frac{1}{x}$

### Understanding units 
* A linear change in units doesn't change the shape of the plot.
* The scale *does* change with the units.

In [None]:
hybrid.scatter('mpg', 'msrp')

In [None]:
# Same data in different units. 1 mile per US gallon is about 0.425144 km per liter;
# 0.88 is presumably a USD-to-EUR exchange rate (TODO confirm rate and date).
# Both conversions are multiplications by a constant, i.e. linear changes of units.
hybrid.with_columns(
        'km_per_liter', hybrid.column('mpg') * 0.425144,
        'eur', hybrid.column('msrp') * 0.88 
).scatter('km_per_liter', 'eur')

### Converting columns to standard units
* makes different scatterplots comparable
* allows x and y axis to be "similarly scaled"
    - both axes measure standard deviations from their means
* doesn't change shape of the scatterplot (conversion is linear)

In [None]:
def standard_units(any_numbers):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as its distance from the array's mean,
    measured in multiples of the array's standard deviation (np.std).
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

In [None]:
def standardize(t):
    """Build a new table holding every column of t in standard units.

    Each output column is labelled with the source label plus ' (su)'.
    """
    converted = Table()
    for name in t.labels:
        su_values = standard_units(t.column(name))
        converted = converted.with_column(name + ' (su)', su_values)
    return converted

### Standard units: hybrid cars
* For a given pair of variables:
    - which cars are average from both perspectives?
    - which cars are both well above/below average?

In [None]:
hybrid_su = standardize(hybrid.select('msrp', 'acceleration','mpg'))
hybrid_su

In [None]:
hybrid_su.scatter('mpg (su)', 'msrp (su)')

In [None]:
(
    hybrid_su
    .with_column('vehicle', hybrid.column('vehicle'))
    .where('mpg (su)', are.between(-0.2, 0.2))
    .where('msrp (su)', are.between(-0.2, 0.2))
)

In [None]:
hybrid_su.scatter('acceleration (su)', 'msrp (su)')

In [None]:
# Row-by-row product of the first two standardized columns of hybrid_su.
su_product = hybrid_su.column(0) * hybrid_su.column(1)
product_table = hybrid_su.select(1, 0).with_column('product', su_product)
product_table

In [None]:
products = (product_table).column('product')
np.mean(products)

In [None]:
(
    hybrid_su
    .with_column('vehicle', hybrid.column('vehicle'))
    .where('acceleration (su)', are.above(2))
    .where('msrp (su)', are.above(2))
)

### Associations in standard units
* If two attributes are positively associated,
    - their high, positive values in standard units are typically seen together,
    - their low, negative values are seen together as well.
* If two attributes are negatively associated,
    - high, positive values of one are typically coupled with low, negative values of the other.
* If two attributes aren't associated, there shouldn't be a pattern in their relative sizes.

## Definition: Correlation Coefficient

**Definition**: The correlation coefficient $r$ of two attributes $x$ and $y$ is the average value of the product of $x$ and $y$ when measured in standard units.

* If `x` and `y` are arrays (i.e. columns in a table): 
```
r = np.mean(x_su * y_su)
```
where `x_su` and `y_su` are `x` and `y` converted to standard units.


### Calculate the correlation coefficient $r$ for `acceleration` and `msrp`

In [None]:
hybrid_su.scatter('acceleration (su)', 'msrp (su)')
plt.axhline(color='C2', zorder=0)
plt.axvline(color='C2', zorder=0)

In [None]:
(
    hybrid_su
    .select('acceleration (su)', 'msrp (su)')
    .with_column('product of su', hybrid_su.column('acceleration (su)') * hybrid_su.column('msrp (su)'))
)

In [None]:
r = np.mean(hybrid_su.column('acceleration (su)') * hybrid_su.column('msrp (su)'))
r

## The Correlation Coefficient $r$

* Measures how clustered points are around a straight line (linear association)
* Based on standard units
* $-1 \leq r \leq 1$
    - $r = 1$: scatterplot is a line of slope 1.
    - $r = -1$: scatterplot is a line of slope -1.
* $r = 0$: no linear association; *uncorrelated*.

### Calculate the correlation coefficient $r$ for `mpg` and `msrp`

In [None]:
hybrid_su.scatter('mpg (su)', 'msrp (su)')
plt.axhline(color='C2', zorder=0)
plt.axvline(color='C2', zorder=0)

In [None]:
(
    hybrid_su
    .select('mpg (su)', 'msrp (su)')
    .with_column('product of su', hybrid_su.column('mpg (su)') * hybrid_su.column('msrp (su)'))
)

In [None]:
r = np.mean(hybrid_su.column('mpg (su)') * hybrid_su.column('msrp (su)'))
r

## Scatterplots with given correlation coefficients

In [None]:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 random points on a new 5x5 figure, with both axes fixed to
    the range -4 to 4, and titles the figure with the requested r.

    r: target correlation. It must lie between -1 and 1; outside that range
       sqrt(1 - r**2) is not a real number and y becomes NaN.
    """
    # The docstring must be the first statement of the function; previously
    # it followed plt.figure(...) and was just an unused string expression.
    plt.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mixing x with independent noise z in these proportions gives y unit
    # variance and correlation r with x (in expectation).
    y = r*x + (np.sqrt(1-r**2))*z
    plt.scatter(x, y)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.suptitle('r = %f' %r, fontsize=14)

In [None]:
# Use a dedicated loop variable: looping with the name `r` would overwrite
# the notebook-level correlation coefficient `r` computed in earlier cells.
for target_r in np.linspace(1, -1, 7):
    r_scatter(target_r)

### Discussion Question
Does the following scatter plot show:

- A. Association and correlation
- B. Association but not correlation
- C. Correlation but not association
- D. Neither association nor correlation

In [None]:
x2 = Table().with_columns(
    'x', np.arange(-6, 6.1, 0.5), 
    'y', np.arange(-6, 6.1, 0.5)**2)
x2.scatter('x', 'y')

### Answer

In [None]:
# Standardize once and reuse the result instead of recomputing it three times.
x2_su = standardize(x2)
x2_su.with_column('product', x2_su.column('x (su)') * x2_su.column('y (su)')).show()

In [None]:
products = standard_units(x2.column('x')) * standard_units(x2.column('y'))
products

In [None]:
r = np.mean(products)
r

In [None]:
plt.hist(products);

## Child height prediction, revisited
* Calculate the correlation between `midparentHeight` and `childHeight`

In [None]:
heights_su = standardize(heights)
heights_su

In [None]:
r = np.mean(heights_su.column('MidParent (su)') * heights_su.column('Child (su)'))
r

In [None]:
heights_and_predict.scatter(0)

In [None]:
heights_su.scatter(0)
plt.plot(np.arange(-3, 3, 0.1), np.arange(-3, 3, 0.1) * r);

## Using the correlation coefficient for prediction
In standard units:
* The line through $(0,0)$ with slope $r$ is called the **regression line**.
* If the association between attributes is linear, the graph of averages is approximately the regression line.
* If the line is given by $f(x) = mx + b$, then the prediction for $x$ is given by $f(x)$.

In [None]:
heights_su.scatter(0)
plt.plot(np.arange(-3, 3, 0.1), np.arange(-3, 3, 0.1) * r);

## Regression to the Mean
![image.png](attachment:image.png)

* If $r = 0.33$, and the given $x$ is 2 standard units, then:
    - The given $x$ is 2 SDs above average
    - The prediction for $y$ is 0.66 SDs above average
* This is a consequence of $r$ having magnitude less than 1.
* On average (though not for each individual), regression predicts $y$ to be closer to the mean than $x$.

### Child height prediction, original units
Approach:
1. Scale mid-parent height to standard units
2. Use the correlation coefficient to predict child height
3. Scale predicted child height from standard units back to inches

In [None]:
# Means and standard deviations of both columns, used to convert between
# original units and standard units. np.std is called with its default settings.
parent_mean = np.mean(heights.column('MidParent'))
parent_sd = np.std(heights.column('MidParent'))
child_mean = np.mean(heights.column('Child'))
child_sd = np.std(heights.column('Child'))

In [None]:
def predict_with_r(parent):
    """Predict a child's height from a midparent height via the regression line.

    The midparent height is converted to standard units, multiplied by the
    correlation coefficient, and converted back to the child's original units.

    Reads the notebook-level names r, parent_mean, parent_sd, child_mean and
    child_sd at call time, so r must hold the MidParent/Child correlation
    when this function runs.
    """
    standardized_parent = (parent - parent_mean) / parent_sd
    standardized_child = r * standardized_parent
    return standardized_child * child_sd + child_mean

In [None]:
predict_with_r(72)

In [None]:
preds = heights_and_predict.with_column('Prediction_r', heights.apply(predict_with_r, 'MidParent'))
preds.scatter(0)

### Discussion Question

A course has a midterm (average 70; standard deviation 10) and a really hard final (average 50; standard deviation 12)

If the scatter diagram comparing midterm & final scores for students looks linearly associated with correlation 0.75, then what is the predicted final exam score for a student who received a 90 on the midterm?

- A. 76
- B. 90
- C. 68
- D. 82
- E. 67.5

### Answer

1. (90 - 70)/10 = 2 standard units on midterm, 
2. estimate 0.75 * 2 = 1.5 standard units on final 
3. estimated final score = 1.5 * 12 + 50 = 68 points

## Slope and Intercept

* What does the regression line look like in original units?

## Regression line equation

In original units, the regression line has this equation:

![image.png](attachment:image.png)


## Regression line equation:

* In standard units:
$$ y = r \times x $$
* In original units, 
    - where $m_x$, $m_y$ are the averages of $x$ and $y$
    - where $s_x$, $s_y$ are the standard deviations of $x$ and $y$,
$$\frac{(y - m_y)}{s_y} = r \times \frac{(x - m_x)}{s_x}$$
* This equation reworked into the point-slope form of a line:
$$(y - m_y) = \left(\frac{r\cdot s_y}{s_x}\right)(x - m_x)$$

* Or in the slope-intercept form:

$$y = \left(\frac{r\cdot s_y}{s_x}\right)x + \left(m_y - \frac{r\cdot s_y\cdot m_x}{s_x}\right)$$

## Slope and intercept

* The regression line is given by $y = mx + b$, where:
    - the slope $m$ is: $$m = r\cdot\frac{SD\ of\ y}{SD\ of\ x}$$
    - the y-intercept $b$ is: $$b = (avg\ of\ y) - m\cdot(avg\ of\ x)$$


## The regression line
![image.png](attachment:image.png)

### Predict the retail price of a hybrid with acceleration of 12.5
* For each additional unit of acceleration (kph/s), how does the price change?

In [None]:
def correlation(t, x, y):
    """Return the correlation coefficient of columns x and y of table t.

    Computed as the mean of the element-wise product of the two columns
    after each has been converted to standard units.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.mean(x_su * y_su)

def slope(t, x, y):
    """Slope of the regression line for predicting y from x, in original units.

    Equal to the correlation of the two columns times (SD of y) / (SD of x).
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Intercept of the regression line for predicting y from x, in original units.

    Equal to (mean of y) minus the regression slope times (mean of x).
    """
    mean_x = np.mean(t.column(x))
    mean_y = np.mean(t.column(y))
    return mean_y - slope(t, x, y) * mean_x


In [None]:
# Fit the regression line for predicting msrp from acceleration.
m = slope(hybrid, 'acceleration', 'msrp')
b = intercept(hybrid, 'acceleration', 'msrp')

# Predicted retail price at an acceleration of 12.5, the value named in the
# section heading (the cell previously evaluated the line at 18.5).
y = m * 12.5 + b
y

In [None]:
hybrid.scatter('acceleration', 'msrp')
x = np.arange(7,22)
plt.plot(x, m * x + b);

In [None]:
m

### Do people choose people of similar heights as mates?
 - "Are the heights of mothers/fathers linearly associated?"
 - Compute the correlation and regression line

In [None]:
# Re-reads the same file that an earlier cell loaded into `galton`.
galton = Table.read_table('galton.csv')

# Regression line for predicting 'mother' from 'father'.
# NOTE: this rebinds the notebook-level m and b used by earlier cells.
m = slope(galton, 'father', 'mother')
b = intercept(galton, 'father', 'mother')

galton.scatter('father', 'mother')
x = np.arange(60, 80)
plt.plot(x, m*x+b);

# correlation() averages a product of the two standardized columns, so the
# order in which the two labels are passed does not affect the result.
correlation(galton, 'mother', 'father')

In [None]:
# use `fit_line=True` instead
galton.scatter('father', 'mother', fit_line=True)

## The effect of outliers on correlation
What is the correlation coefficient of $x$ and $y$ below? Note the outlier in the lower right corner.
* A) $r>0$
* B) $r=0$
* C) $r<0$

In [None]:
outlier = Table.read_table('outlier.csv')
outlier.scatter(0)

In [None]:
outlier.scatter(0, fit_line=True)
correlation(outlier, 'x', 'y')

In [None]:
# Keep only rows whose y value is above 40 (presumably this removes the
# single outlier -- verify against outlier.csv).
without_outlier = outlier.where('y', are.above(40))
without_outlier.scatter(0, fit_line=True)
# Last expression of the cell: correlation of x and y on the filtered table.
correlation(without_outlier, 'x', 'y')

## Measuring the error in prediction

How well does an arbitrary line describe the data?

In [None]:
def plot_errors(m, b, t):
    """Plot table t together with the line y = m*x + b and its errors.

    Draws a scatter plot of t, the line evaluated at every 'x' value, and a
    red vertical segment from each point on the line to the matching 'y'
    value. The figure title shows the line's equation.
    """
    xs = t.column('x')
    line_ys = m*xs + b
    t.scatter(0)
    plt.plot(xs, line_ys)
    actual_ys = t.column('y')
    # One vertical red segment per row: from the line's value to the actual value.
    for x_k, predicted_k, actual_k in zip(xs.tolist(), line_ys.tolist(), actual_ys.tolist()):
        plt.plot([x_k, x_k], [predicted_k, actual_k], c='r', linewidth=2)

    plt.suptitle('y = %.2f * x + %.2f' %(m, b), fontsize=18)

In [None]:
m=0.62
b=30
plot_errors(m, b, without_outlier)

## Measuring the error in estimation

* error = actual value - prediction
* Typically, some errors are positive and some negative
    - What does a positive error mean? negative?

* To measure the rough size of the errors
    - square the errors to eliminate cancellation
    - take the mean of the squared errors
    - take the square root to fix the units
    - root mean square error (RMSE)

## Calculate the root mean square error (RMSE)

In [None]:
# Predictions of the line y = m*x + b for every x in the table.
preds = without_outlier.with_column('pred', m * without_outlier.column('x') + b)
# error = actual value - prediction, the sign convention defined in the notes
# (the column was previously prediction - actual; the squares are unaffected).
preds = preds.with_column('diffs', preds.column('y') - preds.column('pred'))
preds = preds.with_column('sq_diffs', preds.column('diffs')**2)
preds

In [None]:
np.sqrt(np.mean(preds.column('sq_diffs')))

## Calculate the root mean square error (RMSE)

In [None]:
def rmse(pred, true):
    '''Calculate the root mean square error (RMSE) of two sequences.

    pred: the predicted values (an array or any array-like, e.g. a list)
    true: the true values of the predicted attribute, same length as pred

    Returns the square root of the mean of the squared differences.
    '''
    # Convert up front so plain Python lists work too; arrays pass through unchanged.
    pred = np.asarray(pred)
    true = np.asarray(true)
    return np.sqrt(np.mean((pred - true)**2))

## The error of linear predictors

* What is the best linear prediction function among all possible lines?
* Minimize the RMSE

In [None]:
def plot_errors_multi(m, b, t, ax):
    """Draw table t, the line y = m*x + b and its errors on the axes ax.

    Plots the ('x', 'y') points of t, the line evaluated at every 'x' value,
    and a red vertical segment from each point on the line to the matching
    'y' value. The axes title shows the line's equation and its RMSE.
    """
    xs = t.column('x')
    actual_ys = t.column('y')
    line_ys = m*xs + b
    ax.scatter(xs, actual_ys)
    ax.plot(xs, line_ys)
    # One vertical red segment per row: from the line's value to the actual value.
    for x_k, predicted_k, actual_k in zip(xs.tolist(), line_ys.tolist(), actual_ys.tolist()):
        ax.plot([x_k, x_k], [predicted_k, actual_k], c='r', linewidth=2)

    line_rmse = rmse(line_ys, actual_ys)
    ax.set_title('y = %.2f * x + %.2f; rmse %f' %(m, b, line_rmse))
    

In [None]:
# np.linspace fixes the number of slopes explicitly. np.arange with a float
# step can return one element more or fewer than expected, which would
# overflow a hard-coded subplot grid.
candidate_slopes = np.linspace(0.2, 0.5, 4)
candidate_intercepts = np.arange(30, 40, 5)

# One row of panels per slope, one column per intercept; the grid is sized
# from the candidates, and the loop names leave the notebook-level m and b alone.
fig, axes = plt.subplots(len(candidate_slopes), len(candidate_intercepts), figsize=(12,16))
for row, slope_value in enumerate(candidate_slopes):
    for col, intercept_value in enumerate(candidate_intercepts):
        plot_errors_multi(slope_value, intercept_value, without_outlier, ax=axes[row, col])

## Finding the best linear prediction function

Approach

1. Enumerate a large number of reasonable lines (i.e. pairs of slopes/intercepts)
2. Calculate the RMSE of each linear predictor
3. Take the slope/intercept pair with the smallest RMSE.

In [None]:
# Accumulate results in Python lists and convert once at the end: np.append
# returns a new copy of the whole array on every call, which is wasteful
# inside a loop with tens of thousands of iterations.
error_list, slope_list, intercept_list = [], [], []

# Look the columns up once rather than on every iteration.
x_values = without_outlier.column('x')
y_values = without_outlier.column('y')

for m in np.arange(-1, 1, 0.01):
    for b in np.arange(-50, 50, 0.5):
        pred = m * x_values + b
        error_list.append(rmse(pred, y_values))
        slope_list.append(m)
        intercept_list.append(b)

# Same three arrays as before, in the same (slope-major) order.
errors = np.array(error_list)
slopes = np.array(slope_list)
intercepts = np.array(intercept_list)

In [None]:
# smallest
errors.min()

In [None]:
#slope corresponding to smallest error
m = slopes.item(errors.argmin())
m

In [None]:
#intercept corresponding to smallest error
b = intercepts.item(errors.argmin())
b

In [None]:
# slope/intercept of the regression line
slope(without_outlier, 'x', 'y'), intercept(without_outlier, 'x', 'y')

In [None]:
x = without_outlier.column('x')
without_outlier.scatter(0, fit_line=True)
plt.plot(x, m*x + b, linewidth=2)
plt.legend(['regression line', 'best fit']);

In [None]:
# One point per candidate line, positioned by (slope, intercept) and colored
# by its RMSE: shifted so the smallest error maps to 0, then divided by the
# largest error.
plt.scatter(slopes, intercepts, c=(errors - errors.min())/errors.max())
plt.xlabel('slopes')
plt.ylabel('intercepts')
plt.suptitle('Scatterplot of lines, colored by errors');

## Least squares line

* Minimizes the root mean squared error (rmse) among all lines
* Coincides with the regression line!
    - Regression line defined using statistical quantities
    - Line of "best fit" defined using algebra/calculus
* All equivalent names:
    - Line of “best fit”
    - Least squares line
    - Regression line

## Regression line

* Describes the "best linear fit" of a given dataset.
* Describes the linear association of two attributes, given that the data are well described by a linear relationship!
* How do we know a linear fit is a good fit?