In [None]:
#: the usual imports
import babypandas as bpd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

plt.style.use('fivethirtyeight')

#:code for demonstration
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points where x and z are independent standard-normal samples
    and y = r*x + sqrt(1 - r**2)*z, so that y has roughly unit variance and
    correlation r with x.

    r is expected to lie in [-1, 1]; outside that range 1 - r**2 is negative
    and np.sqrt produces NaN, so no meaningful points are drawn.

    Opens a new 5x5 figure on every call and returns nothing.
    """
    plt.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mixing x with independent noise z in these proportions gives corr(x, y) = r.
    y = r*x + (np.sqrt(1-r**2))*z
    plt.scatter(x, y)
    # Fixed axis limits keep successive plots visually comparable.
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.suptitle('r = %f' %r, fontsize=14)

# Lecture 25

## Prediction and Correlation

# Prediction

## Prediction Problems

* Predicting one characteristic based on another:
    - Given my height, how tall will my kid be as an adult?
    - Given my education level, what is my income?
    - Given my income, how much does my car cost?
    
* Two characteristics: one is known; one is unknown
* Have data for which we know both characteristics
* To predict, need an association

### Predict child height from height of parents

* Use mid-parent height as before.
* Observation: Children of shorter parents tend to be shorter

In [None]:
# Load the Galton heights table and plot child height against mid-parent height.
galton = bpd.read_csv('data/galton.csv')
galton.plot(kind='scatter', x='midparentHeight', y='childHeight')

### Predict child height from height of parents
* For a given child, predict their height by:
    - restricting ourselves to all midparents within $\pm 0.5$ inches of the child's midparent height;
    - averaging the height of all children of those midparents.

In [None]:
def predict_child(parent, window=0.5):
    """Return a prediction of the height of a child
    whose parents have a midparent height of parent.

    The prediction is the mean childHeight over all rows of the notebook-level
    galton table whose midparentHeight lies within `window` (inclusive on both
    ends) of `parent`.

    parent: midparent height to predict for.
    window: half-width of the neighbourhood around `parent`; defaults to 0.5,
        the value used throughout the lecture.

    NOTE(review): if no row falls inside the window the mean is taken over an
    empty selection -- presumably NaN; confirm if that case matters.
    """
    # Look the column up once instead of once per comparison.
    midparents = galton.get('midparentHeight')
    close_points = galton[
        (midparents <= parent + window)
        &
        (midparents >= parent - window)
        ]
    return close_points.get('childHeight').mean()

# Add a 'Prediction' column: predict_child applied to every midparentHeight value.
with_predictions = galton.assign(
    Prediction=galton.get('midparentHeight').apply(predict_child)
)

In [None]:
# Draw the raw data, then overlay the predictions (color 'C2') on the same Axes.
ax = with_predictions.plot(kind='scatter', x='midparentHeight', y='childHeight')
with_predictions.plot(kind='scatter', x='midparentHeight', y='Prediction', ax=ax, color='C2')

## Graph of Averages
A visualization of x and y pairs
* Group each x value with other nearby x values
* Average the corresponding y values for each group
* For each x value, produce one predicted y value

## Relations between two variables
* Association
    - Positive association
    - Negative association
* Pattern
    - Any discernible "shape"
    - Linear
    - Non-Linear

## Variable relationships: hybrid cars

In [None]:
# Load the hybrid-car table and display it.
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid

### Acceleration and price
* Is there an association?
* What kind of association?

In [None]:
# Scatter plot of msrp (y) against acceleration (x).
hybrid.plot(kind='scatter', x='acceleration', y='msrp')

### Discussion Question

This scatter plot suggests that people are generally:

|Option|Answer|
|---|---|
|A.|Willing to pay more for cars that accelerate faster|
|B.|Willing to pay more for certain cars because they accelerate faster|
|C.|Not willing to pay more for cars that accelerate faster|
|D.|More than one of the above|

In [None]:
# Same acceleration-vs-msrp scatter plot, repeated next to the discussion question.
hybrid.plot(kind='scatter', x='acceleration', y='msrp')

### Fuel economy and price

* Is there an association?
* What kind of association?

In [None]:
# Scatter plot of msrp (y) against mpg (x).
hybrid.plot(kind='scatter', x='mpg', y='msrp')

### Observations
* There is an association:
    - Are people willing to pay more for certain cars because they want poor fuel economy?
* The association looks more curved than linear, perhaps like $y \sim \frac{1}{x}$.

### Understanding units 
* A linear change in units doesn't change the shape of the plot.
* The scale *does* change with the units.

In [None]:
# Re-plot mpg vs msrp after a linear change of units on both axes.
# 0.425144 converts miles per gallon to km per litre (assuming US gallons).
# 0.84 is presumably a USD -> EUR exchange rate fixed when the lecture was
# written, with msrp assumed to be in USD -- TODO confirm.
hybrid.assign(
        km_per_liter=hybrid.get('mpg') * 0.425144,
        eur=hybrid.get('msrp') * 0.84 
).plot(kind='scatter', x='km_per_liter', y='eur')

In [None]:
# Original-unit mpg-vs-msrp plot, for comparison with the converted-unit plot.
hybrid.plot(kind='scatter', x='mpg', y='msrp')

### Converting columns to standard units
* makes different scatterplots comparable
* allows x and y axis to be "similarly scaled"
    - both axes measure standard deviations from their means
* doesn't change shape of the scatterplot (conversion is linear)

In [None]:
def standard_units(any_numbers):
    """Convert a sequence of numbers to standard units.

    Each value is expressed as its distance from the mean of the input,
    measured in multiples of the input's standard deviation (np.std with its
    default arguments). Returns a numpy array of the same length.
    """
    values = np.array(any_numbers)
    deviations = values - np.mean(values)
    return deviations / np.std(values)

In [None]:
def standardize(t):
    """Return a new table holding every column of t in standard units.

    For each column name in t.columns, the output has a column called
    '<name> (su)' containing standard_units of that column. Columns appear in
    the same order as in t; t itself is not modified.
    """
    converted = {
        name + ' (su)': standard_units(t.get(name))
        for name in t.columns
    }
    return bpd.DataFrame().assign(**converted)

### Standard units: hybrid cars
* For a given pair of variables:
    - which cars are average from both perspectives?
    - which cars are both well above/below average?

In [None]:
# Standardize the three numeric columns of interest; columns gain a ' (su)' suffix.
hybrid_su = standardize(hybrid.get(['msrp', 'acceleration','mpg']))
hybrid_su

In [None]:
# mpg vs msrp, both in standard units.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='msrp (su)')

In [None]:
# Cars within 0.2 standard units of average on both mpg and msrp.
# The vehicle column is attached only for display; the boolean mask is built
# from hybrid_su, whose rows line up with the frame being filtered.
(
    hybrid_su
    .assign(vehicle=hybrid.get('vehicle'))
)[
    (hybrid_su.get('mpg (su)') <= 0.2)
    &
    (hybrid_su.get('mpg (su)') >= -0.2)
    &
    (hybrid_su.get('msrp (su)') <= 0.2)
    &
    (hybrid_su.get('msrp (su)') >= -0.2)
]

In [None]:
# acceleration vs msrp, both in standard units.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='msrp (su)')

In [None]:
# Cars more than 2 standard units above average in both acceleration and msrp.
# The vehicle column is attached only for display; the mask comes from hybrid_su.
(
    hybrid_su
    .assign(vehicle=hybrid.get('vehicle'))
)[
    (hybrid_su.get('acceleration (su)') > 2)
    &
    (hybrid_su.get('msrp (su)') > 2)
]

### Observation on associations in standard units
* If two attributes are positively associated,
    - their high, positive values in standard units are typically seen together,
    - their low, negative values are seen together as well.
* If two attributes are negatively associated,
    - high, positive values of one are typically coupled with low, negative values of the other.
* If two attributes aren't associated, there shouldn't be a pattern in their relative sizes.

In [None]:
# mpg vs msrp in standard units, with lines at 0 on both axes (the means)
# splitting the plot into four quadrants.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='msrp (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');

In [None]:
# acceleration vs msrp in standard units, with lines at 0 on both axes (the
# means) splitting the plot into four quadrants.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='msrp (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');

## Definition: Correlation Coefficient

**Definition**: The correlation coefficient $r$ of two attributes $x$ and $y$ is the average value of the product of $x$ and $y$ when measured in standard units.

* If `x` and `y` are arrays (i.e. columns in a table): 
```
r = np.mean(x_su * y_su)
```
where `x_su` and `y_su` are `x` and `y` converted to standard units.


### Calculate the $r$ for `acceleration` and `msrp`

In [None]:
# Show the two standardized columns next to their row-wise product; the mean
# of the 'Product' column is the correlation coefficient.
(
    hybrid_su
    .get(['acceleration (su)', 'msrp (su)'])
    .assign(
        Product=hybrid_su.get('acceleration (su)') * hybrid_su.get('msrp (su)')
    )
)

In [None]:
# Correlation of acceleration and msrp: mean of the products in standard units.
# Note: the name `r` is reassigned by later cells in this notebook.
r = (hybrid_su.get('acceleration (su)') * hybrid_su.get('msrp (su)')).mean()
r

In [None]:
# Standard-unit acceleration-vs-msrp plot again, shown next to the computed r.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='msrp (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');

## The Correlation Coefficient $r$

* Measures how clustered points are around a straight line (linear association)
* Based on standard units
* $-1 \leq r \leq 1$
    - $r = 1$: scatterplot is a line of slope 1.
    - $r = -1$: scatterplot is a line of slope -1.
* $r = 0$: no linear association; *uncorrelated*.

### Calculate the $r$ for `mpg` and `msrp`

In [None]:
# Show the two standardized columns next to their row-wise product; the mean
# of the 'Product' column is the correlation coefficient.
(
    hybrid_su
    .get(['mpg (su)', 'msrp (su)'])
    .assign(
        Product=hybrid_su.get('mpg (su)') * hybrid_su.get('msrp (su)')
    )
)

In [None]:
#:
# Correlation of mpg and msrp: mean of the products in standard units.
# This overwrites the `r` computed for acceleration and msrp.
r = (hybrid_su.get('mpg (su)') * hybrid_su.get('msrp (su)')).mean()
r

In [None]:
# Standard-unit mpg-vs-msrp plot again, shown next to the computed r.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='msrp (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');

## Scatterplots with given correlation coefficients

In [None]:
# One demo plot for each of 7 evenly spaced correlations from 1 down to -1.
# A dedicated loop variable is used so that the notebook-level `r` (the
# correlation coefficient read by predict_with_r and the regression-line
# plots) is not silently overwritten when this cell is run.
for target_r in np.linspace(1, -1, 7):
    r_scatter(target_r)

### Discussion Question
Does the following scatter plot show:

- A. Association and correlation
- B. Association but not correlation
- C. Correlation but not association
- D. Neither association nor correlation

In [None]:
# A perfect parabola y = x**2 on a grid symmetric about 0.
# The stop value 6.1 makes np.arange include x = 6.0.
x2 = bpd.DataFrame().assign(
    x=np.arange(-6, 6.1, 0.5), 
    y=np.arange(-6, 6.1, 0.5)**2
)
x2.plot(kind='scatter', x='x', y='y')

### Answer

In [None]:
# Row-wise products of x and y in standard units; their mean is the correlation.
products = standard_units(x2.get('x')) * standard_units(x2.get('y'))
products

In [None]:
# The correlation coefficient of x and y for the parabola data.
np.mean(products)

In [None]:
# Histogram of the products with unit-width bins whose edges run -3.5, -2.5, ..., 3.5.
plt.hist(products, bins=np.arange(-3.5, 3.6));

## Child height prediction, revisited
* Calculate the correlation between `midparentHeight` and `childHeight`

In [None]:
# Mid-parent and child heights converted to standard units.
heights_su = standardize(galton.get(['midparentHeight', 'childHeight']))
heights_su

In [None]:
# Correlation of mid-parent and child height: mean of the products in standard units.
# From here on `r` holds this value; the regression-line plots and
# predict_with_r below read it as a notebook-level variable.
r = (heights_su.get('midparentHeight (su)') * heights_su.get('childHeight (su)')).mean()
r

In [None]:
# Raw data with the neighbourhood-average predictions (color 'C2') on the same Axes.
ax = with_predictions.plot(kind='scatter', x='midparentHeight', y='childHeight')
with_predictions.plot(kind='scatter', x='midparentHeight', y='Prediction', ax=ax, color='C2')

In [None]:
# Heights in standard units, plus the line through the origin with slope r,
# drawn for x from -3 up to (not including) 3 in steps of 0.1.
heights_su.plot(kind='scatter', x='midparentHeight (su)', y='childHeight (su)')
plt.plot(np.arange(-3, 3, 0.1), np.arange(-3, 3, 0.1) * r, color='C2');

## Using the correlation coefficient for prediction
In standard units:
* The line through $(0,0)$ with slope $r$ is called the regression line.
* If the association between attributes is linear, the graph of averages is approximately the regression line.
* If the line is given by $f(x) = mx + b$, then the prediction for $x$ is given by $f(x)$.

In [None]:
# Same standard-unit plot with the slope-r line through the origin, repeated
# next to the definition of the regression line.
heights_su.plot(kind='scatter', x='midparentHeight (su)', y='childHeight (su)')
plt.plot(np.arange(-3, 3, 0.1), np.arange(-3, 3, 0.1) * r, color='C2');

### Regression to the mean

* The regression line predicts:
    - Parents whose mid-parent height is ~2 su have children with height ~0.6 su.
    - We predict that the child will be somewhat closer to average than their parents.
* This is a consequence of the slope having magnitude less than 1.

## Regression to the Mean
![image.png](attachment:image.png)

* If $r = 0.6$, and the given $x$ is 2 standard units, then:
    - The given $x$ is 2 SDs above average
    - The prediction for $y$ is 1.2 SDs above average

* On average (though not for each individual), regression predicts $y$ to be closer to the mean than $x$.

### Child height prediction, original units
Approach:
1. Scale mid-parent height to standard units
2. Use the correlation coefficient to predict child height
3. Scale predicted child height from standard units back to inches

In [None]:
#:
# Means and standard deviations needed to move between inches and standard
# units; predict_with_r reads these as notebook-level variables.
# NOTE(review): np.std is called with its default arguments, the same call
# standard_units uses -- presumably the population SD; confirm.
parent_mean = galton.get('midparentHeight').mean()
parent_sd = np.std(galton.get('midparentHeight'))
child_mean = galton.get('childHeight').mean()
child_sd = np.std(galton.get('childHeight'))

In [None]:
#:
def predict_with_r(parent):
    """Predict a child's height from a midparent height using the regression line.

    The midparent height is converted to standard units, multiplied by the
    correlation coefficient, and the result is converted back to the units of
    childHeight.

    Relies on the notebook-level variables r, parent_mean, parent_sd,
    child_mean and child_sd holding the values for the galton data at the
    time of the call.
    """
    standardized_parent = (parent - parent_mean) / parent_sd
    # In standard units the regression line passes through the origin with slope r.
    return child_mean + child_sd * (r * standardized_parent)

In [None]:
# Example: regression prediction for a midparent height of 56.
predict_with_r(56)

In [None]:
# Add the regression predictions as 'Prediction_r', then draw the raw data,
# the neighbourhood-average predictions ('C2') and the regression predictions
# ('C3') on one Axes for comparison.
preds = with_predictions.assign(
    Prediction_r=galton.get('midparentHeight').apply(predict_with_r)
)
ax = preds.plot(kind='scatter', x='midparentHeight', y='childHeight')
preds.plot(kind='scatter', x='midparentHeight', y='Prediction', ax=ax, color='C2')
preds.plot(kind='scatter', x='midparentHeight', y='Prediction_r', ax=ax, color='C3')

### Discussion Question

A course has a midterm (average 70; standard deviation 10) and a really hard final (average 50; standard deviation 12).

If the scatter diagram comparing midterm & final scores for students looks linearly associated with correlation 0.75, then what is the predicted final exam score for a student who received a 90 on the midterm?

- A. 76
- B. 90
- C. 68
- D. 82
- E. 67.5

### Answer

1. (90 - 70)/10 = 2 standard units on the midterm
2. predicted final in standard units: 0.75 * 2 = 1.5
3. predicted final score = 1.5 * 12 + 50 = 68 points (option C)