In [None]:
#: the usual imports
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')
plt.style.use('fivethirtyeight')

# Lecture 19

### Experiment Design and Correlation

# Experiment Design

## Example: Polling

- You are conducting a campus poll:
    - Yes/No: Does Eleanor Roosevelt College exist?
- You want to estimate the proportion in population who believe "Yes".
- Need to be accurate to within $\pm$ 3%, 95% of the time.
- i.e., your 95% confidence interval should be 6% wide.
- How big of a sample do you need?

## Proportions are just means

- When polling, we'll write 1 if a person answers "Yes", 0 otherwise.
- The sample proportion answering "Yes" = mean of the 0's and 1's.
- I.e., it is the mean of a random sample.
- Hence the CLT applies.

## Our strategy

1. We will poll a random sample of $n$ people
2. Compute sample mean (i.e., the proportion answering "Yes")
3. Compute sample standard deviation
4. Construct 95% confidence interval:
$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{n}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{n}}
\right]
$$

## Our strategy

- We want a CI whose width is 6% or less.
- The width of our confidence interval depends on two things:
    - sample SD
    - $\sqrt{n}$
- If we know SD, we can pick $n$ to make CI the right size.
- But before polling, we don't know the sample SD (we don't have a sample!)

## Bounding the sample SD

- How big could the sample SD be?
- This is a simple calculation, but...
- Let's try to get the answer from a plot.

## Create a population

In [None]:
#: construct an array of zeros/ones
def polling_population(size, number_of_ones):
    """Build a 0/1 population array of length `size`.

    The first `number_of_ones` entries are 1 and every remaining entry is 0.
    """
    # True for the leading positions that should hold a 1, False elsewhere.
    is_one = np.arange(size) < number_of_ones
    return is_one.astype(int)

In [None]:
polling_population(10, 3)

## Compute standard deviations

In [None]:
#: compute SD for each proportion
# Number of people in each simulated population.
size = 100
# NOTE: despite the name, these are *counts* of ones (0, 10, ..., 100); each is
# passed to polling_population as number_of_ones below. Because size is 100,
# each count can also be read as a percentage.
proportions = np.arange(0, size+1, 10)
# Collects one standard deviation per entry of `proportions`, in the same order.
sds = make_array()

for proportion in proportions:
    # 0/1 population containing `proportion` ones out of `size` entries.
    population = polling_population(size, proportion)
    sd = np.std(population)
    sds = np.append(sds, sd)

In [None]:
Table().with_columns(
    'Proportion', proportions,
    'SD', sds
).scatter('Proportion')

## Bounding the sample SD

- The most the sample SD can be is 0.5.
- Doesn't depend on size of sample.
- True for the population, too!
- Using 0.5 in place of SD results in a conservative confidence interval.

## Constructing the 95% confidence interval

- The interval is:

$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{n}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{n}}
\right]
$$

- The width is:

$$
4 \cdot \frac{\text{sample SD}}{\sqrt{n}}
$$

- Use 0.5 instead of sample SD:

$$
4 \cdot \frac{0.5}{\sqrt{n}} = \frac{2}{\sqrt{n}}
$$

## Constructing the 95% confidence interval

- Upper bound on CI width: $2 / \sqrt{n}$
- Want our sample mean to be within $\pm$ 3%.
- I.e., want our CI to be smaller than 6% = .06.
- Solve for $n$:

$$
\sqrt{n} = \frac{2}{0.06}
$$

In [None]:
(2/.06)**2

## Answer

- We must poll $\approx$ 1111 people to get a 95% CI which is 6% wide.

## Discussion question

We must poll 1111 people to get a 95% CI which is 6% wide.
Suppose we instead want a CI that is 3% wide. How many people do we need to poll?

- A) 555
- B) 2222
- C) 4444
- D) 8888

## Answer: the $\sqrt{\quad}$ rule

- We need 4444 samples.
- To double our accuracy, we must *quadruple* our sample size.

## Experiment

- Let's see how accurate this is.
- The code below chooses some true population proportion answering "Yes". Don't peek!

In [None]:
#: don't peek
# Pick the hidden "true" proportion uniformly at random from 0.30, 0.31, ..., 0.69.
# An integer range divided by 100 is used instead of np.arange(.3, .7, .01):
# arange with a non-integer step accumulates rounding error, so its values
# (and potentially its length) are not exactly the intended hundredths.
population_proportion = np.random.choice(np.arange(30, 70) / 100)

## Conduct a poll

In [None]:
#:
poll_results = sample_proportions(1111, [population_proportion, 1-population_proportion])
poll_results

## Repeat the poll

In [None]:
#: simulate 5000 polls
# Collects one sampled proportion per simulated poll.
distribution = make_array()

for i in np.arange(5000):
    # Each simulated poll uses 1111 respondents, the sample size computed above
    # from (2/.06)**2. The two probabilities are the chance of the first
    # category (population_proportion) and of its complement.
    # NOTE(review): sample_proportions comes from the datascience package;
    # presumably it returns the sampled proportion of each category in the
    # order given - confirm against the datascience documentation.
    poll_results = sample_proportions(1111, [population_proportion, 1-population_proportion])
    # Keep only the first entry, i.e. the one matching population_proportion.
    distribution = np.append(distribution, poll_results.item(0))

## Visualize

In [None]:
#: draw histogram
Table().with_column('Sampled Proportions', distribution).hist(bins=20)

# Mark the true population proportion with a triangle on the x-axis.
plt.scatter(population_proportion, 0, marker='^', zorder=3, s=100)
# Draw a bar covering the +/- 3% window around the true proportion.
# The trailing semicolon suppresses the Line2D repr, matching the other
# plotting cells in this notebook.
plt.plot([population_proportion - .03, population_proportion + .03], [0, 0], color='C1', linewidth=5);

In [None]:
# Fraction of simulated polls whose sampled proportion landed within +/- 3%
# of the true population proportion. Dividing by len(distribution) rather than
# a hard-coded 5000 keeps the fraction correct if the number of simulated
# polls is ever changed.
np.count_nonzero((population_proportion - .03 <= distribution) & (distribution <= population_proportion + .03)) / len(distribution)

In [None]:
population_proportion

# Correlation 

## Relations between two variables
* Association
* Trend
    - Positive association
    - Negative association
* Pattern
    - Any discernible "shape"
    - Linear
    - Non-Linear

## Variable relationships: hybrid cars

In [None]:
#:
hybrid = Table.read_table('hybrid.csv')
hybrid

### Acceleration and price
* Is there an association?
* What kind of association?

In [None]:
hybrid.scatter('acceleration', 'msrp')

### Discussion Question

This scatter plot shows that people are generally:

|Option|Answer|
|---|---|
|A.|Willing to pay more for cars that accelerate faster|
|B.|Willing to pay more for certain cars because they accelerate faster|
|C.|Not willing to pay more for cars that accelerate faster|
|D.|More than one of the above|

In [None]:
#:
hybrid.scatter('acceleration', 'msrp')

### Fuel economy and price

* Is there an association?
* What kind of association?

In [None]:
hybrid.scatter('mpg', 'msrp')

### Observations
* There is an association:
    - Are people more willing to pay less for certain cars because they want poor fuel economy?
* The association looks more curved than linear, like $\sim \frac{1}{x}$

### Understanding units 
* A linear change in units doesn't change the shape of the plot.
* The scale *does* change with the units.

In [None]:
hybrid.scatter('mpg', 'msrp')

In [None]:
# Re-plot the same data after a linear change of units on both axes.
# 0.425144 converts miles per (US) gallon to kilometers per liter.
# NOTE(review): 0.88 is presumably a USD -> EUR exchange rate from when the
# notebook was written - confirm or update before reuse.
hybrid.with_columns(
        'km_per_liter', hybrid.column('mpg') * 0.425144,
        'eur', hybrid.column('msrp') * 0.88 
).scatter('km_per_liter', 'eur')

### Converting columns to standard units
* makes different scatterplots comparable
* allows x and y axis to be "similarly scaled"
    - both axes measure standard deviations from their means
* doesn't change shape of the scatterplot (conversion is linear)

In [None]:
def standard_units(any_numbers):
    """Convert a collection of numbers to standard units.

    Each value is re-expressed as the number of standard deviations
    (as computed by np.std) that it lies above or below the mean of
    `any_numbers`.
    """
    center = np.mean(any_numbers)
    spread = np.std(any_numbers)
    return (any_numbers - center) / spread

In [None]:
def standardize(t):
    """Build a new table holding every column of t converted to standard units.

    Each output column keeps the order of t's columns and is labeled with the
    original label followed by ' (su)'.
    """
    converted = Table()
    for original_label in t.labels:
        su_label = original_label + ' (su)'
        su_values = standard_units(t.column(original_label))
        converted = converted.with_column(su_label, su_values)
    return converted

### Standard units: hybrid cars
* For a given pair of variables:
    - which cars are average from both perspectives?
    - which cars are both well above/below average?

In [None]:
hybrid_su = standardize(hybrid.select('msrp', 'acceleration','mpg'))
hybrid_su

In [None]:
hybrid_su.scatter('mpg (su)', 'msrp (su)')
plt.xlim(-3, 3)
plt.ylim(-3, 3);

In [None]:
(
    hybrid_su
    .with_column('vehicle', hybrid.column('vehicle'))
    .where('mpg (su)', are.between(-0.2, 0.2))
    .where('msrp (su)', are.between(-0.2, 0.2))
)

In [None]:
hybrid_su.scatter('acceleration (su)', 'msrp (su)')
plt.xlim(-3, 3)
plt.ylim(-3, 3);

In [None]:
(
    hybrid_su
    .with_column('vehicle', hybrid.column('vehicle'))
    .where('acceleration (su)', are.above(2))
    .where('msrp (su)', are.above(2))
)

### Observation on associations in standard units
* If two attributes are positively associated,
    - their high, positive values in standard units are typically seen together,
    - their low, negative values are seen together as well.
* If two attributes are negatively associated,
    - high, positive values of one are typically coupled with low, negative values of the other.
* If two attributes aren't associated, there shouldn't be a pattern in their relative sizes.

## Definition: Correlation Coefficient

**Definition**: The correlation coefficient $r$ of two attributes $x$ and $y$ is the average value of the product of $x$ and $y$ when measured in standard units.

* If `x` and `y` are arrays (i.e. columns in a table): 
```
r = np.mean(x_su * y_su)
```
where `x_su` and `y_su` are `x` and `y` converted to standard units.


### Calculate the $r$ for `acceleration` and `msrp`

In [None]:
hybrid_su.scatter('acceleration (su)', 'msrp (su)')
plt.xlim(-3, 3)
plt.ylim(-3, 3);

In [None]:
#:
(
    hybrid_su
    .select('acceleration (su)', 'msrp (su)')
    .with_column('product of su', hybrid_su.column('acceleration (su)') * hybrid_su.column('msrp (su)'))
)

In [None]:
#:
r = np.mean(hybrid_su.column('acceleration (su)') * hybrid_su.column('msrp (su)'))
r

## The Correlation Coefficient $r$

* Measures how clustered points are around a straight line (linear association)
* Based on standard units
* $-1 \leq r \leq 1$
    - $r = 1$: scatterplot (in standard units) is a line of slope 1.
    - $r = -1$: scatterplot (in standard units) is a line of slope -1.
* $r = 0$: no linear association; *uncorrelated*.

### Calculate the $r$ for `mpg` and `msrp`

In [None]:
hybrid_su.scatter('mpg (su)', 'msrp (su)')
# Draw horizontal and vertical reference lines at their default position (0),
# which in standard units is the mean of each variable.
plt.axhline(color='C2', zorder=0)
# The trailing semicolon suppresses the Line2D repr, matching the other
# standard-unit scatter cells in this notebook.
plt.axvline(color='C2', zorder=0);

In [None]:
#:
(
    hybrid_su
    .select('mpg (su)', 'msrp (su)')
    .with_column('product of su', hybrid_su.column('mpg (su)') * hybrid_su.column('msrp (su)'))
)

In [None]:
#:
r = np.mean(hybrid_su.column('mpg (su)') * hybrid_su.column('msrp (su)'))
r

## Scatterplots with given correlation coefficients

In [None]:
#:
def r_scatter(r):
    """Generate a scatter plot with a correlation approximately r.

    Draws 1000 points (x, y) on a new 5x5 figure, where x and z are
    independent standard normal samples and y = r*x + sqrt(1 - r**2)*z.
    Both axes are limited to [-4, 4] and the figure is titled with r.

    r must lie in [-1, 1]; outside that range sqrt(1 - r**2) is not a
    real number.
    """
    plt.figure(figsize=(5,5))
    x = np.random.normal(0, 1, 1000)
    z = np.random.normal(0, 1, 1000)
    # Mixing x with independent noise z in these proportions keeps y at unit
    # variance and makes the correlation between x and y equal to r in expectation.
    y = r*x + (np.sqrt(1-r**2))*z
    plt.scatter(x, y)
    plt.xlim(-4, 4)
    plt.ylim(-4, 4)
    plt.suptitle('r = %f' %r, fontsize=14)

In [None]:
#:
# Show seven scatter plots ranging from r = 1 down to r = -1.
# The loop variable is deliberately not named `r`, so the correlation
# coefficient `r` computed earlier in the notebook is not overwritten.
for target_r in np.linspace(1, -1, 7):
    r_scatter(target_r)

### Discussion Question
Does the following scatter plot show:

- A. Association and correlation
- B. Association but not correlation
- C. Correlation but not association
- D. Neither association nor correlation

In [None]:
x2 = Table().with_columns(
    'x', np.arange(-6, 6.1, 0.5), 
    'y', np.arange(-6, 6.1, 0.5)**2)
x2.scatter('x', 'y')

### Answer

In [None]:
products = standard_units(x2.column('x')) * standard_units(x2.column('y'))
products

In [None]:
np.mean(products)

In [None]:
plt.hist(products);

## Effects of outliers

In [None]:
def correlation(t, label_x, label_y):
    """Return the correlation coefficient r of two columns of table t.

    r is the mean of the element-wise products of the columns labeled
    `label_x` and `label_y`, after each has been converted to standard units.
    """
    su_products = standard_units(t.column(label_x)) * standard_units(t.column(label_y))
    return np.mean(su_products)

In [None]:
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4),
        'y', make_array(1, 2, 3, 4)
    )
line.scatter('x', 'y', s=30, color='r')

In [None]:
correlation(line, 'x', 'y')

In [None]:
outlier = Table().with_columns(
        'x', make_array(1, 2, 3, 4, 5),
        'y', make_array(1, 2, 3, 4, 0)
    )
outlier.scatter('x', 'y', s=30, color='r')

In [None]:
correlation(outlier, 'x', 'y')

# Examples

### Hybrids

In [None]:
hybrid.scatter('mpg', 'msrp')

In [None]:
correlation(hybrid, 'mpg', 'msrp')

In [None]:
hybrid.scatter('acceleration', 'msrp')

In [None]:
correlation(hybrid, 'acceleration', 'msrp')

### Ecological Correlations

Careful! A correlation of populations does not reflect the strength of the relationship for individuals.

In [None]:
sat2014 = Table.read_table('sat2014.csv').sort('State')
sat2014

In [None]:
sat2014.scatter('Critical Reading', 'Math')

In [None]:
correlation(sat2014, 'Critical Reading', 'Math')