In [None]:
from datascience import *
import numpy as np

In [None]:
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In this lecture, I am going to use more interactive plots (they look better) so I am using the plotly.express library.  We won't test you on this but it's good to know.

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Lecture 11

In this lecture, we derive the equation for linear regression using the correlation coefficient $r$.

## Recap From Last Lecture

In the previous lecture, we introduced the correlation coefficient: 

\begin{align}
r 
& = \text{Mean}\left(\text{StandardUnits}(x) *  \text{StandardUnits}(y)\right)\\
& = \frac{1}{n} \sum_{i=1}^n \text{StandardUnits}(x_i) *  \text{StandardUnits}(y_i)\\
& = \frac{1}{n}\sum_{i=1}^n \left( \frac{x_i - \text{Mean}(x)}{\text{Stdev}(x)} \right) * \left( \frac{y_i - \text{Mean}(y)}{\text{Stdev}(y)} \right) \\
\end{align}

We implemented the correlation coefficient:

In [None]:
def standard_units(x):
    """Convert any array of numbers to standard units.

    A value in standard units says how many standard deviations the
    original value lies above (positive) or below (negative) the mean:

        (x_i - Mean(x)) / Stdev(x)

    Args:
        x: array-like of numbers.

    Returns:
        A NumPy float array with one standardized value per element of x.
        If every element of x is identical, Stdev(x) is 0 and the result
        is all NaN (NumPy emits a RuntimeWarning rather than raising).
    """
    values = np.asarray(x, dtype=float)
    # np.std is the population SD (divides by n), which matches the 1/n
    # averaging used in the definition of the correlation coefficient.
    return (values - np.mean(values)) / np.std(values)

In [None]:
def correlation(t, x, y):
    """Return the correlation coefficient r between two columns of a table.

    r is the mean of the products of the two columns after each has been
    converted to standard units.

    Args:
        t: a table exposing ``t.column(label)``.
        x: label of the first column.
        y: label of the second column.

    Returns:
        The correlation coefficient as a float between -1 and 1. The result
        is NaN if either column is constant (its standard deviation is 0).
    """
    x_values = np.asarray(t.column(x), dtype=float)
    y_values = np.asarray(t.column(y), dtype=float)
    # Standardize inline so this function is self-contained; np.std is the
    # population SD, consistent with the 1/n mean taken below.
    x_su = (x_values - np.mean(x_values)) / np.std(x_values)
    y_su = (y_values - np.mean(y_values)) / np.std(y_values)
    return np.mean(x_su * y_su)

We built an intuition about the correlation coefficient using the following code which you don't need to understand:

In [None]:
def make_correlated_data(r, n=500):
    """Generate a table with columns x and y whose correlation is approximately r.

    x and an independent noise term z are both drawn from a standard normal
    distribution; y mixes them as r*x + sqrt(1 - r**2)*z.

    Args:
        r: target correlation, between -1 and 1 inclusive.
        n: number of rows to generate.

    Returns:
        A Table with n rows and the columns "x" and "y".

    Raises:
        ValueError: if r is outside [-1, 1] (or NaN); sqrt(1 - r**2) would
            otherwise silently fill y with NaN.
    """
    if not -1 <= r <= 1:
        raise ValueError(f"r must be between -1 and 1, got {r!r}")
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r * x + np.sqrt(1 - r**2) * z
    return Table().with_columns("x", x, "y", y)

def r_scatter(r, n=500, ax=None):
    """Generate a scatter plot with a correlation approximately r.

    Args:
        r: target correlation, between -1 and 1.
        n: number of points to draw.
        ax: optional axes object to draw on. When omitted, a new 5x5
            figure is created and the points are drawn with pyplot.

    Both axes are fixed to the range [-4, 4] so that plots for different
    values of r can be compared side by side.
    """
    x = np.random.normal(0, 1, n)
    z = np.random.normal(0, 1, n)
    y = r * x + np.sqrt(1 - r**2) * z
    if ax is not None:
        # Draw on the caller's axes; creating a figure here would leave a
        # stray empty figure behind for every subplot.
        ax.scatter(x, y, color='darkblue', s=20)
        ax.set_xlim(-4, 4)
        ax.set_ylim(-4, 4)
    else:
        plots.figure(figsize=(5, 5))
        plots.scatter(x, y, color='darkblue', s=20)
        plots.xlim(-4, 4)
        plots.ylim(-4, 4)

In [None]:
# 2x3 grid of scatter plots: positive correlations on the top row,
# the matching negative correlations on the bottom row.
fig, ax = plots.subplots(2, 3, dpi=80, figsize=(16,9))
n = 500
panel_correlations = [0.2, 0.5, 0.8, -0.2, -0.5, -0.8]
# ax.flat walks the grid row by row, so the order above fills the top row first.
for panel, panel_r in zip(ax.flat, panel_correlations):
    r_scatter(panel_r, n, panel)
fig.tight_layout(pad=1)

---
<center>Return to Slides</center>

---

## Care when Interpreting the Correlation

When computing correlation it is important to always visualize your data first and then consider each of the following issues.


### Correlation does Not Imply Causation

We have covered this one extensively at this point.  

### Nonlinearity

Low correlation does not imply the absence of a relationship. Correlation measures only linear association, so data with a strong nonlinear relationship may have very low correlation.  

In [None]:
# x values from -4 to 4 in steps of 0.5; the stop of 4.1 is just past 4
# so that 4.0 itself is included (np.arange excludes the stop value).
new_x = np.arange(-4, 4.1, 0.5)
# y is an exact (noise-free) function of x: a parabola.
nonlinear = Table().with_columns(
        'x', new_x,
        'y', new_x**2
    )
nonlinear.scatter('x', 'y')

There is clearly a relationship in this data: given the value of $x$, you can easily predict the value of $y$.  What is the correlation?

In [None]:
# Linear correlation of the parabola data; compare it with the pattern in the plot.
correlation(nonlinear, 'x', 'y')

### Outliers

Outliers can have a significant effect on correlation.  

In [None]:
# Four points lying exactly on the line y = x.
line = Table().with_columns(
        'x', make_array(1, 2, 3, 4),
        'y', make_array(1, 2, 3, 4)
    )
line.scatter('x', 'y')

In [None]:
# Correlation of the perfectly linear data.
correlation(line, 'x', 'y')

In [None]:
# The same four points on y = x plus one extra point, (5, 0), far off the line.
outlier = Table().with_columns(
        'x', make_array(1, 2, 3, 4, 5),
        'y', make_array(1, 2, 3, 4, 0)
    )
outlier.scatter('x', 'y')

In [None]:
# Correlation after adding the single outlier; compare with the value for `line`.
correlation(outlier, 'x', 'y')

### Ecological Correlations

The correlation between aggregated variables (e.g., after grouping) may be much higher than the correlation between the underlying variables.

In [None]:
# Load the 2014 SAT table from CSV and order its rows by the 'State' column.
sat2014 = Table.read_table('data/sat2014.csv').sort('State')
sat2014

In [None]:
# One dot per table row: 'Critical Reading' on the x-axis, 'Math' on the y-axis.
sat2014.scatter('Critical Reading', 'Math')

In [None]:
# Correlation between the 'Critical Reading' and 'Math' columns of the table.
correlation(sat2014, 'Critical Reading', 'Math')

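That is a very strong correlation.  However, each data point is an aggregate that stands in for a large cloud of individual test takers, and individual scores vary much more than the aggregated values do, so the correlation between individual students' scores may be considerably weaker.  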

### Bonus: Understanding the SAT data
While we have the data loaded, does anyone have a guess which dots correspond to which states?

In [None]:
# Interactive version of the scatter plot (plotly needs a DataFrame, hence
# to_df()): hovering over a dot shows its 'State', and the marker size is
# taken from the 'Participation Rate' column.
px.scatter(sat2014.to_df(), 
           x = "Critical Reading",
           y = "Math",
           hover_name = "State",
           size = "Participation Rate")

---
<center>Return to Slides</center>

---

## Prediction Lines

Here we build an intuition about the relationship between the slope of the nearest neighbor line and the correlation coefficient.

### Using the Heights Example

In [None]:
## Load the family height data
families = Table.read_table('data/family_heights.csv')
# 'Parent Average' is the midpoint of the 'father' and 'mother' columns.
parent_avgs = (families.column('father') + families.column('mother'))/2
# Keep only the predictor and the response, with rows sorted by the predictor.
heights = Table().with_columns(
    'Parent Average', parent_avgs,
    'Child', families.column('child'),
).sort("Parent Average")

Here is a slightly more robust Nearest Neighbor predictor

In [None]:
def nn_heights(parent_average, window=0.5):
    """Predict a child's height from the children of similar parents.

    Selects the rows of the global `heights` table whose "Parent Average"
    satisfies are.between(parent_average - window, parent_average + window)
    and returns the mean of their "Child" values, or np.nan when no row
    falls in that window.
    """
    in_window = are.between(parent_average - window, parent_average + window)
    neighbors = heights.where("Parent Average", in_window).column("Child")
    if len(neighbors) > 0:
        return np.mean(neighbors)
    # No data near this parent average: nan ("not a number") is a special
    # floating point value that marks the prediction as missing.
    return np.nan

Make predictions at many different parent heights, not just the heights in the dataset.

In [None]:
# Grid of parent averages from 61 up to (but not including) 74, in steps of 0.2.
test_heights = Table().with_column("Parent Average", np.arange(61,74,0.2))
# Apply nn_heights to every grid value (with its default window).
test_heights = test_heights.with_column(
    "NN Prediction", test_heights.apply(nn_heights, "Parent Average"))

In [None]:
## Plot it all
# Raw data as a scatter plot, with the nearest neighbor predictions on the
# test grid added as a second trace.
fig = px.scatter(heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=test_heights.column("Parent Average"), 
                y=test_heights.column("NN Prediction"), name="NN Prediction")

However, it will be easier to start in standard units.

In [None]:
## Transform the heights data into standard units
su_heights = Table().with_columns(
    "Parent Average", standard_units(heights.column("Parent Average")),
    "Child", standard_units(heights.column("Child")))

## Transform the nearest neighbor predictions to standard units
# The test grid is standardized with the mean and SD of the *heights* data,
# not its own, so that both tables share the same axes. The predictions are
# child heights, so they use the mean and SD of the "Child" column.
su_test_heights = Table().with_columns(
    "Parent Average", 
    (test_heights.column("Parent Average") - heights.column("Parent Average").mean()) 
                        / heights.column("Parent Average").std(),
    "NN Prediction", 
    (test_heights.column("NN Prediction") - heights.column("Child").mean()) 
                        / heights.column("Child").std()) 

## Plot it all
fig = px.scatter(su_heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(x=su_test_heights.column("Parent Average"), 
                y=su_test_heights.column("NN Prediction"), name="NN Prediction")

Computing the correlation we get:

In [None]:
# Correlation between parent average and child height, computed on the original-unit table.
correlation(heights, "Parent Average", "Child")

What happens if we draw a line with that slope:

In [None]:
# Correlation of the standardized data; used below as the slope of a line
# through the origin.
r = correlation(su_heights, "Parent Average", "Child")
# x positions at which the line is drawn: -3 up to (not including) 4, step 0.1.
line_x = np.arange(-3,4,0.1)

fig = px.scatter(su_heights.to_df(), x="Parent Average", y="Child", height=600)
fig.add_scatter(
    x=su_test_heights.column("Parent Average"),
    y=su_test_heights.column("NN Prediction"),
    name="NN Prediction",
)
fig.add_scatter(x=line_x, y=r * line_x, name=f"Line(y={np.round(r,4)} x)")

## The Relationship Between Correlation and NN Predictions

Here we examine the relationship between the nearest neighbor prediction "line" and the correlation for several synthetic datasets.


In [82]:
def make_nn_predictions(table, x, y, window):
    """Compute a nearest neighbor prediction of y for every row of table.

    For each value x_val in column x, the prediction is the mean of column y
    over the rows whose x value satisfies
    are.between(x_val - window, x_val + window); it is np.nan if no row
    qualifies. Returns the result of table.apply, one prediction per row.
    """
    def nn_prediction(x_val):
        in_window = are.between(x_val - window, x_val + window)
        nearby_y = table.where(x, in_window).column(y)
        return np.mean(nearby_y) if len(nearby_y) > 0 else np.nan

    return table.apply(nn_prediction, x)

In [98]:
def draw_line(slope, intercept=0):
    """Draw the line y = slope * x + intercept on the current plot.

    Args:
        slope: slope of the line.
        intercept: value of y at x = 0. Defaults to 0, which gives a line
            through the origin (the regression line in standard units).
    """
    # axline extends the line across the whole axes, so anchoring it at
    # x = 0 works even when x = 0 is outside the visible range.
    plots.axline(xy1=(0, intercept), slope=slope)

def draw_vline(x_pos):
    """Draw a black vertical line at x = x_pos on the current plot."""
    plots.axvline(x=x_pos, color='black')

**Todo**
- use `make_correlated_data` to create a table with data with r=0.99
- plot the data
- compute the nn_predictions and add the nn predictions as a column
- plot the nn predictions
- add a line with a slope of 1 

Now change r to lower values (e.g. 0.5). What is happening?
- draw a vertical line at x=2

## Now let's do this with the family data
- load the data
- compute the parent average
- convert to SU
- compute the nn predictions
- plot them
- compute the correlation
- add a line with slope=r

In [103]:
# Rebuild the heights table from the CSV so this section starts from a
# two-column table (unlike the earlier cell, the rows are not sorted here).
families = Table.read_table('data/family_heights.csv')
parent_avgs = (families.column('father') + families.column('mother'))/2
heights = Table().with_columns('Parent Average', parent_avgs, 'Child', families.column('child'))

---
<center>Return to Slides</center>

---


## Defining the linear regression line

In standard units we developed a simple equation for the regression line:

\begin{align}
\text{SU}(y_\text{predicted}) = r * \text{SU}(x_\text{new})
\end{align}

where $r$ is the correlation coefficient and $\text{SU}$ is the standard units:

\begin{align}
\text{SU}(y_\text{predicted}) & = \frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} \\
\text{SU}(x_\text{new}) &= \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}
\end{align}



Here we use $x_\text{new}$ to indicate a new $x$ value for which we want to make a prediction  $y_\text{predicted}$.

We would like to express this line in the original units of the data.  We can do that by substituting the definition of standard units:

\begin{align}
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} = r *  \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}
\end{align}

While this equation does describe a line, it would look a little nicer in the form:

\begin{align}
y_\text{predicted} = \text{slope} * x_\text{new}  + \text{intercept}
\end{align}

Let's do some algebra to get that equation:
$$
\require{color}
\definecolor{comment}{RGB}{200,100,50}
\begin{align}
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r *  \frac{x_\text{new} - \text{Mean}(x)}{\text{Stdev}(x)}\\
\frac{y_\text{predicted} - \text{Mean}(y)}{\text{Stdev}(y)} &= r * \frac{1}{\text{Stdev}(x)} x_\text{new} - r * \frac{1}{\text{Stdev}(x)}\text{Mean}(x)  & \color{comment} \text{Expanding the right side}\\
y_\text{predicted} - \text{Mean}(y) &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) &  \color{comment} \text{Multiplying by $\text{Stdev}(y)$}\\
y_\text{predicted} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)} x_\text{new} + \text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x) &  \color{comment} \text{Adding $\text{Mean}(y)$}\\
y_\text{predicted} &= \left(r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\right) x_\text{new} + \left(\text{Mean}(y) - r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\text{Mean}(x)\right) &  \color{comment} \text{Rearranging Terms}
\end{align}
$$

This means we can define the slope and intercept as:
\begin{align}
\text{slope} &= r * \frac{\text{Stdev}(y)}{\text{Stdev}(x)}\\
\text{intercept} & = \text{Mean}(y) - \text{slope} * \text{Mean}(x)
\end{align}

## Implementing Linear Regression

Using the above equations implement the slope and intercept functions:

In [None]:
def slope(t, x, y):
    """Computes the slope of the regression line.

    t is a table; x and y are column labels (x is the predictor, y the
    response).

    Exercise: the body is intentionally left as a placeholder. Fill it in
    using

        slope = r * Stdev(y) / Stdev(x)

    where r is the correlation coefficient between the two columns.
    """
    ...

In [None]:
def intercept(t, x, y):
    """Computes the intercept of the regression line.

    t is a table; x and y are column labels (x is the predictor, y the
    response).

    Exercise: the body is intentionally left as a placeholder. Fill it in
    using

        intercept = Mean(y) - slope * Mean(x)

    where slope is the slope of the regression line for the same columns.
    """
    ...

Testing it out

In [None]:
# Synthetic data with correlation of about 0.5, then the slope of its regression line.
example = make_correlated_data(0.5)
slope(example, 'x', 'y')

Computing the slope and intercept for the heights dataset:

In [None]:
# Exercise: replace each ... with a call to slope / intercept on the heights
# table, predicting 'Child' from 'Parent Average'.
heights_slope = ...
heights_intercept = ...
[heights_slope, heights_intercept]

<details>
    
```python
heights_slope = slope(heights, 'Parent Average', 'Child')
heights_intercept = intercept(heights, 'Parent Average', 'Child')
[heights_slope, heights_intercept]
```
</details>

Adding the regression predictions:

In [None]:
# Exercise: replace ... with the regression prediction for every row, i.e.
# slope * (parent average) + intercept.
heights = heights.with_column(
    'Regression Prediction', 
    ...
)
heights

<details>
    
```python
heights = heights.with_column(
    'Regression Prediction', 
    heights_slope*heights.column('Parent Average') + heights_intercept
)
heights
```

</details>

In [None]:
# Plot the other columns of heights against 'Parent Average'.
heights.scatter('Parent Average')
# Regression line through (0, intercept) with the regression slope; uncomment
# once heights_slope and heights_intercept have been computed.
#plots.axline(xy1=(0, heights_intercept), slope=heights_slope)
plots.xlim(60,75)
plots.ylim(55,80)

---
<center>Return to Slides</center>

---

## Discussion Question: Exam Score Prediction

In [None]:
# Summary statistics given in the discussion question.

# X-axis: midterm scores
midterm_mean = 70
midterm_sd = 10

# Y-axis: final scores
final_mean = 50
final_sd = 12

# Correlation (relates X to Y values)
corr = 0.75

# X value
# Midterm score of the student whose final score we want to predict.
midterm_student = 90

In [None]:
# Step 1: convert the student's midterm score to standard units.
midterm_student_su = (midterm_student - midterm_mean) / midterm_sd
midterm_student_su

In [None]:
# Step 2: regression in standard units, SU(y_predicted) = r * SU(x_new).
final_student_su = midterm_student_su * corr
final_student_su

In [None]:
# Step 3: convert the prediction from standard units back to final-exam points.
final_student = final_student_su * final_sd + final_mean
final_student