In [None]:
# HIDDEN
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Lecture 17: Least Squares

In [None]:
sample = [
    [131, 14431],
    [231, 20558],
    [392, 40935],
    [157, 23524],
]


def lw_errors(slope, intercept):
    """Draw the little_women scatter plot, a candidate line, and sample errors.

    slope, intercept: coefficients of the line
        Characters = slope * Periods + intercept.

    The line is drawn between x = 50 and x = 450. For every (x, y) pair in
    the module-level `sample` list, a red vertical segment joins the point
    to the line, showing the error of the line at that point.
    """
    little_women.scatter('Periods', 'Characters')
    line_x = np.array([50, 450])
    line_y = slope * line_x + intercept
    plots.plot(line_x, line_y, lw=2)
    for px, py in sample:
        height_on_line = slope * px + intercept
        plots.plot([px, px], [py, height_on_line], color='r', lw=2)

### Discussion Question ###

- Midterm: Average 70, SD 10
- Final: Average 50, SD 12
- Correlation: $r = 0.75$

a) Estimate the average final exam score for those who got 90 on the midterm.

b) Estimate the average final exam score for those who got 60 on the midterm.

### Regression Line ###

In [None]:
def standard_units(w):
    """Convert the array w to standard units.

    The average of w is subtracted from every value and the result is
    divided by the standard deviation of w as computed by np.std.
    """
    center = np.average(w)
    spread = np.std(w)
    return (w - center) / spread

# t is a table; x and y are column labels

def correlation(t, x, y):
    """Return the correlation between columns x and y of table t.

    Both columns are converted to standard units; the result is the
    average of the element-wise products.
    """
    x_su = standard_units(t.column(x))
    y_su = standard_units(t.column(y))
    return np.average(x_su * y_su)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    t is a table; x and y are column labels. The slope is
    r * SD(y) / SD(x), where r is the correlation of the two columns.
    """
    sd_x = np.std(t.column(x))
    sd_y = np.std(t.column(y))
    return correlation(t, x, y) * sd_y / sd_x

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    t is a table; x and y are column labels. The intercept is
    average(y) - slope * average(x).
    """
    mean_x = np.average(t.column(x))
    mean_y = np.average(t.column(y))
    return mean_y - slope(t, x, y) * mean_x

In [None]:
def fitted_values(t, x, y):
    """Return an array of the regression estimates at every x value in t.

    t is a table; x and y are column labels. Each estimate is
    slope * x_value + intercept for the regression of y on x.
    """
    x_values = t.column(x)
    return slope(t, x, y) * x_values + intercept(t, x, y)

In [None]:
# Read the data and make 'Periods' the first column, then display the table.
little_women = Table.read_table('little_women.csv').move_to_start('Periods')
little_women

In [None]:
# Scatter plot with 'Periods' on the horizontal axis and 'Characters' on the vertical axis.
little_women.scatter('Periods', 'Characters')

In [None]:
# Correlation between the 'Periods' and 'Characters' columns.
correlation(little_women, 'Periods', 'Characters')

In [None]:
# Regression estimate of 'Characters' at each 'Periods' value in the table.
predicted = fitted_values(little_women, 'Periods', 'Characters')

In [None]:
# Add the regression estimates as a new column, then plot the other columns
# against 'Periods' so the estimates can be compared with the actual values.
lw_with_predictions = little_women.with_column('Linear Prediction', predicted)
lw_with_predictions.scatter('Periods')

In [None]:
# Error of each estimate: actual value minus the regression estimate.
actual = little_women.column('Characters')
errors = actual - predicted

In [None]:
# Display the table with an added 'Error' column; the result is not assigned to a name.
lw_with_predictions.with_column('Error', errors)

In [None]:
# Root mean squared error: square the errors, average them, take the square root.
np.mean(errors ** 2) ** 0.5

In [None]:
# Slope and intercept of the regression line for predicting 'Characters' from 'Periods'.
lw_reg_slope = slope(little_women, 'Periods', 'Characters')
lw_reg_intercept = intercept(little_women, 'Periods', 'Characters')

In [None]:
# Draw the regression line and its errors at the sample points.
lw_errors(lw_reg_slope, lw_reg_intercept)

In [None]:
# lw_errors accepts any slope and any intercept, not only the regression line's.

lw_errors(50, 10000)

In [None]:
# Another candidate line, this one with a negative slope.
lw_errors(-100, 50000)

### Root Mean Square Error ###

In [None]:
def lw_rmse(slope, intercept):
    """Plot the line with the given slope and intercept and print its RMSE.

    lw_errors draws the scatter plot, the line, and the sample errors.
    The root mean squared error of the line's estimates of 'Characters'
    from 'Periods', over every row of little_women, is then printed.
    Nothing is returned.

    NOTE: a later cell in this notebook defines another function with this
    same name, which replaces this one once that cell has run.
    """
    lw_errors(slope, intercept)
    periods = little_women.column('Periods')
    characters = little_women.column('Characters')
    residuals = characters - (slope * periods + intercept)
    print("Root mean squared error:", np.mean(residuals ** 2) ** 0.5)

In [None]:
# Plot the line with slope 50 and intercept 10000 and print its RMSE.
lw_rmse(50, 10000)

In [None]:
# Plot the line with slope -100 and intercept 50000 and print its RMSE.
lw_rmse(-100, 50000)

In [None]:
# Plot the line with slope 90 and intercept 4000 and print its RMSE.
lw_rmse(90, 4000)

In [None]:
# Plot the regression line and print its RMSE, for comparison with the lines above.
lw_rmse(lw_reg_slope, lw_reg_intercept)

### Numerical Optimization ###

In [None]:
# Plot y = (x - 2)**2 + 3 at x values starting at 1 and stepping by 0.1 toward 3.
x = np.arange(1, 3, 0.1)
y = (x-2)**2 + 3
# with_column takes a single label/values pair; with_columns (plural) is the
# method that accepts several label/values pairs in one call.
Table().with_columns('x', x,
                     'y', y).plot('x')

In [None]:
def f(x):
    """Return (x - 2)**2 + 3, a parabola whose smallest value, 3, occurs at x = 2."""
    shifted = x - 2
    return shifted ** 2 + 3

In [None]:
# Numerically search for the x at which f is smallest; from the formula for f, that is x = 2.
minimize(f)

### Minimizing RMSE ###

In [None]:
def lw_rmse(any_slope, any_intercept):
    """Return the RMSE of the line with the given slope and intercept.

    The line estimates 'Characters' from 'Periods' in little_women; the
    result is the square root of the mean of the squared errors.
    """
    periods = little_women.column('Periods')
    characters = little_women.column('Characters')
    squared_errors = (characters - (any_slope*periods + any_intercept)) ** 2
    return np.mean(squared_errors) ** 0.5

In [None]:
# RMSE of the line with slope 50 and intercept 10000.
lw_rmse(50, 10000)

In [None]:
# RMSE of the line with slope -100 and intercept 50000.
lw_rmse(-100, 50000)

In [None]:
# Numerically search for the (slope, intercept) pair that makes lw_rmse smallest.
minimize(lw_rmse)

In [None]:
# Slope and intercept from the regression formulas, for comparison with the minimize result.
lw_reg_slope, lw_reg_intercept

### Nonlinear Regression ###

In [None]:
# Load the shot put data from shotput.csv.
shotput = Table.read_table('shotput.csv')

In [None]:
# Display the table.
shotput

In [None]:
# Scatter plot with 'Weight Lifted' on the horizontal axis.
shotput.scatter('Weight Lifted')

In [None]:
def shotput_linear_rmse(any_slope, any_intercept):
    """Return the RMSE of a line used to estimate shot put distance.

    The line any_slope * 'Weight Lifted' + any_intercept is compared with
    'Shot Put Distance' over every row of shotput; the result is the
    square root of the mean of the squared errors.
    """
    lifted = shotput.column('Weight Lifted')
    distance = shotput.column('Shot Put Distance')
    squared_errors = (distance - (any_slope*lifted + any_intercept)) ** 2
    return np.mean(squared_errors) ** 0.5

In [None]:
# Numerically find the slope and intercept that minimize the linear RMSE.
# The result holds the slope at index 0 and the intercept at index 1,
# matching the parameter order of shotput_linear_rmse.
best_line = minimize(shotput_linear_rmse)
best_line

In [None]:
# First column of shotput -- presumably 'Weight Lifted', since it is used as x for the fits; confirm.
weights = shotput.column(0)

In [None]:
# Estimates from the best line: slope * weights + intercept.
linear_fit = best_line.item(0)*weights + best_line.item(1)

# Plot the other columns, including the line's estimates, against the first column.
shotput.with_column(
    'Best Line', linear_fit
).scatter(0)

**Quadratic Function**

$$
f(x) ~=~ ax^2 + bx + c
$$
for constants $a$, $b$, and $c$.



In [None]:
def shotput_quadratic_rmse(a, b, c):
    """Return the RMSE of a quadratic curve used to estimate shot put distance.

    The curve a*x**2 + b*x + c, with x taken from 'Weight Lifted', is
    compared with 'Shot Put Distance' over every row of shotput; the
    result is the square root of the mean of the squared errors.
    """
    lifted = shotput.column('Weight Lifted')
    distance = shotput.column('Shot Put Distance')
    curve = a*(lifted**2) + b*lifted + c
    squared_errors = (distance - curve) ** 2
    return np.mean(squared_errors) ** 0.5

In [None]:
# Numerically find the coefficients a, b, c that minimize the quadratic RMSE.
# The result holds them at indices 0, 1, 2, matching the parameter order
# of shotput_quadratic_rmse.
best_quad = minimize(shotput_quadratic_rmse)
best_quad

In [None]:
# x = weight lifted = 100 kg
# Then predicted shot put distance, computed from the fitted coefficients in
# best_quad rather than from hand-copied, rounded values:

best_quad.item(0)*(100**2) + best_quad.item(1)*100 + best_quad.item(2)

In [None]:
# Estimates from the best quadratic curve: a*weights**2 + b*weights + c.
quad_fit = best_quad.item(0)*(weights**2) + best_quad.item(1)*weights + best_quad.item(2)

In [None]:
# Plot the other columns, including the quadratic estimates, against the first column.
shotput.with_column('Best Quadratic Curve', quad_fit).scatter(0)