In [1]:
from collections import Counter
import matplotlib.pyplot as plt
import random
import math
import numpy as np
import gradient_descent as gd

In [2]:
def mean(x):
    """Return the arithmetic mean of the values in x.

    x must support both sum() and len(). An empty x raises
    ZeroDivisionError because the count is zero.
    """
    total = sum(x)
    count = len(x)
    return total / count
def de_mean(x):
    """Return a new list holding each element of x minus mean(x).

    The result has mean zero; x itself is not modified.
    """
    center = mean(x)
    return [value - center for value in x]
def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n.

    Elements are paired with zip, so if the inputs differ in length the
    extra trailing elements of the longer one are ignored.
    """
    total = 0
    for left, right in zip(v, w):
        total = total + left * right
    return total
def sum_of_squares(x):
    """Return x_1 * x_1 + ... + x_n * x_n (0 for an empty x)."""
    return sum(value * value for value in x)
def variance(x):
    """Return the sample variance of x (sum of squared deviations / (n - 1)).

    Needs at least two elements: with a single element the denominator
    is zero and ZeroDivisionError is raised.
    """
    degrees_of_freedom = len(x) - 1
    return sum_of_squares(de_mean(x)) / degrees_of_freedom
def standard_deviation(x):
    """Return the sample standard deviation: the square root of variance(x)."""
    sample_variance = variance(x)
    return math.sqrt(sample_variance)
def covariance(x, y):
    """Return the sample covariance of x and y.

    Computed as the dot product of the two de-meaned series divided by
    len(x) - 1.
    """
    degrees_of_freedom = len(x) - 1
    centered_x = de_mean(x)
    centered_y = de_mean(y)
    return dot(centered_x, centered_y) / degrees_of_freedom
def correlation(x, y):
    """Return the sample correlation of x and y.

    Computed as covariance(x, y) divided by both standard deviations.

    Returns 0 when either series has zero standard deviation. The
    correlation is undefined in that case; returning 0 (instead of the
    implicit None of a bare ``return``) keeps the result numeric so it
    can be used in arithmetic and comparisons without a None check.
    """
    stdev_x = standard_deviation(x)
    stdev_y = standard_deviation(y)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(x, y) / stdev_x / stdev_y
    # No variation in at least one series: report "no correlation".
    return 0

In [3]:
# Observations stored as parallel lists (40 entries each): index i in every
# list refers to the same observation, since they are zipped together below.
# num_friends: number of friends (feature).
num_friends = [10, 49, 41, 40, 25, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8, 10, 11, 11, 12, 12,
15, 15, 15, 18, 20, 20]
# daily_min: regression target, passed as y to estimate_beta
# (presumably minutes per day -- TODO confirm units).
daily_min = [18, 39, 37, 35, 28, 7, 9, 8, 7, 8, 10, 11, 12, 9, 13, 15, 14, 14, 17, 16, 15, 19, 16, 21, 17, 19, 22, 18, 21, 20, 24, 23, 22, 25,
25, 27, 29, 28, 30, 32]
# work_hours: work hours per day (feature).
work_hours = [8, 6, 6, 6, 6, 9, 10, 10, 12, 11, 10, 9, 9, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 7, 7, 7, 7, 7, 7,
7, 6, 6, 6, 6, 6]
# has_phd: 1 if the observation has a PhD, 0 if not (feature).
has_phd = [0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]

In [4]:
def predict(x_i, beta):
    """Return the linear-model prediction dot(x_i, beta) for one observation.

    Assumes the first element of x_i is 1, so that beta[0] acts as the
    constant term.
    """
    prediction = dot(x_i, beta)
    return prediction

In [5]:
# Example of a single feature vector in the layout predict() expects
# (leading 1 for the constant term, followed by the features).
[1, # constant term
49, # number of friends
4, # work hours per day
0] # doesn't have PhD

[1, 49, 4, 0]

In [6]:
# Design matrix: one row per observation, laid out as
# [1 (constant term), num_friends, work_hours, has_phd].
x = [
    [1, friends, hours, phd]
    for friends, hours, phd in zip(num_friends, work_hours, has_phd)
]

In [7]:
# Peek at the first three rows to check the [1, num_friends, work_hours, has_phd] layout.
x[:3]

[[1, 10, 8, 0], [1, 49, 6, 0], [1, 41, 6, 1]]

In [8]:
def error(x_i, y_i, beta):
    """Return the residual for one observation: actual y_i minus predict(x_i, beta)."""
    predicted = predict(x_i, beta)
    return y_i - predicted
def squared_error(x_i, y_i, beta):
    """Return the square of the residual error(x_i, y_i, beta) for one observation."""
    residual = error(x_i, y_i, beta)
    return residual ** 2
def squared_error_gradient(x_i, y_i, beta):
    """Return the gradient (with respect to beta) of the ith squared error term.

    Component j is -2 * x_i[j] * error(x_i, y_i, beta), so the result is
    a list with one entry per element of x_i.
    """
    # The residual does not depend on j; compute it once instead of once
    # per component (each error() call is itself a full dot product).
    residual = error(x_i, y_i, beta)
    return [-2 * x_ij * residual for x_ij in x_i]


In [9]:
def estimate_beta(x, y):
    """Estimate the coefficient vector beta for the rows x and targets y.

    Draws one random.random() starting value per element of x[0], then
    returns whatever gd.minimize_stochastic produces when given
    squared_error as the target, squared_error_gradient as its gradient,
    the data, that starting point and the constant 0.001.

    NOTE(review): 0.001 is presumably the initial step size -- confirm
    against the gradient_descent module.
    """
    # One random starting coefficient per column of the design matrix.
    starting_beta = [random.random() for _ in x[0]]
    return gd.minimize_stochastic(
        squared_error,
        squared_error_gradient,
        x,
        y,
        starting_beta,
        0.001,
    )

In [10]:
# Seed Python's random module so the random starting beta drawn inside
# estimate_beta is the same on every run.
random.seed(0)
# NOTE(review): the beta recorded in this notebook's output appears to equal
# the first four random.random() draws after seed(0), i.e. the untouched
# initial guess -- verify that gd.minimize_stochastic actually improves on
# the starting point with this setup.
beta = estimate_beta(x, daily_min)

In [11]:
# Show the estimated coefficients; their order follows each row of x:
# [constant term, num_friends, work_hours, has_phd].
beta

[0.8444218515250481,
 0.7579544029403025,
 0.420571580830845,
 0.25891675029296335]

In [12]:
def total_sum_of_squares(y):
    """Return the sum of squared deviations of the y_i's from their mean."""
    total = 0
    for deviation in de_mean(y):
        total = total + deviation ** 2
    return total
def multiple_r_squared(x, y, beta):
    """Return R-squared: 1 minus (sum of squared errors / total sum of squares).

    The squared errors are the squared residuals of the model beta over
    the paired rows of x and values of y. Raises ZeroDivisionError when
    the total sum of squares of y is zero.
    """
    residual_sum_of_squares = sum(
        error(x_i, y_i, beta) ** 2 for x_i, y_i in zip(x, y)
    )
    total_variation = total_sum_of_squares(y)
    return 1.0 - (residual_sum_of_squares / total_variation)

In [13]:
# Report R-squared of the fitted beta on the same data it was estimated from.
# NOTE(review): the recorded value (~0.001) would mean the model explains
# almost none of the variation in daily_min -- check the fit before relying on it.
print("R2 = ", multiple_r_squared(x, daily_min, beta))

R2 =  0.001218592868417634
