In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import linear_model
sns.set_style("whitegrid")

In [None]:
%matplotlib inline

In [None]:
# Load the survey file into a DataFrame. No separator is passed, so pandas
# parses it as comma-delimited despite the ".dat" extension.
# NOTE(review): presumably the 2013 ATUS activity-summary file -- confirm the source.
summary = pd.read_csv("atussum_2013.dat")
# Print the column names, dtypes and non-null counts as a quick sanity check.
summary.info()

def activity_by_age(df, activity_code, subsample=True, age_step=5):
    """Return the weighted mean minutes spent on one activity for each age.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the columns ``TUFINLWGT`` (used as the
        weight), ``TEAGE`` (used as the age) and ``t<activity_code>``
        (used as the minutes).
    activity_code : str or int
        Suffix of the activity column; the column read is
        ``"t{}".format(activity_code)``.
    subsample : bool, default True
        When true, keep only rows whose age is a multiple of ``age_step``.
    age_step : int, default 5
        Spacing between the ages that are kept when ``subsample`` is true.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``age`` with a single ``mean_minutes`` column equal to
        ``sum(weight * minutes) / sum(weight)`` within each age.

    Raises
    ------
    KeyError
        If any of the three required columns is missing from ``df``.
    """
    activity_col = "t{}".format(activity_code)
    per_person = df[['TUFINLWGT', 'TEAGE', activity_col]].rename(
        columns={"TUFINLWGT": "weight", "TEAGE": "age", activity_col: "minutes"})
    if subsample:
        per_person = per_person[per_person.age % age_step == 0]
    # assign() returns a brand-new frame, so we never write a column into the
    # filtered slice above (which is what raises SettingWithCopyWarning).
    per_person = per_person.assign(
        weighted_minutes=per_person.weight * per_person.minutes)
    # Only the two totals needed for the weighted mean are summed.
    totals = per_person.groupby("age")[['weight', 'weighted_minutes']].sum()
    totals['mean_minutes'] = totals.weighted_minutes / totals.weight
    return totals[['mean_minutes']]

def linear_least_squares(df, fn):
    """Half the mean squared error of ``fn`` against ``df.mean_minutes``.

    ``fn`` is applied to every value of ``df.index`` to obtain a prediction;
    the result is ``sum((mean_minutes - prediction) ** 2) / (2 * n)`` where
    ``n`` is the number of rows in ``df``.
    """
    predicted = df.index.map(fn)
    residuals = df.mean_minutes - predicted
    n_obs = len(residuals)
    return (residuals ** 2).sum() / (2 * n_obs)

In [None]:
# Keep only the weight, age, sex and t010101 columns, give them readable
# names, and add weight * minutes so weighted means can be formed later.
sleep_data = (
    summary[['TUFINLWGT', 'TEAGE', 'TESEX', 't010101']]
    .rename(columns={"TUFINLWGT": "weight", "TEAGE": "age", "TESEX": "sex", "t010101": "minutes"})
    .assign(weighted_minutes=lambda frame: frame.weight * frame.minutes)
)
sleep_data.info()

In [None]:
# Column-wise totals per age. Every column is summed; the cells visible here
# only go on to use the `weight` and `weighted_minutes` totals.
sleep_by_age = sleep_data.groupby(by="age").sum()
sleep_by_age.head()

In [None]:
# Weighted mean per age = summed (weight * minutes) / summed weight.
sleep_by_age['mean_minutes'] = sleep_by_age['weighted_minutes'].div(sleep_by_age['weight'])
sleep_by_age.head()

In [None]:
# Weighted mean for age 38, converted from minutes to hours.
sleep_by_age.loc[38, 'mean_minutes'] / 60

In [None]:
# Scatter of weighted mean minutes (y) against age (x) before fitting anything.
plt.scatter(x=sleep_by_age.index, y=sleep_by_age.mean_minutes)

In [None]:
# Straight-line fit of mean minutes against age.
# The ages are reshaped into a single-column 2-D array (one row per age),
# which is the layout the estimator's fit() is given here.
transposed_sleep_values = np.array(sleep_by_age.index.values)[:, np.newaxis]
regression = linear_model.LinearRegression().fit(
    transposed_sleep_values,
    sleep_by_age.mean_minutes.values,
)
print(regression.coef_, regression.intercept_)

In [None]:
def plot_regression(fn):
    """Scatter ``sleep_by_age`` and overlay the curve produced by ``fn``.

    ``fn`` is called once per x value with a single number and must return a
    single number. The curve is sampled at 100 evenly spaced points across
    the x-limits that the scatter plot ends up with. Reads the notebook-level
    ``sleep_by_age`` frame.
    """
    ages = sleep_by_age.index
    plt.scatter(ages, sleep_by_age.mean_minutes)
    left, right = plt.xlim()
    grid = np.linspace(left, right, 100)
    curve = list(map(fn, grid))
    plt.plot(grid, curve)

In [None]:
# scikit-learn's predict() expects a 2-D (n_samples, n_features) array and
# rejects a bare scalar, so the single age is wrapped as one row with one
# feature; [0] then unwraps the single prediction.
fn = lambda x: regression.predict([[x]])[0]

plot_regression(fn)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
# Degree-2 polynomial fit: expand age into polynomial features, then fit a
# linear model on them. fit_intercept=False is passed to the linear step.
model = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression(fit_intercept=False)),
]).fit(
    np.array(sleep_by_age.index.values).reshape(-1, 1),
    sleep_by_age.mean_minutes.values,
)

print(model.named_steps['linear'].coef_)

In [None]:
# predict() needs a 2-D (n_samples, n_features) input, so the scalar age is
# wrapped as a one-row, one-column list; [0] unwraps the single prediction.
fn = lambda x: model.predict([[x]])[0]
plot_regression(fn)

In [None]:
# Predicted value for age 38, divided by 60 (minutes -> hours). The age is
# passed as a 1x1 2-D input because predict() does not accept a bare scalar.
model.predict([[38]]) / 60

In [None]:
# Predicted value for age 50, divided by 60 (minutes -> hours). The age is
# passed as a 1x1 2-D input because predict() does not accept a bare scalar.
model.predict([[50]]) / 60

In [None]:
# Degree-0 polynomial features followed by a linear fit with
# fit_intercept=False.
model = Pipeline([('poly', PolynomialFeatures(degree=0)),
                  ('linear', LinearRegression(fit_intercept=False))])
model = model.fit(np.array(sleep_by_age.index.values).reshape((-1, 1)),
                  sleep_by_age.mean_minutes.values)

print(model.named_steps['linear'].coef_)

# predict() needs a 2-D (n_samples, n_features) input, so the scalar age is
# wrapped as a one-row, one-column list; [0] unwraps the single prediction.
fn = lambda x: model.predict([[x]])[0]

plot_regression(fn)

In [None]:
# Degree-1 polynomial features followed by a linear fit with
# fit_intercept=False.
model = Pipeline([('poly', PolynomialFeatures(degree=1)),
                  ('linear', LinearRegression(fit_intercept=False))])
model = model.fit(np.array(sleep_by_age.index.values).reshape((-1, 1)),
                  sleep_by_age.mean_minutes.values)

print(model.named_steps['linear'].coef_)

# predict() needs a 2-D (n_samples, n_features) input, so the scalar age is
# wrapped as a one-row, one-column list; [0] unwraps the single prediction.
fn = lambda x: model.predict([[x]])[0]

plot_regression(fn)

In [None]:

from sklearn.linear_model import Ridge

# Degree-5 polynomial features fed into a Ridge regression built with its
# default arguments.
# NOTE(review): the polynomial features are not rescaled before Ridge, so
# high powers of age will be very large -- confirm the fit is well-conditioned.
model = Pipeline([('poly', PolynomialFeatures(degree=5)),
                  ('linear', Ridge())])
model = model.fit(np.array(sleep_by_age.index.values).reshape((-1, 1)),
                  sleep_by_age.mean_minutes.values)

print(model.named_steps['linear'].coef_)

# predict() needs a 2-D (n_samples, n_features) input, so the scalar age is
# wrapped as a one-row, one-column list; [0] unwraps the single prediction.
fn = lambda x: model.predict([[x]])[0]

plot_regression(fn)