In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('clean_auto_data.csv')

In [7]:
df[:3]
df.shape

(392, 7)

In [5]:
df['cylinders'] = df['cylinders'].astype('category')

In [6]:
df['model_year'] = df['model_year'].astype('category')

In [9]:
from sklearn.cross_validation import KFold
kf = KFold(len(df), n_folds=10, shuffle=True, random_state=43)
kf

sklearn.cross_validation.KFold(n=392, n_folds=10, shuffle=True, random_state=43)

In [10]:
[i for i in kf]

[(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  13,
          14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  28,
          29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
          42,  43,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  56,
          57,  58,  59,  60,  61,  63,  65,  66,  67,  68,  69,  70,  71,
          72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  83,  84,  85,
          86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,
          99, 100, 101, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
         113, 114, 115, 116, 118, 119, 120, 121, 122, 123, 124, 127, 128,
         129, 131, 132, 133, 134, 135, 137, 139, 140, 141, 144, 145, 146,
         148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
         161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
         175, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
         190, 191, 192, 194, 195, 196,

In [12]:
import patsy
import statsmodels.api as sm
from sklearn.metrics import r2_score

In [13]:

Rsq_10xCV = []

model_def = 'mpg ~ horsepower'

for train_index, test_index in kf:
    train_df = df.iloc[train_index]
    y_train, X_train = patsy.dmatrices(model_def, 
                                       data=train_df, return_type='dataframe')
    model = sm.OLS(y_train, X_train)
    results = model.fit()
    test_df = df.iloc[test_index]
    y_test, X_test = patsy.dmatrices(model_def, 
                                     data=test_df, return_type='dataframe')
    y_predicted = results.predict(X_test)
    Rsq_10xCV.append(r2_score(y_predicted,y_test))





In [15]:
Rsq_10xCV

[0.45534236184183996,
 -0.11285295053836286,
 0.51428650828028077,
 0.46766662364597777,
 0.54881652797734459,
 -0.0061237172613026747,
 0.3893152910609019,
 0.13195843736761581,
 0.23458766863348424,
 -0.034626769658584911]

In [14]:
np.mean(Rsq_10xCV)

0.25883699813491939

In [66]:
# 0.39
# 0.43
# 0.53
# 0.78
# 0.81

In [21]:

outof_sample = []
in_sample = []

model_def = 'mpg ~ horsepower'
alphas = [0.2,0.1,-0.1,-0.2,-0.3,-0.4,-0.5,-0.6,-0.7,-1,-2,-4,-6,-9]

for al in alphas:
    outof_sample = []
    in_sample = []
    for train_index, test_index in kf:
        train_df = df.iloc[train_index]
        y_train, X_train = patsy.dmatrices(model_def, 
                                           data=train_df, return_type='dataframe')
        model = sm.OLS(y_train, X_train)
        results = model.fit_regularized(alpha=al)
        test_df = df.iloc[test_index]
        in_sample.append(r2_score( results.predict(X_train) ,y_train))
        y_test, X_test = patsy.dmatrices(model_def, 
                                         data=test_df, return_type='dataframe')
        y_predicted = results.predict(X_test)
        outof_sample.append(r2_score(y_predicted,y_test))
    print 'alpha',al ,'R^2 outof_sample',np.mean(outof_sample),'R^2 in sample', np.mean(in_sample)




alpha 0.2 R^2 outof_sample 0.0818667741443 R^2 in sample 0.202578121597
alpha 0.1 R^2 outof_sample 0.179700639037 R^2 in sample 0.284060499872
alpha -0.1 R^2 outof_sample 0.322937584136 R^2 in sample 0.402726896436
alpha -0.2 R^2 outof_sample 0.374858400414 R^2 in sample 0.44543139265
alpha -0.3 R^2 outof_sample 0.416848401772 R^2 in sample 0.479761413349
alpha -0.4 R^2 outof_sample 0.450694042096 R^2 in sample 0.507230025403
alpha -0.5 R^2 outof_sample 0.477825715592 R^2 in sample 0.529048529285
alpha -0.6 R^2 outof_sample 0.49939708919 R^2 in sample 0.546193652985
alpha -0.7 R^2 outof_sample 0.516344871176 R^2 in sample 0.559458194825
alpha -1 R^2 outof_sample 0.546443302259 R^2 in sample 0.581886625725
alpha -2 R^2 outof_sample 0.536505058605 R^2 in sample 0.56518123571
alpha -4 R^2 outof_sample 0.410311022606 R^2 in sample 0.447672809047
alpha -6 R^2 outof_sample 0.300016187918 R^2 in sample 0.347890955952
alpha -9 R^2 outof_sample 0.184904149967 R^2 in sample 0.244589905114


In [23]:
np.mean(outof_sample)

0.18490414996732341

In [36]:

outof_sample_means = []
in_sample = []

model_def = 'mpg ~ horsepower'
# alphas = [0.2,0.1,-0.1,-0.2,-0.3,-0.4,-0.5,-0.6,-0.7,-1,-2,-4,-6,-9]
alphas = [-0.0001,-0.001,-0.01,-0.1,-1,-10,-100,-1000,-10000]

for al in alphas:
    outof_sample = []
    in_sample = []
    for train_index, test_index in kf:
        train_df = df.iloc[train_index]
        y_train, X_train = patsy.dmatrices(model_def, 
                                           data=train_df, return_type='dataframe')
        model = sm.OLS(y_train, X_train)
        results = model.fit_regularized(alpha=al)
        test_df = df.iloc[test_index]
        in_sample.append(r2_score( results.predict(X_train) ,y_train))
        y_test, X_test = patsy.dmatrices(model_def, 
                                         data=test_df, return_type='dataframe')
        y_predicted = results.predict(X_test)
        outof_sample.append(r2_score(y_predicted,y_test))
    outof_sample_means.append(np.mean(outof_sample))
    print 'alpha',al ,'R^2 outof_sample',np.mean(outof_sample),'R^2 in sample', np.mean(in_sample)

alpha -0.0001 R^2 outof_sample 0.258908067973 R^2 in sample 0.349801846402
alpha -0.001 R^2 outof_sample 0.25954707497 R^2 in sample 0.350331138236
alpha -0.01 R^2 outof_sample 0.265870980347 R^2 in sample 0.355568178784
alpha -0.1 R^2 outof_sample 0.322937584136 R^2 in sample 0.402726896436
alpha -1 R^2 outof_sample 0.546443302259 R^2 in sample 0.581886625725
alpha -10 R^2 outof_sample 0.156185155876 R^2 in sample 0.218897791419
alpha -100 R^2 outof_sample -0.184561197167 R^2 in sample -0.0846020852136
alpha -1000 R^2 outof_sample -0.233174282263 R^2 in sample -0.127771729603
alpha -10000 R^2 outof_sample -0.238228692434 R^2 in sample -0.132258909902


In [37]:
np.max(outof_sample_means)

0.5464433022588906

In [38]:
# nate says that normalization was not done
# fix to max
np.max(outof_sample)

-0.0056847625191764894

In [35]:
outof_sample

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]