**analysis_DST extracts useful features for teams and finds the optimal regularization parameters (similar to analysis_QBWRRBTE).**

In [None]:
from __future__ import print_function

In [3]:
# all of the imports
import pandas as pd
import numpy as np
import pickle 
import patsy
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
% matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.cross_validation import cross_val_score
from sklearn import feature_selection as f_select
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [4]:
# Position group handled by this notebook; it also prefixes the input file name.
pos = 'DST'

# Read the CSV, discard rows with missing values and renumber the rest from 0.
df = pd.read_csv(pos+'_adj2.csv').dropna().reset_index(drop=True)

# Result containers, each keyed by `pos` when filled in further down.
features, lasso, ridge = {}, {}, {}

In [25]:
# Order rows by year and week, then separate the target from the candidate
# predictors (the target and the 'pos', 'year', 'week', 'player' columns are
# not used as features).
df = df.sort_values(['year', 'week'])
y = df['Fantasy Points'].reset_index(drop=True)
X = df.drop(['Fantasy Points', 'pos', 'year', 'week', 'player'], axis=1).reset_index(drop=True)

# Rows with year <= 2015 are used for fitting; later years are held out.
df_train = df[df['year'] <= 2015]
df_test = df[df['year'] > 2015]


def _significant_features(frame, target, threshold=.02):
    """Return the columns of `frame` whose univariate F-test p-value is below `threshold`.

    f_regression tests every column against `target` independently, so one
    call over the whole frame gives the same per-column p-values as calling
    it once per column.
    """
    _, p_values = f_select.f_regression(frame, target)
    return [col for col, p_value in zip(frame.columns, p_values) if p_value < threshold]


# feature selection
est = LinearRegression()

# A fixed seed makes the shuffled folds, and therefore all_scores, repeatable.
kf = KFold(n=len(X), n_folds=5, shuffle=True, random_state=0)
all_scores = []

# KFold is used directly instead of cross_val_score so that the feature filter
# can be re-run on each fold's training rows only.
for train, test in kf:
    train.sort()
    test.sort()
    x_train, x_test = X.iloc[train], X.iloc[test]
    fold_y_train, fold_y_test = y.iloc[train], y.iloc[test]

    fold_cols = _significant_features(x_train, fold_y_train)
    if not fold_cols:
        # LinearRegression cannot be fitted on zero columns; skip this fold.
        continue

    est.fit(x_train[fold_cols], fold_y_train)
    # R^2 of the filtered linear model on the fold's unseen rows.
    all_scores.append(est.score(x_test[fold_cols], fold_y_test))

# The final feature set is chosen from the training years only, so held-out
# rows never influence it and it does not depend on how the folds were drawn.
sig_cols = _significant_features(df_train[X.columns], df_train['Fantasy Points'])
features[pos] = sig_cols

y_train = df_train['Fantasy Points']
X_train = df_train[sig_cols]

y_test = df_test['Fantasy Points']
X_test = df_test[sig_cols]


# LASSO

## Lasso Regularization
# Grid of polynomial degrees and L1 penalty strengths to compare.
degrees = [1, 2, 3, 4, 5]
alphas = [1e-6, 1e-3, 1, 1e6]
len_degree = len(degrees)
len_alpha = len(alphas)

# One MSE per (degree, alpha) pair in degree-major order: entry i belongs to
# degrees[i // len_alpha] and alphas[i % len_alpha].
mse = []
for degree in degrees:
    for alpha_ in alphas:
        est = make_pipeline(PolynomialFeatures(degree), Lasso(alpha=alpha_))
        est.fit(X_train, y_train)
        # Score on the held-out rows: error on the rows the model was fitted
        # to keeps shrinking as the model gets more flexible, so it cannot be
        # used to choose between settings.
        # NOTE(review): the held-out years now act as a validation set; a
        # final, unbiased error estimate would need data not used here.
        mse.append(mean_squared_error(y_test, est.predict(X_test)))


# The best setting is the one with the LOWEST error. The max_* names are kept
# so later cells that read them keep working; they hold the best
# (minimum-MSE) degree and alpha.
max_index = mse.index(min(mse))
max_index2 = [max_index // len_alpha, max_index % len_alpha]
max_degree_lasso = degrees[max_index2[0]]
max_alpha_lasso = alphas[max_index2[1]]

# Stored as [best degree, best alpha, full MSE grid].
lasso[pos] = [max_degree_lasso, max_alpha_lasso, mse]



# Ridge

## Ridge Regularization
# Grid of polynomial degrees and L2 penalty strengths to compare.
degrees = [1, 2, 3, 4, 5, 6]
alphas = [1e-6, 1e-3, 1, 1e3, 1e6]
len_degree = len(degrees)
len_alpha = len(alphas)

# One MSE per (degree, alpha) pair in degree-major order: entry i belongs to
# degrees[i // len_alpha] and alphas[i % len_alpha].
mse = []
for degree in degrees:
    for alpha_ in alphas:
        est = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha_))
        est.fit(X_train, y_train)
        # Score on the held-out rows: error on the rows the model was fitted
        # to keeps shrinking as the model gets more flexible, so it cannot be
        # used to choose between settings.
        # NOTE(review): the held-out years now act as a validation set; a
        # final, unbiased error estimate would need data not used here.
        mse.append(mean_squared_error(y_test, est.predict(X_test)))


# The best setting is the one with the LOWEST error. The max_* names are kept
# so later cells that read them keep working; they hold the best
# (minimum-MSE) degree and alpha.
max_index = mse.index(min(mse))
max_index2 = [max_index // len_alpha, max_index % len_alpha]
max_degree_ridge = degrees[max_index2[0]]
max_alpha_ridge = alphas[max_index2[1]]

# Stored as [best degree, best alpha, full MSE grid].
ridge[pos] = [max_degree_ridge, max_alpha_ridge, mse]

In [4]:
# Number of feature columns stored for DST by the feature-selection step.
len(features['DST'])

12