In [2]:
import pandas as pd
import sklearn as skl
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [3]:
# Load the dataset from the working directory. Later cells read its
# 'season' and 'target' columns and drop 'date', 'won', 'team' and 'team_opp'
# (plus every object-dtype column) before feature selection.
full = pd.read_csv('full.csv')

In [4]:
def backtest(data, model, predictors, start=2, step=1):
    """Walk-forward backtest of ``model``, one season at a time.

    The distinct values of ``data['season']`` are sorted; for every season at
    position ``start``, ``start + step``, ... the model is fitted on all rows
    from strictly earlier seasons and used to predict the rows of that season.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'season' column, a 'target' column and every column
        named in ``predictors``.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        in place on every fold, so after the call it holds the last fold's fit.
    predictors : list
        Column names used as features.
    start : int, default 2
        Position (in the sorted season list) of the first season to predict.
    step : int, default 1
        Stride between predicted seasons.

    Returns
    -------
    pandas.DataFrame
        Columns 'actual' and 'prediction', indexed like the predicted rows of
        ``data``, with the folds stacked in season order.

    Raises
    ------
    ValueError
        If ``start``/``step`` select no season to predict (for example when
        ``data`` has too few distinct seasons).
    """
    seasons = sorted(data['season'].unique())
    fold_positions = range(start, len(seasons), step)

    # Fail early with a message that names the cause; otherwise the final
    # pd.concat would raise an opaque "No objects to concatenate".
    if len(fold_positions) == 0:
        raise ValueError(
            f"no seasons to backtest: start={start}, step={step} selects no "
            f"test season from {len(seasons)} distinct season(s)"
        )

    all_predictions = []

    for i in fold_positions:
        season = seasons[i]

        # Train only on the past so no information from the predicted season
        # (or later ones) reaches the model.
        train = data[data['season'] < season]
        test = data[data['season'] == season]

        model.fit(train[predictors], train['target'])

        # Re-attach the test index so predictions line up with the actuals.
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        combined = pd.concat([test['target'], preds], axis=1)
        combined.columns = ['actual', 'prediction']

        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [5]:
# Ridge-penalised linear classifier; the same instance is reused later for
# the backtest.
rr = RidgeClassifier(alpha=1)

# Positional, non-shuffled folds for scoring candidate feature sets.
# NOTE(review): this only respects time if the rows passed to `sfs.fit` are in
# chronological order -- confirm how full.csv is sorted.
split = TimeSeriesSplit(n_splits=3)

# Forward selection: start from no features and add one at a time until 30
# are chosen, scoring each candidate with the folds above.
sfs = SequentialFeatureSelector(
    estimator=rr,
    n_features_to_select=30,
    direction='forward',
    cv=split,
)

In [6]:
# Columns kept away from the feature selector: every object-dtype column plus
# the season/date keys, the team identifiers and the outcome columns.
object_columns = list(full.columns[full.dtypes == 'object'])
removed_columns = object_columns + ['season', 'date', 'won', 'target', 'team', 'team_opp']

# Candidate features are all remaining columns, in their original order.
is_removed = full.columns.isin(removed_columns)
selected_columns = full.columns[~is_removed]

In [7]:
# Run forward feature selection over every candidate column.
# NOTE(review): selection is fitted on all rows, including the seasons that
# `backtest` later predicts and any rows whose target is 2 (which the
# evaluation cell filters out) -- confirm this is intended.
candidate_features = full[selected_columns]
labels = full['target']
sfs.fit(candidate_features, labels)

In [8]:
# Names of the columns the selector kept.
support_mask = sfs.get_support()
predictors = list(selected_columns[support_mask])

# Season-by-season backtest using only the selected features.
predictions = backtest(full, rr, predictors)

# Drop rows whose actual label is 2 before scoring.
# NOTE(review): 2 looks like a placeholder label rather than a real outcome --
# confirm against the code that builds 'target'.
is_scored = predictions['actual'].ne(2)
predictions = predictions.loc[is_scored]

accuracy_score(predictions['actual'], predictions['prediction'])

0.6329009807023094