In [9]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

In [10]:
# Keep wide DataFrames on a single row block instead of wrapping columns.
pd.options.display.expand_frame_repr = False
# Seed NumPy's global random state.
np.random.seed(42)

# Root folder of the HDF5 stores used below.
# NOTE(review): absolute, machine-specific path -- adjust for your setup.
DATA_DIR = Path('/Volumes/My Book/Data_Analysis')

In [11]:
def get_backtest_data(predictions='lasso/predictions'):
    """Combine stored regression predictions with adjusted Quandl Wiki prices.

    Parameters
    ----------
    predictions : str, default 'lasso/predictions'
        Key of the predictions frame inside ``DATA_DIR / 'Data/data.h5'``.
        The frame must provide the columns ``alpha``, ``actuals`` and
        ``predicted``; its two index levels are relabelled
        ``('ticker', 'date')`` (assumes they are stored in that order --
        TODO confirm against the notebook that wrote them).

    Returns
    -------
    pd.DataFrame
        The ``predicted`` column for the best ``alpha`` right-joined onto the
        selected price rows: every selected price row is kept, and
        ``predicted`` is NaN where no prediction exists for that row.

    Notes
    -----
    Prints ``store.info()`` for the predictions store as a side effect.
    """
    with pd.HDFStore(DATA_DIR / 'Nasdaq/assets.h5') as store:
        # Keep only columns whose name contains 'adj', drop the 'adj_' prefix
        # and swap the two index levels (presumably to (ticker, date) so the
        # index lines up with the predictions -- verify against the store).
        prices = (store['quandl/wiki/prices']
                  .filter(like='adj')
                  .rename(columns=lambda x: x.replace('adj_', ''))
                  .swaplevel(axis=0))

    with pd.HDFStore(DATA_DIR / 'Data/data.h5') as store:
        print(store.info())
        # Bind the loaded frame to its own name instead of overwriting the
        # `predictions` key string passed in by the caller.
        all_predictions = store[predictions]

    # Rank correlation between actual and predicted values for every alpha;
    # the alpha with the highest correlation is used for the backtest.
    ic_by_alpha = all_predictions.groupby('alpha').apply(
        lambda x: spearmanr(x.actuals, x.predicted)[0])
    best_alpha = ic_by_alpha.idxmax()

    # Filter to the best alpha once and keep only the prediction column.
    best = all_predictions.loc[all_predictions.alpha == best_alpha, ['predicted']]
    best.index.names = ['ticker', 'date']

    # Restrict prices to the tickers and date span covered by the predictions;
    # the upper bound is pushed out by one day offset.
    tickers = best.index.get_level_values('ticker').unique()
    dates = best.index.get_level_values('date')
    start = dates.min().strftime('%Y-%m-%d')
    stop = (dates.max() + pd.DateOffset(1)).strftime('%Y-%m-%d')

    idx = pd.IndexSlice
    # Sort before slicing so the label-based MultiIndex slice is well defined.
    prices = prices.sort_index().loc[idx[tickers, start:stop], :]
    return best.join(prices, how='right')

In [12]:
# Assemble the backtest dataset from the lasso predictions.
df = get_backtest_data('lasso/predictions')
# DataFrame.info() writes its summary to stdout and returns None, so call it
# directly; wrapping it in print() would append a stray "None" line.
df.info()
# Persist the result; `key` is passed by keyword for forward compatibility.
df.to_hdf(DATA_DIR / 'Data/backtest.h5', key='data')

<class 'pandas.io.pytables.HDFStore'>
File path: /Volumes/My Book/Data_Analysis/Data/data.h5
/lasso/coeffs                    frame        (shape->[8,33])      
/lasso/predictions               frame        (shape->[592240,3])  
/lasso/scores                    frame        (shape->[6000,3])    
/logistic/coeffs                 frame        (shape->[11,33])     
/logistic/predictions            frame        (shape->[814330,4])  
/logistic/scores                 frame        (shape->[825,5])     
/lr/predictions                  frame        (shape->[74030,2])   
/lr/scores                       frame        (shape->[750,2])     
/model_data                      frame        (shape->[2168898,69])
/ridge/coeffs                    frame        (shape->[18,33])     
/ridge/predictions               frame        (shape->[1332540,3]) 
/ridge/scores                    frame        (shape->[13500,3])   
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 185483 entries, ('AAPL', Timestamp('2014-