Some pipeline augmentation tricks from https://www.quantopian.com/posts/building-the-foundations-for-hypothesis-testing

In [1]:
import numpy as np
import pandas as pd

from quantopian.pipeline import CustomFactor, Pipeline
from quantopian.research import run_pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import AverageDollarVolume
from quantopian.pipeline.factors import Returns
from quantopian.pipeline.filters.morningstar import Q500US
from quantopian.pipeline.classifiers.morningstar import Sector

from quantopian.pipeline.data.zacks import EarningsSurprises
from quantopian.pipeline.factors.eventvestor import (
BusinessDaysUntilNextEarnings,
BusinessDaysSincePreviousEarnings
)
from quantopian.pipeline.data.eventvestor import EarningsCalendar

from datetime import timedelta

import bisect

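Split the date range into smaller segments so that no single pipeline run goes over the memory limits; the per-segment results are concatenated, so this should not affect the final output.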

In [2]:
def split_run_pipeline(pipeline, start, end, segments):
    '''
    Run ``pipeline`` between ``start`` and ``end`` as ``segments`` separate
    ``run_pipeline`` calls and concatenate the results.

    Splitting keeps each individual run small enough to stay under the
    research environment's memory limits.

    pipeline -- the Pipeline object handed to ``run_pipeline``
    start    -- first boundary (the arithmetic below needs a pd.Timestamp)
    end      -- last boundary (pd.Timestamp)
    segments -- number of consecutive pieces to run

    Returns the ``pd.concat`` of the per-segment results, in segment order.
    '''
    step = (end - start) / segments
    dts = np.arange(start, end, step)
    # np.arange leaves out the stop value, so add `end` back as the closing
    # boundary when exactly `segments` starting points were produced.
    if len(dts) == segments:
        dts = np.append(dts, end)

    # Every segment stops one day short of the next boundary so neighbouring
    # segments do not request the same day twice.
    # NOTE(review): this also applies to the last segment, so the final run
    # stops one day before `end` -- confirm that excluding `end` is intended.
    one_day = pd.Timedelta('1d')
    pieces = []
    for i in range(segments):
        pieces.append(run_pipeline(pipeline, dts[i], dts[i + 1] - one_day))
    return pd.concat(pieces)

In [3]:
def create_positions_pipeline():
    '''
    Build the pipeline that describes each candidate position.

    Columns:
      'LagEaSurp'             -- latest EarningsSurprises.eps_pct_diff_surp
      'Current Price'         -- latest USEquityPricing.open
      'Earnings Announcement' -- latest EarningsCalendar.next_announcement
      'Average Dollar Volume' -- AverageDollarVolume over a 30-bar window
      'Sector'                -- Sector classifier

    Screen: Q500US members whose BusinessDaysUntilNextEarnings value equals 1.
    '''
    # Filter: the next-earnings countdown factor reads exactly 1.
    announces_next = BusinessDaysUntilNextEarnings().eq(1)

    columns = {
        'LagEaSurp': EarningsSurprises.eps_pct_diff_surp.latest,
        'Current Price': USEquityPricing.open.latest,
        'Earnings Announcement': EarningsCalendar.next_announcement.latest,
        'Average Dollar Volume': AverageDollarVolume(window_length=30),
        'Sector': Sector(),
    }
    return Pipeline(columns=columns, screen=announces_next & Q500US())

In [4]:
# Date window handed to the pipeline runs below.
# NOTE(review): the literals are month-first ('MM-DD-YYYY'), i.e. presumably
# 1 Jan 2013 and 1 Feb 2013; ISO 'YYYY-MM-DD' strings would be unambiguous.
START = pd.Timestamp('01-01-2013')
END = pd.Timestamp('02-01-2013')
# Number of date segments passed to split_run_pipeline.
SPLITS = 2

In [5]:
# Run the positions pipeline in SPLITS segments and keep only the rows in
# which every column has a value.
positions_data = split_run_pipeline(
    create_positions_pipeline(), START, END, SPLITS
).dropna()

In [6]:
# Preview the first rows of the positions frame.
positions_data.head()

Unnamed: 0,Unnamed: 1,Average Dollar Volume,Current Price,Earnings Announcement,LagEaSurp,Sector
2013-01-03 00:00:00+00:00,Equity(41462 [MOS]),131366200.0,58.06,2013-01-04,-12.17,101
2013-01-07 00:00:00+00:00,Equity(22140 [MON]),183930600.0,95.56,2013-01-08,0.0,101
2013-01-08 00:00:00+00:00,Equity(24829 [APOL]),38902500.0,22.04,2013-01-08,6.12,205
2013-01-08 00:00:00+00:00,Equity(24873 [STZ]),41649290.0,36.52,2013-01-09,31.48,205
2013-01-10 00:00:00+00:00,Equity(8151 [WFC]),709165700.0,34.847,2013-01-11,1.15,103


Create another pipeline with 3-day backward-looking returns; note the definition of `had_earnings`.

In [7]:
def create_returns_pipeline():
    '''
    Build the pipeline that reports returns shortly after an earnings date.

    Columns:
      'Returns 2 days after'  -- Returns factor on USEquityPricing.open with
                                 window_length=3, masked to Q500US
      'Earnings Announcement' -- latest EarningsCalendar.previous_announcement

    Screen: Q500US members whose BusinessDaysSincePreviousEarnings value
    equals 2.
    '''
    open_returns = Returns(
        mask=Q500US(),
        inputs=[USEquityPricing.open],
        window_length=3,
    )
    # Filter: the days-since-previous-earnings factor reads exactly 2.
    two_days_since = BusinessDaysSincePreviousEarnings().eq(2)

    columns = {
        'Returns 2 days after': open_returns,
        'Earnings Announcement': EarningsCalendar.previous_announcement.latest,
    }
    return Pipeline(columns=columns, screen=two_days_since & Q500US())

In [8]:
# Run the returns pipeline over the same window and segment count as the
# positions pipeline, then preview the first three rows.
returns_data = split_run_pipeline(create_returns_pipeline(), START, END, SPLITS)
returns_data.head(3)

Unnamed: 0,Unnamed: 1,Earnings Announcement,Returns 2 days after
2013-01-07 00:00:00+00:00,Equity(2760 [FDO]),2013-01-03,-0.123567
2013-01-08 00:00:00+00:00,Equity(41462 [MOS]),2013-01-04,0.010936
2013-01-10 00:00:00+00:00,Equity(22140 [MON]),2013-01-08,0.024255


Adjust the columns of the pipeline output.

In [9]:
# Turn the two unnamed index levels into ordinary columns and give them
# readable names: level_0 is the pipeline date, level_1 the asset.
positions_data = positions_data.reset_index().rename(
    columns={'level_0': 'Current Day', 'level_1': 'Equity'}
)
positions_data.head()

Unnamed: 0,Current Day,Equity,Average Dollar Volume,Current Price,Earnings Announcement,LagEaSurp,Sector
0,2013-01-03 00:00:00+00:00,Equity(41462 [MOS]),131366200.0,58.06,2013-01-04,-12.17,101
1,2013-01-07 00:00:00+00:00,Equity(22140 [MON]),183930600.0,95.56,2013-01-08,0.0,101
2,2013-01-08 00:00:00+00:00,Equity(24829 [APOL]),38902500.0,22.04,2013-01-08,6.12,205
3,2013-01-08 00:00:00+00:00,Equity(24873 [STZ]),41649290.0,36.52,2013-01-09,31.48,205
4,2013-01-10 00:00:00+00:00,Equity(8151 [WFC]),709165700.0,34.847,2013-01-11,1.15,103


In [10]:
# Key the positions by (announcement date, asset) so they can be joined with
# the returns frame on the same pair.
positions_data = positions_data.set_index(['Earnings Announcement', 'Equity'])
positions_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Current Day,Average Dollar Volume,Current Price,LagEaSurp,Sector
Earnings Announcement,Equity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-01-04,Equity(41462 [MOS]),2013-01-03 00:00:00+00:00,131366200.0,58.06,-12.17,101
2013-01-08,Equity(22140 [MON]),2013-01-07 00:00:00+00:00,183930600.0,95.56,0.0,101
2013-01-08,Equity(24829 [APOL]),2013-01-08 00:00:00+00:00,38902500.0,22.04,6.12,205
2013-01-09,Equity(24873 [STZ]),2013-01-08 00:00:00+00:00,41649290.0,36.52,31.48,205
2013-01-11,Equity(8151 [WFC]),2013-01-10 00:00:00+00:00,709165700.0,34.847,1.15,103


In [11]:
# Re-key the returns by (announcement date, asset), the same key used for
# the positions frame.
returns_data = (
    returns_data
    .reset_index()
    # We don't care about the date used as the previous index
    .drop('level_0', axis=1)
    .rename(columns={'level_1': 'Equity'})
    .set_index(['Earnings Announcement', 'Equity'])
)
returns_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Returns 2 days after
Earnings Announcement,Equity,Unnamed: 2_level_1
2013-01-03,Equity(2760 [FDO]),-0.123567
2013-01-04,Equity(41462 [MOS]),0.010936
2013-01-08,Equity(22140 [MON]),0.024255
2013-01-08,Equity(2 [ARNC]),-0.004367
2013-01-08,Equity(24829 [APOL]),-0.09555


Now, by combining these two data frames, we get pipeline output that includes a 2-day look-ahead return.

In [12]:
# Attach the position columns to each returns row via the shared
# (announcement date, asset) index, then drop rows left with missing values
# (for example returns rows that found no matching position).
stock_data = returns_data.join(positions_data).dropna()
stock_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Returns 2 days after,Current Day,Average Dollar Volume,Current Price,LagEaSurp,Sector
Earnings Announcement,Equity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-04,Equity(41462 [MOS]),0.010936,2013-01-03 00:00:00+00:00,131366200.0,58.06,-12.17,101.0
2013-01-08,Equity(22140 [MON]),0.024255,2013-01-07 00:00:00+00:00,183930600.0,95.56,0.0,101.0
2013-01-08,Equity(24829 [APOL]),-0.09555,2013-01-08 00:00:00+00:00,38902500.0,22.04,6.12,205.0
2013-01-09,Equity(24873 [STZ]),-0.001649,2013-01-08 00:00:00+00:00,41649290.0,36.52,31.48,205.0
2013-01-11,Equity(8151 [WFC]),-0.001142,2013-01-10 00:00:00+00:00,709165700.0,34.847,1.15,103.0


In [13]:
def create_deciles_pipeline():
    '''
    Build the pipeline that ranks earnings surprises into deciles.

    Columns:
      'LagESurp Decile'            -- deciles of the surprise, computed over
                                      the Q500US mask
      'LagESurp'                   -- latest EarningsSurprises.eps_pct_diff_surp
      'Earnings Announcement Date' -- latest EarningsSurprises.asof_date

    Screen: Q500US members.
    '''
    universe = Q500US()
    surprise = EarningsSurprises.eps_pct_diff_surp.latest

    columns = {
        'LagESurp Decile': surprise.deciles(mask=universe),
        'LagESurp': surprise,
        'Earnings Announcement Date': EarningsSurprises.asof_date.latest,
    }
    return Pipeline(columns=columns, screen=universe)

Run the pipeline at the beginning of each quarter.

In [14]:
def run_pipeline_freq(start, end, pipeline, freq='QS'):
    '''
    Run ``pipeline`` for a single day at every ``freq`` boundary between
    ``start`` and ``end`` and concatenate the results.

    start    -- first date of interest (anything pd.Timestamp accepts)
    end      -- last date of interest
    pipeline -- the Pipeline object handed to ``run_pipeline``
    freq     -- pandas frequency string or offset that defines the schedule;
                the default 'QS' runs at the start of each quarter

    When ``start`` does not itself fall on a boundary, the boundary of the
    period that contains it is added as the first run date, so the period
    ``start`` belongs to is covered as well.

    Returns the ``pd.concat`` of the one-day pipeline outputs, in date order.
    '''
    start = pd.Timestamp(start)
    offset = pd.tseries.frequencies.to_offset(freq)
    run_dates = pd.date_range(start, end, freq=offset).tolist()

    # Roll back with the same offset that generated the schedule. The earlier
    # `start - QuarterOffset()` used a bare base-class offset, which does not
    # land on the start of the period containing `start`.
    period_start = offset.rollback(start)
    if period_start not in run_dates:
        run_dates.insert(0, period_start)

    # One single-day run per scheduled date.
    return pd.concat([run_pipeline(pipeline, day, day) for day in run_dates])

The output of this pipeline is the decile of the 'LagESurp' factor measured once for the whole quarter (by default, a pipeline computes deciles separately for each day).

In [15]:
# One deciles snapshot per scheduled run date; rows with any missing value
# are discarded before use.
decile_data = run_pipeline_freq(START, END, create_deciles_pipeline()).dropna()
decile_data.head()

Unnamed: 0,Unnamed: 1,Earnings Announcement Date,LagESurp,LagESurp Decile
2013-01-02 00:00:00+00:00,Equity(24 [AAPL]),2012-10-26,-2.03,2
2013-01-02 00:00:00+00:00,Equity(67 [ADSK]),2012-11-16,-9.68,0
2013-01-02 00:00:00+00:00,Equity(114 [ADBE]),2012-12-14,6.52,6
2013-01-02 00:00:00+00:00,Equity(122 [ADI]),2012-11-28,1.75,4
2013-01-02 00:00:00+00:00,Equity(128 [ADM]),2012-10-24,13.64,8


In [16]:
def get_quarter(date):
    '''
    Return the quarter label for ``date``, formatted like 'Q1_2013'.

    date -- anything ``pd.Timestamp`` can parse (string, datetime, Timestamp)
    '''
    stamp = pd.Timestamp(date)
    label_parts = (str(stamp.quarter), str(stamp.year))
    return "Q{}_{}".format(*label_parts)

def compute_deciles(decile_data):
    '''
    Derive the decile cutoffs for every run date in ``decile_data``.

    decile_data -- frame with 'LagESurp' and 'LagESurp Decile' columns whose
                   first (unnamed) index level is the run date; after
                   ``reset_index`` that level appears as 'level_0'

    Returns a dict that maps the quarter label of each run date (see
    ``get_quarter``) to a list of nine values: the largest 'LagESurp' found
    in decile 0, 1, ..., 8 on that date. Nine upper bounds separate ten
    buckets, so the top decile needs no cutoff of its own.

    NOTE(review): a decile with no rows on a date contributes NaN to the
    list, and two run dates in the same quarter overwrite each other.
    '''
    flat = decile_data.reset_index()
    cutoffs_by_quarter = {}

    for run_date in flat['level_0'].unique():
        on_date = flat[flat['level_0'] == run_date]
        surprise = on_date['LagESurp']
        bucket = on_date['LagESurp Decile']
        # Upper bound of each of the first nine deciles.
        cutoffs_by_quarter[get_quarter(run_date)] = [
            surprise[bucket == number].max() for number in range(0, 9)
        ]

    return cutoffs_by_quarter

Compute the decile cutoffs.

In [17]:
# Per-quarter decile cutoffs, keyed by quarter label such as 'Q1_2013'.
deciles = compute_deciles(decile_data)
# Parenthesised call: prints the same text under Python 2 and is also valid
# Python 3 syntax, unlike the bare `print x` statement.
print(deciles['Q1_2013'])

[-9.5199999999999996, -2.6000000000000001, 0.0, 0.97999999999999998, 2.3500000000000001, 4.0800000000000001, 7.1399999999999997, 13.33, 21.100000000000001]


Determine the decile of each equity in `stock_data` using the cutoffs.

In [18]:
# add quarter column
stock_data['Quarter'] = [get_quarter(x) for x in stock_data['Current Day']]

# add decile for each stock in our dataframe
# The decile is the position of the surprise among that quarter's cutoffs.
# The whole column is assigned at once rather than via DataFrame.set_value
# inside iterrows: set_value is deprecated and later removed from pandas, and
# mutating a frame while iterating over it is fragile.
# float() keeps the column dtype the row-wise version produced (0.0, 2.0, ...).
stock_data['Decile'] = [
    float(bisect.bisect_left(deciles[quarter], surprise))
    for quarter, surprise in zip(stock_data['Quarter'], stock_data['LagEaSurp'])
]

In [19]:
# Move the (Earnings Announcement, Equity) index back into ordinary columns
# and preview the final frame.
stock_data = stock_data.reset_index()
stock_data.head()

Unnamed: 0,Earnings Announcement,Equity,Returns 2 days after,Current Day,Average Dollar Volume,Current Price,LagEaSurp,Sector,Quarter,Decile
0,2013-01-04,Equity(41462 [MOS]),0.010936,2013-01-03 00:00:00+00:00,131366200.0,58.06,-12.17,101.0,Q1_2013,0.0
1,2013-01-08,Equity(22140 [MON]),0.024255,2013-01-07 00:00:00+00:00,183930600.0,95.56,0.0,101.0,Q1_2013,2.0
2,2013-01-08,Equity(24829 [APOL]),-0.09555,2013-01-08 00:00:00+00:00,38902500.0,22.04,6.12,205.0,Q1_2013,6.0
3,2013-01-09,Equity(24873 [STZ]),-0.001649,2013-01-08 00:00:00+00:00,41649290.0,36.52,31.48,205.0,Q1_2013,9.0
4,2013-01-11,Equity(8151 [WFC]),-0.001142,2013-01-10 00:00:00+00:00,709165700.0,34.847,1.15,103.0,Q1_2013,4.0
