<font size="6"><center>**Walk-through**</font>

This notebooks attemps to guide newcomer quants in their first steps to become rich

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%run ../nb_config.py

running notebook configuration


In [3]:
import numpy as np
import pandas as pd
import os

# Parameters

In [4]:
start_dt = pd.Timestamp('2015-01-01')
end_dt = pd.Timestamp('2017-12-31')
split_dt = pd.Timestamp('2017-07-31')
tau = 5

In [5]:
ticker = 'AMZN'

# Load Data

`folder structure:`  
|-notebooks/  
|-data/  
|--raw/  
|---wiki_prices.csv

In [6]:
wiki_prices_path = os.path.join("..", "data", "raw", "wiki_prices.csv")  #
wiki_prices = pd.read_csv(wiki_prices_path, parse_dates=['date'], index_col=['ticker', 'date'])

In [7]:
pd.concat([wiki_prices.head(2), wiki_prices.tail(2)], axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,volume,ex-dividend,split_ratio,adj_open,adj_high,adj_low,adj_close,adj_volume
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A,1999-11-18,45.5,50.0,40.0,44.0,44739900.0,0.0,1.0,31.042,34.112,27.2896,30.0186,44739900.0
A,1999-11-19,42.94,43.0,39.81,40.38,10897100.0,0.0,1.0,29.2954,29.3363,27.16,27.5489,10897100.0
ZUMZ,2018-03-26,23.75,24.8,23.7,24.65,375320.0,0.0,1.0,23.75,24.8,23.7,24.65,375320.0
ZUMZ,2018-03-27,24.65,24.65,23.35,23.6,403884.0,0.0,1.0,24.65,24.65,23.35,23.6,403884.0


In [8]:
wiki_prices.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15389314 entries, ('A', Timestamp('1999-11-18 00:00:00')) to ('ZUMZ', Timestamp('2018-03-27 00:00:00'))
Columns: 12 entries, open to adj_volume
dtypes: float64(12)
memory usage: 1.4+ GB


In [9]:
wiki_meta_path = os.path.join("..", "data", "raw", "us_equities_meta_data.csv")  #
wiki_prices_meta = pd.read_csv(wiki_meta_path, index_col='ticker')

wiki_prices_meta.head()

Unnamed: 0_level_0,name,lastsale,marketcap,ipoyear,sector,industry
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PIH,"1347 Property Insurance Holdings, Inc.",7.2001,43090000.0,2014.0,Finance,Property-Casualty Insurers
PIHPP,"1347 Property Insurance Holdings, Inc.",25.62,,,Finance,Property-Casualty Insurers
TURN,180 Degree Capital Corp.,2.26,70330000.0,,Finance,Finance/Investors Services
FLWS,"1-800 FLOWERS.COM, Inc.",12.9,833390000.0,1999.0,Consumer Services,Other Specialty Stores
FCCY,1st Constitution Bancorp (NJ),21.3,178140000.0,,Finance,Savings Institutions


# Data Preparation

In [10]:
# use a rule to filter tickers of interest
wiki_smpl = (wiki_prices
             .reset_index()
             .merge(wiki_prices_meta['sector'], on='ticker', how='left')
             .set_index(['ticker', 'date'])
            )
wiki_smpl['sector'] = wiki_smpl['sector'].fillna('NA')

## Targets

In [11]:
def log_returns(prices: pd.Series, tau: int) -> pd.Series:
    """
    Compute log returns on time series data
    log(a/b) = log(a) - log(b)
    @param prices: time series data
    @param tau: differencing period
    @return: returns
    """
    return np.log(prices).diff(tau)

In [12]:
log_rets = (wiki_smpl
          .groupby('ticker')['adj_close']
          .transform(lambda grp: log_returns(grp, tau=-tau))
          )

In [13]:
targets = -1*log_rets
targets.name = 'target'

## Features

Features are also called **Alpha factors** (See S2 slides) and are computed on past data
Some features are based on returns (like targets, but are backwards returns), other on prices and other features on alternative data (sentiment analysis in financial analysis reports,)

In [14]:
feat_1yr_mom = (wiki_smpl
            .groupby('ticker')['adj_close']
            .pct_change(252)
               )
feat_1yr_mom.name = 'x_mon_1yr'

In [15]:
feat_5d_mom = (wiki_smpl
            .groupby('ticker')['adj_close']
            .apply(lambda grp: log_returns(grp, tau=5))
              )
feat_5d_mom.name = 'x_mon_5d'

In [16]:
feat_close_vs_open = wiki_smpl['adj_close']-wiki_smpl['adj_open']
feat_close_vs_open.name = 'x_close_vs_open'

## Join and Filter

In [17]:
features_and_targets = pd.concat([targets, wiki_smpl['sector'], feat_1yr_mom, feat_5d_mom, feat_close_vs_open], axis=1)

dates = features_and_targets.index.get_level_values('date')
mask_dates = (start_dt <=dates) & (dates <= end_dt)

features_and_targets = (
    features_and_targets
    .loc[mask_dates
         & (features_and_targets['sector'] != 'NA')
    ]
)
# TODO: filtering the entire population is needed in order to:
# get a subset of time
# a sample of stocks that are tradable, many options like highly traded stocks or stocks in SP500 list

In [18]:
features_and_targets.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,target,sector,x_mon_1yr,x_mon_5d,x_close_vs_open
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2015-01-02,0.0007,Capital Goods,0.0079,-0.0115,-0.6028
A,2015-01-05,0.0078,Capital Goods,-0.0234,-0.0365,-0.5056


# Feature Enginering

In [19]:
features_and_targets.corr('spearman')

Unnamed: 0,target,x_mon_1yr,x_mon_5d,x_close_vs_open
target,1.0,0.0009,-0.0231,-0.0135
x_mon_1yr,0.0009,1.0,0.1242,0.0457
x_mon_5d,-0.0231,0.1242,1.0,0.3232
x_close_vs_open,-0.0135,0.0457,0.3232,1.0


In [20]:
features_and_targets['x_mon_1yr_dm'] = (
    features_and_targets
    .groupby(['sector', 'date'])['x_mon_1yr']
    .transform(lambda grp: grp-grp.mean())
) 


In [21]:
features_and_targets[['x_mon_1yr_dm', 'target']].corr('spearman')

Unnamed: 0,x_mon_1yr_dm,target
x_mon_1yr_dm,1.0,0.0138
target,0.0138,1.0


In [22]:
# you can use other metrics like accuracy,
# or translate it to weights and then compute a pnl

pd.crosstab(
    index=features_and_targets['x_mon_1yr_dm']>=0,
    columns=features_and_targets['target']>=0,
    normalize=True
)

target,False,True
x_mon_1yr_dm,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.2513,0.2756
True,0.2182,0.2549


In [23]:
# * is eval metric consistent over time?
# * is eval metric consistent over tickers
# Preliminary stage of feature building and refinement can be done in train
# if there is no future data involved on calculations/selections
# final feature selection and feature engineering techniques,
# should be done only on train.
# In this case, sector neautralization (demeaning) can be considered part of the basic feature computation,
# comparion is only shown for demostrative purposes