In [None]:
from itertools import combinations, accumulate
from functools import partial
from multiprocessing import Pool
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import optimal_ou_threshold, ou_fit, bootstrapped_quantile
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Cost level forwarded as `cost` to `optimal_ou_threshold` (through `get_desc`).
COST = 0.01
# Fraction of the candidate pairs used to derive the default `n_fd` in the selection step.
PCT = 0.01
# Level passed as `alpha` to `bootstrapped_quantile`.
ALPHA = 0.05

# Get universe (`univ`)

In [None]:
# Load the raw price panel from the local parquet file.
prices = pd.read_parquet('prices_yf.parquet')
# Columns whose share of missing observations exceeds THRES are listed below.
THRES = 0.8
nulls = prices.isna().mean()
stocks_w_nans = nulls.loc[nulls.gt(THRES)].sort_values()
stocks_w_nans

In [None]:
# Columns that were not flagged above as having too many missing values.
cols = prices.columns
stocks = cols[~cols.isin(stocks_w_nans.index)]
# The cleaning steps below are commented out, so this reassignment currently
# leaves `prices` unchanged.
prices = (
    prices
#     .loc[:, stocks]
#     .ffill(limit=2)
#     .dropna()
)

In [None]:
# Long table holding, for every (date, id) entry of `prices`, the price and its
# one-step percentage change; missing values are kept (dropna=False).
univ_stacked = pd.concat([
    prices.stack(dropna=False).reset_index(),
    prices.pct_change().stack(dropna=False).reset_index()[0]  # column 0; not clean (dividends, splits, ...)
], axis=1)
univ_stacked.columns = ['date', 'id', 'price', 'chg']  # price is mid_price
CUT = '2007'  # '2006-09-19'
# NOTE(review): CUT is interpolated unquoted, so the query text is `date >= 2007`;
# confirm pandas evaluates this as a date cut-off (and that a full date such as
# the one in the comment above would still parse).
univ_stacked = univ_stacked.query(f'date >= {CUT}')

In [None]:
# Wide panel: rows are dates, columns are (field, id) with fields 'price' and 'chg'.
univ = univ_stacked.pivot(index='date', columns='id')
univ

# Cointegration coefficients and spreads

In [None]:
def compute_coint_coeff(pair, univ):
    """Return the spread weights of ``pair`` from the first row of prices.

    Parameters
    ----------
    pair : sequence of two column labels of ``univ['price']``.
    univ : DataFrame exposing a ``'price'`` block of columns.

    Returns
    -------
    Series indexed by the two labels: ``1 / first price`` for the first leg
    and ``-1 / first price`` for the second leg.
    """
    first_prices = univ['price'].loc[:, pair].iloc[0]
    weights = 1 / first_prices
    # The second leg enters the spread with the opposite sign.
    weights.loc[pair[1]] = -weights.loc[pair[1]]
    return weights

# Example on a single pair of ids.
pair = ('dis', 'pg')
coeff = compute_coint_coeff(pair, univ)
coeff

In [None]:
def form_spread(pair, coeff, prices):
    """Weighted sum of the two price columns of ``pair``.

    Parameters
    ----------
    pair : labels of the two columns of ``prices`` to combine.
    coeff : Series of weights, aligned on the column labels.
    prices : DataFrame of prices, one column per id.

    Returns
    -------
    Series indexed like ``prices`` holding the row-wise weighted sum.
    """
    legs = prices.loc[:, pair]
    weighted = legs.mul(coeff, axis='columns')
    return weighted.sum(axis='columns')

# Spread of the example pair over the whole sample; the cell displays its mean.
spread = form_spread(pair, coeff, univ['price'])
spread.plot(grid=True)
spread.mean()

In [None]:
def get_spreads(univ):
    """Build coefficients and spreads for every 2-combination of price columns.

    Returns
    -------
    pairs : list of 2-tuples of column labels, in ``combinations`` order.
    coeffs : dict mapping each pair to its ``compute_coint_coeff`` result.
    spreads : DataFrame with one column per pair, from ``form_spread``.
    """
    pairs = list(combinations(univ['price'].columns, 2))
    coeffs = {}
    spread_by_pair = {}
    for candidate in pairs:
        weights = compute_coint_coeff(candidate, univ)
        coeffs[candidate] = weights
        spread_by_pair[candidate] = form_spread(candidate, weights, univ['price'])
    return pairs, coeffs, pd.DataFrame(spread_by_pair)

# All candidate pairs of the universe with their weights and spreads.
pairs, coeffs, spreads = get_spreads(univ)

# Trading rule

In [None]:
def trading_rule(cur_pos_spread, st_spread, thres=1):
    """One step of the threshold rule on a standardised spread.

    Returns +1 when ``st_spread`` is strictly below ``-thres``, -1 when it is
    strictly above ``+thres``, and otherwise the current position unchanged
    (this includes the boundaries and any value for which both comparisons
    are false, such as NaN).
    """
    if st_spread < -thres:
        return 1
    if st_spread > thres:
        return -1
    # Inside the band [-thres, +thres]: keep whatever is currently held.
    return cur_pos_spread

# Toy check of the rule on a sine wave with amplitude 2.
xxx = np.linspace(0, 3, 100)
yyy = 2 * np.sin(3 * xxx)

# `initial=0` starts flat; dropping the last element keeps the result the same
# length as the input.
pos_spread = accumulate(yyy, trading_rule, initial=0)
pos_spread = list(pos_spread)[:-1]
plt.plot(xxx, pos_spread)
plt.plot(xxx, yyy)
plt.grid(True)

In [None]:
def trade_series(st_spread, trade_rule, **kwargs):
    """Run ``trade_rule`` sequentially over a standardised spread.

    Parameters
    ----------
    st_spread : Series of standardised spread values.
    trade_rule : callable ``(current_position, value, **kwargs) -> position``.
    **kwargs : extra keyword arguments bound to ``trade_rule``.

    Returns
    -------
    Series indexed like ``st_spread``. The first position is 0 and the position
    at each later index is the rule's output after the previous value, so
    positions lag the spread by one step.
    """
    step = partial(trade_rule, **kwargs)
    path = list(accumulate(st_spread.values, step, initial=0))
    # Drop the position decided on the last observation: nothing is left to hold it over.
    path.pop()
    return pd.Series(path, index=st_spread.index)

# Same toy check as before, this time through `trade_series` on a Series.
xxx = np.linspace(0, 3, 100)
yyy = pd.Series(2 * np.sin(3 * xxx))

pos_spread = trade_series(yyy, trading_rule, thres=1)
plt.plot(xxx, pos_spread)
plt.plot(xxx, yyy)
plt.grid(True)

In [None]:
def trade(univ_out, top_pairs, descs, trade_rule, **kwargs):
    """Standardise the spreads of ``top_pairs`` on ``univ_out`` and trade them.

    Parameters
    ----------
    univ_out : panel with a ``'price'`` block, the window to trade on.
    top_pairs : iterable of pairs to trade.
    descs : DataFrame with rows ``'mean'`` and ``'std'`` and one column per pair,
        used to centre and scale each spread.
    trade_rule : rule handed to ``trade_series``.
    **kwargs : forwarded to ``trade_rule``.

    Returns
    -------
    coeffs : dict pair -> weights computed on ``univ_out``.
    spreads : DataFrame of raw spreads, one column per pair.
    pos_spreads : DataFrame of positions in spread units, one column per pair.
    """
    coeffs = {}
    raw = {}
    for pair in top_pairs:
        # Not forward looking: the weights only use the first price of the window.
        coeffs[pair] = compute_coint_coeff(pair, univ_out)
        raw[pair] = form_spread(pair, coeffs[pair], univ_out['price'])
    spreads = pd.DataFrame(raw)
    center = descs.loc['mean', :]
    scale = descs.loc['std', :]
    st_spreads = (spreads - center) / scale
    run_rule = partial(trade_series, trade_rule=trade_rule, **kwargs)
    pos_spreads = st_spreads.apply(run_rule)
    return coeffs, spreads, pos_spreads

In [None]:
def pos_spread_to_pos(pos_spread, coeff):
    """Expand a position in spread units into per-leg positions.

    Parameters
    ----------
    pos_spread : Series of positions in spread units.
    coeff : Series of leg weights.

    Returns
    -------
    DataFrame indexed like ``pos_spread`` with one column per entry of
    ``coeff``; each cell is the spread position times the leg weight.
    """
    per_row = pos_spread.values[:, None]
    per_leg = coeff.values[None, :]
    return pd.DataFrame(per_row * per_leg, index=pos_spread.index, columns=coeff.index)

# Build the demo positions from the example pair's own spread so that they share
# the date index of `chgs`. The `pos_spread` left over from the sine-wave demo
# is indexed 0..99, so multiplying it with date-indexed changes aligned on
# nothing. Standardising with the full-sample mean/std is for illustration only.
pair_spread = form_spread(pair, coeffs[pair], univ['price'])
st_spread = (pair_spread - pair_spread.mean()) / pair_spread.std()
pos_spread = trade_series(st_spread, trading_rule, thres=1)
pos = pos_spread_to_pos(pos_spread, coeffs[pair])
# `shift(-1)` pairs the position held at a date with the following row's change.
chgs = univ['chg'].loc[:, pair].shift(-1)
pos.mul(chgs).sum(axis=1)

In [None]:
def net_positions(pos, cols=('date', 'id', 'pos')):
    """Net per-pair leg positions into one position per (date, id).

    Parameters
    ----------
    pos : dict mapping a pair to its DataFrame of leg positions. The frames'
        row index and column index are expected to be named ``cols[0]`` and
        ``cols[1]``, since the grouping is done on those names.
    cols : names used for the date level, the id level and the summed value.

    Returns
    -------
    DataFrame of summed positions (rows ``cols[0]``, columns ``cols[1]``), or
    ``None`` when ``pos`` is empty.
    """
    frames = pos.values()
    if not frames:
        return None
    date_key, id_key, value_key = cols[0], cols[1], cols[2]
    stacked = pd.concat(frames, axis=1).stack()
    # An id traded through several pairs appears several times per date: sum them.
    netted = stacked.groupby([date_key, id_key]).sum().rename(value_key)
    return netted.reset_index().pivot(index=date_key, columns=id_key, values=value_key)

# End-to-end example on the full sample: standardise each spread with its own
# sample mean/std (rows of `describe()`), trade it, then net the leg positions.
pairs, _, spreads = get_spreads(univ)
descs = spreads.describe()
# for the time being: describe() --> .apply(partial(get_desc, cost=COST))
coeffs, spreads, pos_spreads = trade(
    univ, pairs, descs, trading_rule, thres=1)
pos = {pair: pos_spread_to_pos(pos_spreads.loc[:, pair], coeffs[pair]) for pair in pairs}
net_pos = net_positions(pos)
net_pos

In [None]:
def rets_pairs(pairs, coeffs, spreads, univ):
    """Per-pair return series, one column per pair.

    Parameters
    ----------
    pairs : iterable of pairs to evaluate.
    coeffs : dict pair -> leg weights.
    spreads : DataFrame with one column per pair. Its values are used as the
        position in spread units (they are handed to ``pos_spread_to_pos``).
    univ : panel with a ``'chg'`` block of price changes.

    Returns
    -------
    DataFrame whose column for a pair is the row-wise sum of leg positions
    times the next row's change of each leg.
    """
    columns = []
    for pair in pairs:
        holdings = pos_spread_to_pos(spreads.loc[:, pair], coeffs[pair])
        # Align each date's holdings with the change observed on the next row.
        next_chg = univ['chg'].loc[:, pair].shift(-1)
        columns.append(holdings.mul(next_chg).sum(axis=1).rename(pair))
    return pd.concat(columns, axis=1)

# Display the per-pair series; note the raw `spreads` are what is passed as positions here.
rets_pairs(pairs, coeffs, spreads, univ)

# Fit OU

In [None]:
def get_desc(spread, cost):
    """Summarise a spread with an OU fit, in the ``'mean'`` / ``'std'`` layout used by ``trade``.

    Parameters
    ----------
    spread : Series of spread values.
    cost : forwarded as ``cost`` to ``optimal_ou_threshold``.

    Returns
    -------
    Series with ``'mean'`` (the fitted mean) and ``'std'`` (the value returned
    by ``optimal_ou_threshold`` for the remaining fitted parameters).
    """
    fitted = ou_fit(spread.values)
    # The mean is reported separately; every other fitted parameter feeds the threshold.
    center = fitted.pop('mean')
    band = optimal_ou_threshold(**fitted, cost=cost)
    return pd.Series({'mean': center, 'std': band})

In [None]:
# Single-pair check of the OU-based standardisation.
pair = pairs[5]
spread = spreads.loc[:, pair]
desc = get_desc(spread, COST)
# 'mean' centres the spread; 'std' (the value from optimal_ou_threshold) scales it.
st_spread = (spread - desc.loc['mean']) / desc.loc['std']
st_spread.plot(grid=True)
print((spread - desc.loc['mean']).mean(), st_spread.mean())
pos_spread = trade_series(st_spread, trading_rule, thres=1)
# Rescale the -1/0/+1 positions to the spread's range so both curves are visible on one plot.
pos_spread.mul(st_spread.abs().max()).plot(grid=True)

# Selection: Multiple Hypothesis Testing

In [None]:
%%time
# Step-by-step version of the selection on the full sample.
pairs, coeffs, spreads = get_spreads(univ)
rets = rets_pairs(pairs, coeffs, spreads, univ)
# n_fd is PCT of the number of pairs, rounded up.
n_fd = int(np.ceil(PCT * rets.shape[1]))
thres = bootstrapped_quantile(rets, n_fd, ALPHA)
# Keep the pairs whose mean/std ratio reaches the bootstrapped threshold.
selected = rets.mean().div(rets.std()).ge(thres)
selected[selected].index.to_list()

In [None]:
def single_step_selection(rets, n_fd=None, alpha=ALPHA):
    """Select the columns of ``rets`` whose mean/std ratio reaches a bootstrapped threshold.

    Parameters
    ----------
    rets : DataFrame of return series, one column per pair.
    n_fd : passed to ``bootstrapped_quantile``; defaults to ``PCT`` times the
        number of columns, rounded up.
    alpha : passed to ``bootstrapped_quantile``.

    Returns
    -------
    list of the selected column labels.
    """
    n_fd = int(np.ceil(PCT * rets.shape[1])) if n_fd is None else n_fd
    cutoff = bootstrapped_quantile(rets, n_fd, alpha)
    score = rets.mean() / rets.std()
    passed = score.ge(cutoff)
    return passed[passed].index.to_list()

# Example call with explicit n_fd=2 and alpha=0.05.
single_step_selection(rets, 2, 0.05)

In [None]:
def fit(univ, n_fd=None, alpha=ALPHA, cost=COST, thres_trade=1):
    """Fit every candidate pair on ``univ`` and keep those passing the selection.

    Parameters
    ----------
    univ : panel with ``'price'`` and ``'chg'`` blocks (the fitting window).
    n_fd, alpha : forwarded to ``single_step_selection``.
    cost : forwarded to ``get_desc``.
    thres_trade : threshold given to ``trading_rule`` when trading in-sample.

    Returns
    -------
    selected : list of selected pairs.
    coeffs : dict pair -> weights, restricted to the selected pairs.
    spreads : DataFrame of spreads of the selected pairs.
    descs : DataFrame of ``'mean'`` / ``'std'`` rows for the selected pairs.
    """
    pairs, _, spreads = get_spreads(univ)
    descs = spreads.apply(partial(get_desc, cost=cost))
    coeffs, spreads, pos_spreads = trade(
        univ, pairs, descs, trading_rule, thres=thres_trade)
    # Score each pair on the positions the trading rule actually takes.
    # Previously the raw spread levels were passed here as positions and
    # `pos_spreads` was computed but never used.
    rets = rets_pairs(pairs, coeffs, pos_spreads, univ)
    selected = single_step_selection(rets, n_fd, alpha)
    coeffs = {pair: coeffs[pair] for pair in selected}
    return selected, coeffs, spreads[selected], descs.loc[:, selected]

# Fit on the full sample with default settings and display the selected spreads.
top_pairs, coeffs, spreads, descs = fit(univ)
spreads

# Rolling

In [None]:
CUT = '2014'
# Partial-string slices on a DatetimeIndex include the whole matching period at
# both ends, so `.loc[:CUT]` and `.loc[CUT:]` would both contain all of 2014.
# Keep the in-sample strictly before the cut so the two samples are disjoint.
univ_in = univ.loc[univ.index < CUT]
univ_out = univ.loc[CUT:]

In [None]:
def fit_n_trade(univ_in, univ_out, **kwargs):
    """Select pairs on ``univ_in`` and trade them on ``univ_out``.

    Parameters
    ----------
    univ_in : panel used to fit and select the pairs.
    univ_out : panel on which the selected pairs are traded.
    **kwargs : forwarded to ``fit``; when ``thres_trade`` is given it is also
        used as the trading-rule threshold on ``univ_out``.

    Returns
    -------
    Net positions per (date, id) from ``net_positions``, or ``None`` when no
    pair is selected.
    """
    top_pairs, _, _, descs = fit(univ_in, **kwargs)
    rule_kwargs = {}
    if 'thres_trade' in kwargs:
        rule_kwargs['thres'] = kwargs['thres_trade']
    coeffs, _, pos_spread = trade(univ_out, top_pairs, descs, trading_rule, **rule_kwargs)
    legs = {}
    for pair in top_pairs:
        legs[pair] = pos_spread_to_pos(pos_spread.loc[:, pair], coeffs[pair])
    return net_positions(legs)

# Smoke run on the single in/out split; the result is discarded.
_ = fit_n_trade(univ_in, univ_out)

In [None]:
# Same steps as the body of `fit_n_trade`, run at cell level with no extra
# arguments (presumably to keep the intermediate objects available for inspection).
kwargs = {}

top_pairs, coeffs, spreads, descs = fit(univ_in, **kwargs)
thres = {'thres': kwargs['thres_trade']} if 'thres_trade' in kwargs else {}
coeffs, _, pos_spread = trade(univ_out, top_pairs, descs, trading_rule, **thres)
pos = {pair: pos_spread_to_pos(pos_spread.loc[:, pair], coeffs[pair]) for pair in top_pairs}

In [None]:
univ = univ.copy()
# NOTE(review): `thres` is not referenced by the later cells shown here —
# confirm whether it was meant to be forwarded as `thres_trade`.
thres = 1
# Resampling frequency used to build the rolling fit/trade windows.
gcd = '2Q'

In [None]:
%%time
# Bin labels obtained by resampling the dates of `univ` at frequency `gcd`.
dates = pd.Series(index=univ.index, data=0)
cuts = dates.resample(gcd).first().index
# Each split is ((fit start, fit end), (trade start, trade end)): the fit window
# runs from one label to the label two bins later, and the trade window runs
# from one day after that label to the next label.
splits = [
    ((start, start_2), (start_2 + pd.Timedelta(days=1), start_3))
    for start, start_2, start_3
    in zip(cuts, cuts[2:], cuts[3:])]

def fit_n_trade_split(split):
    """Run ``fit_n_trade`` on one rolling split of the global ``univ``.

    ``split`` is ``((fit_start, fit_end), (trade_start, trade_end))``; both
    windows are taken with label-based ``.loc`` slices.
    """
    (fit_start, fit_end), (trade_start, trade_end) = split
    fit_window = univ.loc[fit_start:fit_end]
    trade_window = univ.loc[trade_start:trade_end]
    return fit_n_trade(fit_window, trade_window)

# Run every split in a worker process. Results arrive in arbitrary order, so the
# concatenated frame is re-sorted by its index. Splits for which no pair is
# selected yield None and are dropped; `positions` is None if nothing is left.
with Pool() as pool:
    pos = list(pool.imap_unordered(fit_n_trade_split, splits))
    pos = [p for p in pos if p is not None]
    positions = pd.concat(pos).sort_index() if pos else None

# CPU times: user 110 ms, sys: 23.8 ms, total: 134 ms
# Wall time: 57.2 s

In [None]:
# Stop here if no split produced any position.
assert positions is not None

In [None]:
# Number of ids with a non-zero position on each date.
positions[positions.abs().gt(0)].count(axis=1).plot(grid=True)

In [None]:
# Net exposure: sum of signed positions per date.
positions.sum(axis=1).plot(grid=True)

In [None]:
# Gross exposure: sum of absolute positions per date.
positions.abs().sum(axis=1).plot(grid=True)

In [None]:
# Scale each row by its gross exposure (sum of absolute positions); entries that
# come out as NaN (e.g. a row with zero gross exposure) are set to 0.
positions = (
    positions
    .div(positions.abs().sum(axis=1), axis=0)
    .fillna(0))
# Gross exposure after scaling.
positions.abs().sum(axis=1).plot(grid=True)

In [None]:
# Net exposure after scaling.
positions.sum(axis=1).plot(grid=True)

In [None]:
# Sum of absolute row-to-row position changes per date (a turnover measure).
positions.diff().abs().sum(axis=1).plot(grid=True)