In [None]:
from itertools import combinations, accumulate
from functools import partial
from multiprocessing import Pool
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from archimedean_copulas import copulas
from fit_archimedean_copula import edf, fit_archimedean_copula
import warnings
warnings.filterwarnings('ignore')

# Get universe

In [None]:
# Load the price panel from the local parquet file.
prices = pd.read_parquet('prices_yf.parquet')
# Maximum tolerated fraction of missing prices per column.
THRES = 0.8
# Fraction of missing values in each column.
nulls = prices.isnull().mean(axis=0)
# Columns whose missing fraction exceeds THRES (sorted ascending); the next cell excludes them.
stocks_w_nans = nulls[nulls > THRES].sort_values()
stocks_w_nans

In [None]:
# Keep only the columns that were not flagged in `stocks_w_nans`.
cols = prices.columns
stocks = cols[~cols.isin(stocks_w_nans.index)]
prices = (
    prices
    .loc[:, stocks]
    .ffill(limit=2)  # forward-fill at most two consecutive missing rows
#     .dropna()
)

In [None]:
# Long-format table with one row per (date, id): the price and its
# one-step percentage change.
univ_stacked = pd.concat([
    prices.stack(dropna=False).reset_index(),
    prices.pct_change().stack(dropna=False).reset_index()[0]  # column 0; not clean (dividends, splits, ...)
], axis=1)
univ_stacked.columns = ['date', 'id', 'price', 'chg']  # price is mid_price
CUT = '2007'  # '2006-09-19'
# Reference CUT as a variable (`@CUT`) instead of pasting its text into the
# expression: interpolated text is parsed as Python source, so only a bare year
# happens to parse, while a full date such as '2006-09-19' is not a valid
# expression (leading-zero literals / integer subtraction).
univ_stacked = univ_stacked.query('date >= @CUT')

In [None]:
# Pivot back to wide format: one row per date, remaining columns ('price', 'chg')
# spread across ids, so that univ['price'] and univ['chg'] are date x id frames.
univ = univ_stacked.pivot(index='date', columns='id')
univ

# Formation period

In [None]:
def compute_coint_coeff(pair, univ):
    """Return the two weights used to combine the legs of `pair` into a spread.

    Parameters
    ----------
    pair : tuple
        Two column labels of ``univ['price']``.
    univ : mapping or DataFrame
        Object whose ``'price'`` entry is a date x id price frame.

    Returns
    -------
    pd.Series
        Indexed by the two labels. The first weight is the reciprocal of the
        first leg's first price. The second weight is the negated first weight
        scaled by the ratio of the legs' (last price / first price) values.
    """
    first_leg, second_leg = pair[0], pair[1]
    window = univ['price'].loc[:, pair]
    weights = 1 / window.iloc[0]
    # Normalised end-of-window level of each leg; the 1/len factor cancels in the ratio below.
    growth = window.iloc[-1] * weights / len(window)
    weights.loc[second_leg] = -weights.loc[first_leg] * growth[first_leg] / growth[second_leg]
    return weights

# Example: spread weights for a single hand-picked pair over the whole universe.
pair = ('dis', 'pg')
coeff = compute_coint_coeff(pair, univ)
coeff

In [None]:
def form_spread(pair, coeff, prices):
    """Return the weighted sum of the two price columns of `pair`.

    `coeff` is aligned on column labels, each column is multiplied by its
    weight and the result is summed row by row, giving one value per row of
    `prices`.
    """
    legs = prices.loc[:, pair]
    weighted = legs.mul(coeff, axis=1)
    return weighted.sum(axis=1)

# Example: spread of the hand-picked pair using the weights computed above.
spread = form_spread(pair, coeff, univ['price'])
spread.plot(grid=True)

In [None]:
def compute_scores(returns, pairs, method):
    """Look up the pairwise correlation of `returns` for the requested pairs.

    `method` is forwarded to ``DataFrame.corr`` and must be one of
    {'pearson', 'kendall', 'spearman'}. `pairs` is either one (id, id) tuple
    or a list of such tuples, used to index the stacked correlation matrix.
    """
    corr_matrix = returns.corr(method=method)
    pairwise = corr_matrix.stack()
    return pairwise.loc[pairs]

# Example: Kendall correlation of the hand-picked pair over the whole universe.
returns = univ['chg']
compute_scores(returns, pair, 'kendall')

In [None]:
def get_scores(univ, method='spearman'):
    """Enumerate every pair of ids and compute its weights, spread and score.

    Returns
    -------
    tuple
        ``(pairs, coeffs, spreads, scores)`` where `pairs` is the list of all
        2-combinations of the price columns, `coeffs` maps each pair to its
        weights, `spreads` has one column per pair and `scores` holds the
        `method` correlation of the ``'chg'`` columns for each pair.
    """
    price_panel = univ['price']
    pairs = list(combinations(price_panel.columns, 2))
    coeffs = {}
    spread_by_pair = {}
    for candidate in pairs:
        weights = compute_coint_coeff(candidate, univ)
        coeffs[candidate] = weights
        spread_by_pair[candidate] = form_spread(candidate, weights, price_panel)
    spreads = pd.DataFrame(spread_by_pair)
    scores = compute_scores(univ['chg'], pairs, method=method)
    return pairs, coeffs, spreads, scores

In [None]:
# Score every pair of the universe (this also builds one spread series per pair).
pairs, coeffs, spreads, scores = get_scores(univ)
scores

In [None]:
def selection(scores, **kwargs):
    """Pick the `n_pairs` best-scoring pairs.

    The scores produced in this notebook are correlations, so the best pairs
    are the ones with the highest score. Ranking used to be ascending, which
    returned the least dependent pairs.

    Parameters
    ----------
    scores : pd.Series
        One score per pair, indexed by pair.
    n_pairs : int (keyword, required)
        Number of pairs to keep.
    ascending : bool (keyword, default False)
        Pass True to keep the lowest scores instead, e.g. for a distance-like
        score where smaller is better.

    Returns
    -------
    list
        Index labels of the selected pairs, best first.
    """
    n_pairs = kwargs['n_pairs']
    ascending = kwargs.get('ascending', False)
    # sort_values keeps NaN scores last in both directions, so they are picked last.
    ranked = scores.sort_values(ascending=ascending)
    return list(ranked.head(n_pairs).index)

# Example: keep ten pairs out of the scores computed above.
kwargs = {'n_pairs': 10}
selected = selection(scores, **kwargs)
selected

In [None]:
def get_desc(pair, returns):
    """Describe a pair by its two return samples and the copula fitted on them.

    Returns
    -------
    tuple
        ``(first leg returns as list, second leg returns as list, fit)`` where
        `fit` is whatever ``fit_archimedean_copula`` returns for the two
        return series.
    """
    first_id, second_id = pair[0], pair[1]
    leg_returns = [returns.loc[:, name] for name in (first_id, second_id)]
    copula_fit = fit_archimedean_copula(leg_returns[0], leg_returns[1])
    return leg_returns[0].to_list(), leg_returns[1].to_list(), copula_fit

# Example: copula fit (third element of the description) for the hand-picked pair.
get_desc(pair, univ['chg'])[2]

In [None]:
%%time

def fit(univ, **kwargs):
    """Formation step: score all pairs, keep the best ones and describe them.

    Scores are computed for every pair, but weights, spreads and copula fits
    are built for the selected pairs only. Previously a spread series was
    built for every pair of the universe and all but `n_pairs` were thrown
    away.

    Parameters
    ----------
    univ : DataFrame
        Universe with ``'price'`` and ``'chg'`` date x id blocks.
    **kwargs
        Forwarded to `selection` (``n_pairs`` is required).

    Returns
    -------
    tuple
        ``(selected, coeffs, spreads, descs, scores)`` restricted to the
        selected pairs: list of pairs, pair -> weights, frame with one spread
        column per pair, pair -> description from `get_desc`, and the
        pairs' scores.
    """
    price_panel = univ['price']
    pairs = list(combinations(price_panel.columns, 2))
    # Same scoring method as the default of get_scores.
    scores = compute_scores(univ['chg'], pairs, method='spearman')
    selected = selection(scores, **kwargs)
    coeffs = {pair: compute_coint_coeff(pair, univ) for pair in selected}
    spreads = pd.DataFrame(
        {pair: form_spread(pair, coeffs[pair], price_panel) for pair in selected},
        index=price_panel.index)
    descs = {pair: get_desc(pair, univ['chg']) for pair in selected}
    return selected, coeffs, spreads, descs, scores[selected]

# Fit on the whole universe and show the copula fit of every selected pair side by side.
kwargs = {'n_pairs': 10}
top_pairs, coeffs, spreads, descs, scores = fit(univ, **kwargs)
pd.concat([desc[2].rename(pair) for pair, desc in descs.items()], axis=1)

# CPU times: user 2.15 s, sys: 13 ms, total: 2.16 s
# Wall time: 2.19 s

# Trading period

## Trading rule

In [None]:
def enter_trade(desc, rets_0_cur, rets_1_cur, proba=0.95):
    """Decide whether the current pair of returns opens a position.

    Parameters
    ----------
    desc : tuple
        ``(fit returns of leg 0, fit returns of leg 1, copula fit)`` as built
        by `get_desc`; the fit must expose ``'name'`` and ``'theta'``.
    rets_0_cur, rets_1_cur : float
        Current returns of the two legs.
    proba : float
        Threshold applied to the two copula quantities.

    Returns
    -------
    int
        -1, +1 or 0 (no entry).
    """
    sample_0, sample_1, copula_fit = desc
    # Rank of the current return within the fit sample extended by that return.
    uuu = edf(pd.Series(sample_0 + [rets_0_cur]))[-1]
    vvv = edf(pd.Series(sample_1 + [rets_1_cur]))[-1]
    # Entries 4 and 5 of the copula record are presumably the two conditional
    # distribution functions -- TODO confirm against archimedean_copulas.
    chg_u = copulas[copula_fit['name']][4](uuu, vvv, copula_fit['theta'])
    chg_v = copulas[copula_fit['name']][5](uuu, vvv, copula_fit['theta'])
    lower = 1 - proba
    if chg_u > proba and chg_v < lower:
        return -1
    if chg_u < lower and chg_v > proba:
        return +1
    return 0

# Example: entry decision for the first selected pair on a -1% / +1% move.
pair = top_pairs[0]
enter_trade(descs[pair], -0.01, +0.01)

In [None]:
def exit_trade(cur_pos, spread):
    """Return True when the spread has the same sign as the open position."""
    signed_side = cur_pos * np.sign(spread)
    return signed_side == 1

def exit_n_enter_trade(desc, cur_pos, spread, rets_0_cur, rets_1_cur):
    """Advance the position by one step.

    When flat, the entry rule decides the new position. When a position is
    open, it is either closed (exit rule fires) or carried over unchanged;
    no new entry is evaluated on that step.
    """
    if cur_pos == 0:
        return enter_trade(desc, rets_0_cur, rets_1_cur)
    if exit_trade(cur_pos, spread):
        return 0
    return cur_pos

# Example call while flat: the whole spread series is passed here, but it is not
# used because the exit rule is only evaluated when a position is open.
pair = top_pairs[0]
spread = spreads[pair]
exit_n_enter_trade(descs[pair], 0, spread, -0.01, +0.01)

In [None]:
%%time

def trade_series(pair, descs, spreads, returns):
    """Run the trading rule over time for one pair.

    Each row supplies (spread, return of leg 0, return of leg 1). The position
    stored at a row is the one decided from the previous rows only: it starts
    at 0 and the decision taken on the last row is discarded.

    Returns
    -------
    pd.Series
        Position per date, indexed like the pair's spread.
    """
    spread = spreads.loc[:, pair]
    rows = pd.concat([spread, returns.loc[:, pair]], axis=1).values
    held = 0
    history = []
    for row in rows:
        # Record the position held going into this row, then update it.
        history.append(held)
        held = exit_n_enter_trade(descs[pair], held, *row)
    return pd.Series(index=spread.index, data=history)

# Example on the fitted data itself; `returns` is the whole-universe univ['chg'] bound earlier.
pos = trade_series(pair, descs, spreads, returns)
spread = spreads[pair]
spread.plot(grid=True)
# Scale the -1/0/+1 position to the spread's range so both fit on one chart.
pos.mul(spread.max()).plot(grid=True)

# CPU times: user 2.42 s, sys: 7.43 ms, total: 2.43 s
# Wall time: 2.47 s

## On real data

In [None]:
CUT = '2014'
# Split with explicit comparisons rather than label slices: with a partial date
# string both `.loc[:CUT]` and `.loc[CUT:]` include every row of that year, which
# would put all of 2014 in the in-sample AND the out-of-sample set.
univ_in = univ.loc[univ.index < CUT]    # strictly before 2014-01-01
univ_out = univ.loc[univ.index >= CUT]  # from 2014-01-01 onwards

In [None]:
def trade(univ_out, top_pairs, coeffs, descs):
    """Trading step: apply fitted weights and copulas to new data.

    Returns
    -------
    tuple
        ``(spreads, pos_spreads)``: one spread column per entry of `coeffs`
        computed on ``univ_out['price']``, and one position column per pair of
        `top_pairs` produced by `trade_series` on ``univ_out['chg']``.
    """
    price_panel = univ_out['price']
    spreads = pd.DataFrame({
        pair: form_spread(pair, coeff, price_panel)
        for pair, coeff in coeffs.items()})
    returns = univ_out['chg']
    pos_spreads = pd.DataFrame({
        pair: trade_series(pair, descs, spreads, returns)
        for pair in top_pairs})
    return spreads, pos_spreads

# Fit on the in-sample part, then trade the out-of-sample part with the fitted pairs.
top_pairs, coeffs, spreads, descs, scores = fit(univ_in, n_pairs=10)
spreads, pos_spreads = trade(univ_out, top_pairs, coeffs, descs)

In [None]:
# Out-of-sample spread of each selected pair.
spreads.plot()

In [None]:
# Out-of-sample position taken on each selected pair's spread.
pos_spreads.plot()

In [None]:
def pos_spread_to_pos(pos_spread, coeff):
    """Expand a spread position into one position per leg.

    The result has one row per entry of `pos_spread` and one column per entry
    of `coeff`; each cell is the spread position multiplied by the leg weight.
    """
    per_leg = np.outer(pos_spread.values, coeff.values)
    return pd.DataFrame(per_leg, index=pos_spread.index, columns=coeff.index)

# Example: per-leg positions of the fourth traded pair.
pair = pos_spreads.columns[3]
pos_spread_to_pos(pos_spreads.loc[:, pair], coeffs[pair])

In [None]:
def net_positions(pos):
    """Net the per-pair leg positions into one position per (date, id).

    `pos` maps each pair to a date x leg frame. The frames are put side by
    side, flattened to (date, id, pos) records, summed per (date, id) so that
    an id appearing in several pairs is netted, and pivoted back to a
    date x id frame.
    """
    side_by_side = pd.concat(pos.values(), axis=1)
    records = side_by_side.stack().reset_index()
    records.columns = ['date', 'id', 'pos']
    netted = records.groupby(['date', 'id'])['pos'].sum()
    netted = netted.reset_index()
    return netted.pivot(index='date', columns='id', values='pos')

# End-to-end example: fit in-sample, trade out-of-sample, then net the leg
# positions of all pairs into one position per id.
top_pairs, coeffs, _, descs, _ = fit(univ_in, n_pairs=10)
spreads, pos_spreads = trade(univ_out, top_pairs, coeffs, descs)
pos = {pair: pos_spread_to_pos(pos_spreads.loc[:, pair], coeffs[pair]) for pair in top_pairs}
net_pos = net_positions(pos)
net_pos

In [None]:
def fit_n_trade(univ_in, univ_out, n_pairs):
    """Fit on `univ_in`, trade on `univ_out` and return netted positions.

    Returns the date x id frame produced by `net_positions` for the
    `n_pairs` pairs selected during the fit.
    """
    top_pairs, coeffs, _, descs, _ = fit(univ_in, n_pairs=n_pairs)
    _, pos_spreads = trade(univ_out, top_pairs, coeffs, descs)
    pos = {}
    for pair in top_pairs:
        pos[pair] = pos_spread_to_pos(pos_spreads.loc[:, pair], coeffs[pair])
    return net_positions(pos)

In [None]:
%%timeit
# Cost of one fit-and-trade cycle on the in-sample / out-of-sample split.
_ = fit_n_trade(univ_in, univ_out, 10)

# 6.46 s ± 174 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

# Rolling

In [None]:
univ = univ.copy()
n_pairs = 10  # number of pairs selected at each refit
gcd = 'W-MON'  # resampling frequency used to cut the rolling fit/trade windows

In [None]:
%%time

# Dummy series on the universe dates, used only to obtain the resampling bin labels.
dates = pd.Series(index=univ.index, data=0)
cuts = dates.resample(gcd).first().index
# Each split is ((fit start, fit end), (trade start, trade end)): the fit window
# spans two consecutive bins and the trade window is the bin that follows it.
splits = [
    ((start, start_2), (start_2, start_3))
    for start, start_2, start_3
    in zip(cuts, cuts[2:], cuts[3:])]

def fit_n_trade_split(split):
    """Fit and trade one rolling split of the global `univ`.

    Parameters
    ----------
    split : tuple
        ``((fit_start, fit_end), (trade_start, trade_end))`` bin labels.

    Returns
    -------
    DataFrame
        Netted positions (date x id) over the trade window.

    Notes
    -----
    The trade window is ``(trade_start, trade_end]``. Label slices are
    inclusive on both ends, so slicing both windows with ``.loc[a:b]`` put the
    shared cut date in the fit window and the trade window, and in two
    consecutive trade windows, which duplicated that date once the splits were
    concatenated. Excluding the left edge keeps every date in exactly one
    trade window and out of the fit window that precedes it.
    """
    fit_start, fit_end = split[0]
    trade_start, trade_end = split[1]
    fit_window = univ.loc[fit_start:fit_end]
    all_dates = univ.index
    in_trade_window = (all_dates > trade_start) & (all_dates <= trade_end)
    trade_window = univ.loc[in_trade_window]
    return fit_n_trade(fit_window, trade_window, n_pairs)

# Run the splits in a process pool; results come back in arbitrary order, hence
# the final sort on the date index.
with Pool() as pool:
    positions = pd.concat(pool.imap_unordered(fit_n_trade_split, splits)).sort_index()
    
# CPU times: user 3.02 s, sys: 248 ms, total: 3.27 s
# Wall time: 6min 4s

In [None]:
# Number of ids with a non-zero position on each date.
positions[positions.abs().gt(0)].count(axis=1).plot(grid=True)

In [None]:
# Net position (signed sum across ids) per date, before normalisation.
positions.sum(axis=1).plot(grid=True)

In [None]:
# Gross position (sum of absolute values across ids) per date, before normalisation.
positions.abs().sum(axis=1).plot(grid=True)

In [None]:
# Divide each row by its gross position so absolute weights sum to 1 on every
# date with a position; remaining NaNs (including rows with zero gross
# position, where the division is 0/0) are set to 0.
positions = (
    positions
    .div(positions.abs().sum(axis=1), axis=0)
    .fillna(0))
positions.abs().sum(axis=1).plot(grid=True)

In [None]:
# Net position per date after normalisation.
positions.sum(axis=1).plot(grid=True)

In [None]:
# Sum across ids of the absolute row-to-row change in normalised position.
positions.diff().abs().sum(axis=1).plot(grid=True)