In [None]:
from itertools import combinations
from functools import partial
import numpy as np
from numpy.linalg import norm
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [None]:
# import yfinance as yf

# def load_close_prices(tickers):
# #     prices are not adjusted for splits or dividends
#     history = {
#         tic: yf.Ticker(tic).history(period='max')
#         for tic in tickers}
#     indexes = [set(data.index) for data in history.values()]
#     index = sorted(set.union(*indexes))
#     closes = pd.concat([
#         history[tic].reindex(index=index)['Close'].ffill()
#         for tic in tickers], axis=1)
#     closes.columns = tickers
#     stacked = closes.stack().reset_index()
#     stacked.columns = ['date', 'ticker', 'price']
#     return stacked

# yf_tickers = [
#     'nio', 'dwac', 'edu', 'didi', 'gme', 'aapl', 'tsla', 'amc', 'pg', 'f', 'snap', 'amzn',
#     'dis', 'msft', 'ge', 'rivn', 'bros', 'goog', 'googl', 'ccl', 'amd', 'nvda']
# prices = (
#     load_close_prices(yf_tickers)
#     .pivot(index='date', columns='ticker', values='price')
# )
# prices.to_parquet('prices_yf.parquet')

# Normalized prices and "returns"

In [None]:
# Load the cached price panel and measure, per column, the share of missing
# observations. Columns whose NaN share exceeds THRES are listed (sorted by
# that share) so they can be excluded in the next cell.
prices = pd.read_parquet('prices_yf.parquet')
THRES = 0.8  # maximum tolerated fraction of missing values per column
nulls = prices.isna().mean()
stocks_w_nans = nulls.loc[nulls > THRES].sort_values()
stocks_w_nans

In [None]:
# Keep only the columns that were not flagged in stocks_w_nans, forward-fill
# the remaining gaps, then drop every row that still holds a NaN.
# NOTE: `prices` is rebound here, so later cells see the cleaned panel only.
cols = prices.columns
stocks = cols[~cols.isin(stocks_w_nans.index)]
prices = prices.loc[:, stocks].ffill().dropna()
# First row of pct_change() is NaN by construction, hence the slice.
returns = prices.pct_change().iloc[1:]  # not clean
prices

In [None]:
# Rebase every price series to 1 at the first retained row, then drop that
# first row so normalized_prices starts on the same row as `returns`.
normalizations = prices.iloc[0]
normalized_prices = prices.div(normalizations, axis='columns').iloc[1:]

# Cointegration criteria

In [None]:
def returns_corr(cpl, method, returns):
    """Correlation between the return series of the two stocks in ``cpl``.

    Parameters
    ----------
    cpl : list-like of two column labels
        Selects the two columns of ``returns`` to compare.
    method : str
        Forwarded to ``DataFrame.corr``; one of 'pearson', 'kendall',
        'spearman'.
    returns : pd.DataFrame
        Frame of returns with one column per stock.

    Returns
    -------
    float
        Off-diagonal entry of the 2x2 correlation matrix.
    """
    pair_returns = returns.loc[:, cpl]
    corr_matrix = pair_returns.corr(method=method)
    return corr_matrix.iloc[0, 1]

def diff_prices(cpl, normalized_prices):
    """Euclidean norm of the difference between two normalized price series.

    Parameters
    ----------
    cpl : iterable of exactly two column labels
        May be a tuple, a list or a pandas Series. The pair is unpacked by
        iteration rather than by ``cpl[0]`` / ``cpl[1]`` because integer keys
        on a label-indexed Series (what ``DataFrame.apply`` passes) rely on a
        positional fallback that pandas has deprecated.
    normalized_prices : pd.DataFrame
        Frame with one column per stock.

    Returns
    -------
    float
        ``norm(prices_0 - prices_1)``; smaller means the series are closer.

    Raises
    ------
    ValueError
        If ``cpl`` does not hold exactly two labels.
    """
    stock_0, stock_1 = cpl
    prices_0 = normalized_prices.loc[:, stock_0]
    prices_1 = normalized_prices.loc[:, stock_1]
    return norm(prices_0 - prices_1)

def ratio_prices(cpl, normalized_prices):
    """Euclidean norm of the deviation from 1 of the ratio of two price series.

    Parameters
    ----------
    cpl : iterable of exactly two column labels
        May be a tuple, a list or a pandas Series. The pair is unpacked by
        iteration rather than by ``cpl[0]`` / ``cpl[1]`` because integer keys
        on a label-indexed Series (what ``DataFrame.apply`` passes) rely on a
        positional fallback that pandas has deprecated.
    normalized_prices : pd.DataFrame
        Frame with one column per stock.

    Returns
    -------
    float
        ``norm(prices_0 / prices_1 - 1)``.

    Raises
    ------
    ValueError
        If ``cpl`` does not hold exactly two labels.
    """
    stock_0, stock_1 = cpl
    prices_0 = normalized_prices.loc[:, stock_0]
    prices_1 = normalized_prices.loc[:, stock_1]
    return norm(prices_0 / prices_1 - 1)

def diff_ranks(cpl, normalized_prices):
    """Euclidean norm of the difference between the ranks of two price series.

    Each series is ranked within itself (``Series.rank()``) before the
    difference is taken.

    Parameters
    ----------
    cpl : iterable of exactly two column labels
        May be a tuple, a list or a pandas Series. The pair is unpacked by
        iteration rather than by ``cpl[0]`` / ``cpl[1]`` because integer keys
        on a label-indexed Series (what ``DataFrame.apply`` passes) rely on a
        positional fallback that pandas has deprecated.
    normalized_prices : pd.DataFrame
        Frame with one column per stock.

    Returns
    -------
    float
        ``norm(rank(prices_0) - rank(prices_1))``.

    Raises
    ------
    ValueError
        If ``cpl`` does not hold exactly two labels.
    """
    stock_0, stock_1 = cpl
    prices_ranks_0 = normalized_prices.loc[:, stock_0].rank()
    prices_ranks_1 = normalized_prices.loc[:, stock_1].rank()
    return norm(prices_ranks_0 - prices_ranks_1)

def ratio_ranks(cpl, normalized_prices):
    """Euclidean norm of the deviation from 1 of the ratio of two rank series.

    Each series is ranked within itself (``Series.rank()``) before the ratio
    is taken.

    Parameters
    ----------
    cpl : iterable of exactly two column labels
        May be a tuple, a list or a pandas Series. The pair is unpacked by
        iteration rather than by ``cpl[0]`` / ``cpl[1]`` because integer keys
        on a label-indexed Series (what ``DataFrame.apply`` passes) rely on a
        positional fallback that pandas has deprecated.
    normalized_prices : pd.DataFrame
        Frame with one column per stock.

    Returns
    -------
    float
        ``norm(rank(prices_0) / rank(prices_1) - 1)``.

    Raises
    ------
    ValueError
        If ``cpl`` does not hold exactly two labels.
    """
    stock_0, stock_1 = cpl
    prices_ranks_0 = normalized_prices.loc[:, stock_0].rank()
    prices_ranks_1 = normalized_prices.loc[:, stock_1].rank()
    return norm(prices_ranks_0 / prices_ranks_1 - 1)

# Correlation estimators; each name is forwarded to DataFrame.corr(method=...)
# through returns_corr.
methods = ['pearson', 'kendall', 'spearman']
# Price-based criteria. Each entry is called as fun(cpl, normalized_prices);
# its __name__ is used as the criterion label, and the part after the last
# underscore ('prices' / 'ranks') is used as its aggregation group.
funs = [
    diff_prices,
    diff_ranks,
    ratio_prices,
    ratio_ranks]

In [None]:
def cointegration_criteria(cpl, returns, normalized_prices):
    """Evaluate every similarity criterion for one pair of stocks.

    Parameters
    ----------
    cpl : list-like of two column labels
        The pair to evaluate.
    returns : pd.DataFrame
        Returns, one column per stock; used by the correlation criteria.
    normalized_prices : pd.DataFrame
        Normalized prices, one column per stock; used by the criteria in
        ``funs``.

    Returns
    -------
    pd.Series
        One entry per criterion: ``corr_returns_<method>`` for each name in
        ``methods`` first, then one entry per function in ``funs`` keyed by
        the function's ``__name__``.
    """
    crits = {}
    for method in methods:
        crits[f'corr_returns_{method}'] = returns_corr(cpl, method, returns)
    for fun in funs:
        crits[fun.__name__] = fun(cpl, normalized_prices)
    return pd.Series(crits)

def eval_crits_n_hierarchical_agg(returns, normalized_prices, agg='mean'):
    """Rank all stock pairs on every criterion and aggregate the ranks.

    Every 2-combination of the columns of ``normalized_prices`` is scored
    with ``cointegration_criteria``. Each criterion is turned into a rank
    across pairs, ranks are aggregated per group of criteria ('prices',
    'ranks', 'corr'), and the group aggregates are aggregated once more into
    an overall score.

    All ranks are oriented so that a LOWER rank means a MORE similar pair:
    distance-like criteria are ranked ascending, whereas correlations are
    ranked descending (highest correlation gets rank 1). Ranking both kinds
    in the same direction would make the aggregated rank reward small
    distances while penalising high correlations.

    Parameters
    ----------
    returns : pd.DataFrame
        Returns, one column per stock.
    normalized_prices : pd.DataFrame
        Normalized prices, one column per stock; its columns define the
        universe of pairs.
    agg : str, default 'mean'
        Name of the DataFrame reduction (called with ``axis=1``) used for
        both aggregation levels.

    Returns
    -------
    pd.DataFrame
        Indexed by (stock_0, stock_1). Columns: one rank per criterion, then
        ``<group>_<agg>`` for each group, then ``<agg>_<agg>``. Rows are
        sorted ascending by ``<agg>_<agg>``.
    """
    stocks = normalized_prices.columns
    index = pd.DataFrame(data=combinations(stocks, 2))
    index.columns = 'stock_' + index.columns.astype(str)
    coint_crit = partial(
        cointegration_criteria, returns=returns, normalized_prices=normalized_prices)
    crits = index.apply(coint_crit, axis=1)

    # Flip the sign of correlation criteria so that, like the distances,
    # "smaller is better" before ranking.
    is_corr = crits.columns.str.startswith('corr_')
    base_ranks = (crits * np.where(is_corr, -1.0, 1.0)).rank()

    # De-duplicated group names, first-seen order (dict preserves insertion).
    groups = list(dict.fromkeys(
        [fun.__name__.split('_')[-1] for fun in funs] + ['corr']))
    # Aggregate over the raw criterion columns only, so an aggregate column
    # never feeds back into its own group (matters for e.g. agg='sum').
    base_columns = base_ranks.columns
    group_aggs = pd.DataFrame({
        f'{group}_{agg}': getattr(
            base_ranks.loc[:, base_columns.str.contains(group, regex=False)],
            agg)(axis=1)
        for group in groups})
    overall = getattr(group_aggs, agg)(axis=1).rename(f'{agg}_{agg}')

    ranks_crits = (
        pd.concat([index, base_ranks, group_aggs, overall], axis=1)
        .set_index(list(index.columns)))
    return ranks_crits.sort_values(by=f'{agg}_{agg}')

In [None]:
%%time
# Score every pair of stocks on all criteria and sort the pairs by the
# overall aggregated rank (default aggregation: mean). Timed because the
# number of pairs grows quadratically with the number of stocks.
ranks_crits = eval_crits_n_hierarchical_agg(returns, normalized_prices)
ranks_crits

In [None]:
# Pair in the first row of ranks_crits, i.e. the lowest overall aggregated rank.
cpl = ranks_crits.index[0]
normalized_prices.loc[:, cpl].plot(grid=True)

In [None]:
# Pair in the last row of ranks_crits, i.e. the highest overall aggregated rank.
cpl = ranks_crits.index[-1]
normalized_prices.loc[:, cpl].plot(grid=True)

# Compute spread

In [None]:
def compute_spreads(ranks_crits, spread_comp, **kwargs):
    """Compute one spread series per pair found in the index of ``ranks_crits``.

    Parameters
    ----------
    ranks_crits : pd.DataFrame
        Only its index is used; each index entry identifies one pair.
    spread_comp : callable
        Called once per pair as ``spread_comp(pair, **kwargs)``, where
        ``pair`` is a Series holding the pair's labels.
    **kwargs
        Extra keyword arguments forwarded to ``spread_comp``.

    Returns
    -------
    pd.DataFrame
        One column per pair, in index order, with ``ranks_crits.index`` as
        column labels.
    """
    pairs = ranks_crits.index
    # Transpose so that every column of the frame holds the labels of one pair.
    pairs_as_columns = pairs.to_frame(index=False).T
    compute_one = partial(spread_comp, **kwargs)
    spreads = pairs_as_columns.apply(compute_one)
    spreads.columns = pairs
    return spreads

def spread_simple(cpl, prices):
    """Spread of a pair as the plain difference of the two price series.

    Parameters
    ----------
    cpl : iterable of exactly two column labels
        May be a tuple, a list or a pandas Series. The pair is unpacked by
        iteration rather than by ``cpl[0]`` / ``cpl[1]`` because integer keys
        on a label-indexed Series (what ``DataFrame.apply`` passes) rely on a
        positional fallback that pandas has deprecated.
    prices : pd.DataFrame
        Frame with one column per stock.

    Returns
    -------
    pd.Series
        ``prices[first] - prices[second]``, on the index of ``prices``.

    Raises
    ------
    ValueError
        If ``cpl`` does not hold exactly two labels.
    """
    stock_0, stock_1 = cpl
    prices_0 = prices.loc[:, stock_0]
    prices_1 = prices.loc[:, stock_1]
    return prices_0 - prices_1

def spread_lin_reg(cpl, prices, fit_intercept=False):
    """Spread of a pair as the residual of a linear regression.

    The second stock's prices are regressed on the first stock's prices; the
    spread is the fitted value minus the observed value of the second stock.

    Parameters
    ----------
    cpl : iterable of exactly two column labels
        May be a tuple, a list or a pandas Series. The pair is unpacked by
        iteration rather than by ``cpl[0]`` / ``cpl[1]`` because integer keys
        on a label-indexed Series (what ``DataFrame.apply`` passes) rely on a
        positional fallback that pandas has deprecated.
    prices : pd.DataFrame
        Frame with one column per stock.
    fit_intercept : bool, default False
        Forwarded to ``LinearRegression``.

    Returns
    -------
    pd.Series
        ``coef * prices[first] + intercept - prices[second]``, on the index
        of ``prices``.

    Raises
    ------
    ValueError
        If ``cpl`` does not hold exactly two labels.
    """
    stock_0, stock_1 = cpl
    # sklearn expects a 2-D feature matrix, hence the single-column reshape.
    xxx = prices.loc[:, stock_0].values.reshape(-1, 1)
    yyy = prices.loc[:, stock_1]
    lin_reg = LinearRegression(fit_intercept=fit_intercept)
    lin_reg.fit(xxx, yyy)
    spread = np.dot(xxx, lin_reg.coef_) + lin_reg.intercept_ - yyy
    return spread

In [None]:
# Plain-difference spreads on normalized prices, one column per pair, in the
# order of ranks_crits; column 0 is therefore the pair in its first row.
spreads_simple = compute_spreads(ranks_crits, spread_simple, prices=normalized_prices)
spread = spreads_simple.iloc[:, 0]
spread.plot(grid=True)
spread.describe()

In [None]:
# Regression-based spreads for every combination of price panel (raw vs.
# normalized) and intercept setting; results are keyed by
# (panel name, fit_intercept).
prcs = {'prices': prices, 'n_prices': normalized_prices}
spreads_lin_regs = {
    (name_prc, fit_intercept): compute_spreads(
        ranks_crits, spread_lin_reg, prices=prc, fit_intercept=fit_intercept)
    for name_prc, prc in prcs.items()
    for fit_intercept in [True, False]
}

In [None]:
# For each (panel name, fit_intercept) setting, plot and summarise the spread
# of the first pair (column 0, i.e. the first row of ranks_crits).
for params, spreads in spreads_lin_regs.items():
    print(params)
    spread = spreads.iloc[:, 0]
    spread.plot(grid=True)
    print(spread.describe())
    # Flush the figure so each setting gets its own plot.
    plt.show()