In [None]:
from functools import reduce
from multiprocessing import Pool
from scipy.linalg import qr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from fastcluster import linkage
from scipy.cluster.hierarchy import fcluster, leaves_list, optimal_leaf_ordering
from collections import Counter
from itertools import islice

# Generic

In [None]:
def partial_stack(frame, to_stack, new_names):
    """Stack a subset of columns into long format, keeping the others as identifiers.

    Parameters
    ----------
    frame : pandas.DataFrame
        Wide frame to reshape.
    to_stack : collection of column labels
        Columns to stack; must be a strict subset of ``frame``'s columns.
    new_names : list of length 2
        Names for the column holding the former column labels and for the
        column holding the stacked values, in that order.

    Returns
    -------
    pandas.DataFrame
        The identifier columns (those not in ``to_stack``) followed by the two
        columns named in ``new_names``.
    """
    assert set(to_stack) < set(frame)
    assert isinstance(new_names, list) and len(new_names) == 2
    keep_mask = ~frame.columns.isin(to_stack)
    id_cols = list(frame.columns[keep_mask])
    # Identifier columns go to the index so that stack() only folds `to_stack`.
    long_frame = frame.set_index(id_cols).stack().reset_index()
    long_frame.columns = id_cols + new_names
    return long_frame

# Load data from Yahoo Finance

In [None]:
def load_close_prices(tickers):
    """Download the full 'Close' history of each ticker and return it in long format.

    Each ticker's history is fetched with ``yf.Ticker(tic).history(period='max')``,
    re-indexed on the sorted union of all dates, and forward-filled.

    NOTE(review): the original comment here stated that prices are not adjusted
    for splits or dividends; that depends on the defaults of yfinance's
    ``history`` (e.g. its adjustment options) -- confirm against the installed
    version.

    Parameters
    ----------
    tickers : list of str
        Ticker symbols; also used as the column labels before stacking.

    Returns
    -------
    pandas.DataFrame
        Columns ``['date', 'ticker', 'price']``.
    """
    history = {}
    for tic in tickers:
        history[tic] = yf.Ticker(tic).history(period='max')
    date_sets = [set(data.index) for data in history.values()]
    all_dates = sorted(set.union(*date_sets))
    close_columns = []
    for tic in tickers:
        aligned = history[tic].reindex(index=all_dates)
        close_columns.append(aligned['Close'].ffill())
    closes = pd.concat(close_columns, axis=1)
    closes.columns = tickers
    stacked = closes.stack().reset_index()
    stacked.columns = ['date', 'ticker', 'price']
    return stacked

In [None]:
%%time
# Smoke test of the loader on three tickers; the last line displays the long-format frame.
tickers = ['msft', 'aapl', 'goog']
closes = load_close_prices(tickers)
closes

# Signal preprocessing

In [None]:
def gen_membership(xs_len, mean_prop, start='2010', end='2020', seed=None):
    """Generate a random boolean membership matrix indexed by business days.

    Parameters
    ----------
    xs_len : int
        Number of columns (cross-section size).
    mean_prop : float
        Probability that any given entry is True.
    start, end : str or datetime-like, optional
        Bounds passed to ``pd.bdate_range``; the defaults reproduce the
        original hard-coded '2010' to '2020' range.
    seed : int or None, optional
        If None (default), draws come from numpy's legacy global RNG
        (``np.random.rand``), exactly as before. Otherwise a dedicated
        ``np.random.default_rng(seed)`` is used so the result is reproducible.

    Returns
    -------
    pandas.DataFrame
        Boolean frame of shape ``(number of business days, xs_len)``.
    """
    dates = pd.bdate_range(start, end)
    shape = (len(dates), xs_len)
    if seed is None:
        draws = np.random.rand(*shape)
    else:
        draws = np.random.default_rng(seed).random(shape)
    return pd.DataFrame(index=dates, data=draws < mean_prop)

def gen_data(membership, seed=None):
    """Generate standard-normal data shaped like `membership`, NaN outside it.

    Parameters
    ----------
    membership : pandas.DataFrame
        Boolean frame; its index is reused, and entries where it is False are
        masked to NaN in the result.
    seed : int or None, optional
        If None (default), draws come from numpy's legacy global RNG
        (``np.random.randn``), exactly as before. Otherwise a dedicated
        ``np.random.default_rng(seed)`` is used so the result is reproducible.

    Returns
    -------
    pandas.DataFrame
        Frame with the same shape and index as ``membership``.
    """
    if seed is None:
        data = np.random.randn(*membership.shape)
    else:
        data = np.random.default_rng(seed).standard_normal(membership.shape)
    return (
        pd.DataFrame(index=membership.index, data=data)
        .where(membership))

def plot_desc(descs, **plot_pars):
    """Plot ``describe()``-style statistics in up to three separate figures.

    The columns of `descs` are split into three groups, each plotted on its
    own: ``'count'``; ``'mean'`` and ``'std'``; and every remaining column
    (min, max, percentiles, ...). Groups with no matching column are skipped.

    Parameters
    ----------
    descs : pandas.DataFrame
        One column per statistic.
    **plot_pars
        Extra keyword arguments forwarded to ``DataFrame.plot``.
    """
    available = set(descs.columns)
    summary_stats = ['count', 'mean', 'std']
    # Lists (not sets): recent pandas versions reject a set as a `.loc` indexer.
    column_groups = [
        [name for name in ['count'] if name in available],
        [name for name in ['mean', 'std'] if name in available],
        list(descs.columns[~descs.columns.isin(summary_stats)]),
    ]
    for group in column_groups:
        if not group:
            # Nothing to draw for this group; plotting an empty frame would fail.
            continue
        # DataFrame.plot opens its own figure when no `ax` is given, so no
        # explicit plt.figure() is needed (it would be left blank).
        descs.loc[:, group].plot(grid=True, **plot_pars)
        plt.show()

def plot_desc_pfo_matrix(weights, percentiles=None, **plot_pars):
    """Plot summary statistics computed along each row of `weights`.

    ``describe`` is applied to the transposed frame and the result transposed
    back, giving one row of statistics per row of `weights`; the statistics
    are then handed to ``plot_desc``.

    Parameters
    ----------
    weights : pandas.DataFrame
    percentiles : list-like or None
        Forwarded to ``DataFrame.describe``.
    **plot_pars
        Forwarded to ``plot_desc``.
    """
    transposed = weights.T
    per_row_stats = transposed.describe(percentiles).T
    plot_desc(per_row_stats, **plot_pars)

def standard_scaler(weights):
    """Standardize each row: subtract the row mean, then divide by the row std.

    Parameters
    ----------
    weights : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Same shape as `weights`.
    """
    row_means = weights.mean(axis=1)
    row_stds = weights.std(axis=1)
    centered = weights.sub(row_means, axis=0)
    return centered.div(row_stds, axis=0)

def min_diff(line):
    """Return the smallest strictly positive gap between sorted values of `line`.

    Returns NaN when no strictly positive gap exists (for instance when all
    non-missing values are equal, or there are fewer than two of them).
    """
    diffs = line.sort_values().diff()
    return diffs[diffs > 0].min()

def perturbation_with_ranks_preserved_if_different(weights, seed=0):
    """Add a small random +/- shift to every entry without reordering distinct values.

    Each entry of a row is moved by ``+/- 0.4 * (smallest positive gap of the row)``.
    Two values at least one gap apart can get closer by at most 0.8 gap, so
    their order is preserved; tied values may be separated when they draw
    opposite signs.

    Rows without any strictly positive gap have no scale to perturb by; they
    are returned unchanged (previously the NaN gap turned the whole row into
    NaN).

    Parameters
    ----------
    weights : pandas.DataFrame
    seed : int
        Seed for ``np.random.default_rng``.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as `weights`.
    """
    rng = np.random.default_rng(seed)
    random_signs = np.sign(rng.uniform(-1, 1, weights.shape))  # a.s. != 0
    random_signs = pd.DataFrame(data=random_signs, index=weights.index, columns=weights.columns)
    # NaN gap -> 0 amplitude, so such rows are left as they are.
    min_diffs = weights.apply(min_diff, axis=1).fillna(0.0)
    return weights + random_signs.mul(0.4 * min_diffs, axis=0)  # 0.4 < 0.5 keeps the order

def ranks_uniform_transformer(weights, abso=True, break_ties=False, seed=0):
    """Replace each row by an affine rescaling of its ranks.

    Ranks are computed along each row, centered on their row mean and scaled
    so that the extreme rank farthest from the mean maps to -1 or +1.

    Parameters
    ----------
    weights : pandas.DataFrame
    abso : bool
        If True (default) return the centered, scaled ranks as is. If False,
        additionally apply ``(x + 1) / 2``.
    break_ties : bool
        If True, first perturb `weights` with
        ``perturbation_with_ranks_preserved_if_different``.
    seed : int
        Random seed used to break ties (only when ``break_ties=True``).

    Returns
    -------
    pandas.DataFrame
        Same shape as `weights`.
    """
    if break_ties:
        weights = perturbation_with_ranks_preserved_if_different(weights, seed)
    ranks = weights.rank(axis=1)
    means = ranks.mean(axis=1)
    extremes = ranks.agg(['min', 'max'], axis=1)
    # Largest distance from the mean rank to either extreme rank, per row.
    half_width = extremes.sub(means, axis=0).abs().max(axis=1)
    beta = 1 / half_width
    unif = ranks.mul(beta, axis=0).add(-beta * means, axis=0)
    if abso:
        return unif
    return unif.add(1).div(2)

In [None]:
IND = 150  # row position inspected by the per-row histograms in later cells
# Random universe membership (5000 columns, each entry True with probability 0.5).
membership = gen_membership(5000, 0.5)

# Two independent random frames masked by the same membership.
weights = gen_data(membership)
betas = gen_data(membership)
plot_desc_pfo_matrix(weights)

In [None]:
# test of perturbation_with_ranks_preserved_if_different
weights_noised = perturbation_with_ranks_preserved_if_different(weights)
ranks = weights.rank(axis=1)
ranks_noised = weights_noised.rank(axis=1)
# NOTE: despite its name, this frame flags positions where the ranks DIFFER (`!=`),
# restricted to member entries.
are_equal_ranks = (ranks != ranks_noised).where(membership)
are_equal_ranks.sum(axis=1).plot(grid=True)
# (number of positions whose rank changed, number of values left exactly unchanged)
(ranks.fillna(0) != ranks_noised.fillna(0)).sum().sum(), (weights == weights_noised).sum().sum()

In [None]:
# Row-wise standardization of the weights, then per-row summary statistics.
standardized = standard_scaler(weights)
plot_desc_pfo_matrix(standardized)

In [None]:
# Histogram of row IND before (faded) and after standardization.
weights.iloc[IND].hist(bins=100, alpha=0.3)
standardized.iloc[IND].hist(bins=100)

In [None]:
# Rank-based rescaling of the weights (defaults: abso=True, no tie breaking).
uniformized = ranks_uniform_transformer(weights)
plot_desc_pfo_matrix(uniformized)

In [None]:
# Histogram of row IND before (faded) and after the rank transform.
weights.iloc[IND].hist(bins=100, alpha=0.3)
uniformized.iloc[IND].hist(bins=100)

# Neutralize

## Utils and tests

In [None]:
def proj_line(inner_prod, vec, unit_dir_vec):
    """Return ``inner_prod(vec, unit_dir_vec) * unit_dir_vec``.

    This is the component of `vec` along `unit_dir_vec` when the latter has
    unit length under `inner_prod` (not checked here).
    """
    coefficient = inner_prod(vec, unit_dir_vec)
    return coefficient * unit_dir_vec

def normalize(inner_prod, vec):
    """Scale `vec` by ``inner_prod(vec, vec) ** -0.5``, i.e. to unit length under `inner_prod`."""
    inverse_norm = inner_prod(vec, vec) ** -.5
    return vec * inverse_norm

def proj_orthonormal_basis(inner_prod, vec, orthon_basis):
    """Remove from `vec` its components along every vector of `orthon_basis`.

    Returns ``vec`` minus the sum of ``proj_line(inner_prod, vec, b)`` over the
    basis vectors ``b`` -- that is, the residual of `vec`, not its projection
    onto the span. With an empty basis, `vec` itself (minus 0) is returned.
    """
    along_basis = 0
    for vec_dir in orthon_basis:
        along_basis = along_basis + proj_line(inner_prod, vec, vec_dir)
    return vec - along_basis

def canonical_orthonormalize(vecs):
    """Orthonormalize `vecs` with an economic QR decomposition.

    The vectors are placed as columns of a matrix; the columns of its Q factor
    are returned as a list. No inner product is passed in: the decomposition
    is scipy's ``qr``.
    """
    as_columns = np.array(vecs).T
    q_factor, _ = qr(as_columns, mode='economic')
    return list(q_factor.T)

def gram_schmidt_process(inner_prod, vecs):
    """Orthonormalize `vecs` under `inner_prod`, one vector at a time.

    Each vector has its components along the basis built so far removed
    (``proj_orthonormal_basis``) and is then normalized (``normalize``).
    Returns the list of resulting vectors, in input order.
    """
    basis = []
    for vec in vecs:
        residual = proj_orthonormal_basis(inner_prod, vec, basis)
        basis = basis + [normalize(inner_prod, residual)]
    return basis

def ortho_proj(inner_prod, vec, vecs):
    """Remove from `vec` its components along the span of `vecs`.

    `vecs` is first orthonormalized, then handed to ``proj_orthonormal_basis``.
    The orthonormalization is chosen from the *name* of `inner_prod`: a
    callable named ``'dot'`` (such as ``np.dot``) uses the QR-based
    ``canonical_orthonormalize``; anything else uses ``gram_schmidt_process``
    with `inner_prod`.

    Parameters
    ----------
    inner_prod : callable
        Two-argument inner product.
    vec : array-like
        Vector to project.
    vecs : sequence of array-like
        Vectors whose span is removed from `vec`.
    """
    # getattr: callables such as functools.partial objects carry no `__name__`,
    # and must fall back to Gram-Schmidt instead of raising AttributeError.
    if getattr(inner_prod, '__name__', None) == 'dot':
        orthon_basis = canonical_orthonormalize(vecs)
    else:
        orthon_basis = gram_schmidt_process(inner_prod, vecs)
    return proj_orthonormal_basis(inner_prod, vec, orthon_basis)

def gram_matrix(inner_prod, vecs):
    """Return the array whose entry ``[i][j]`` is ``inner_prod(vecs[j], vecs[i])``."""
    rows = []
    for vec_2 in vecs:
        row = []
        for vec_1 in vecs:
            row.append(inner_prod(vec_1, vec_2))
        rows.append(row)
    return np.array(rows)

def proj_hyperplane(inner_prod, vec, or_vec):
    """Remove from `vec` its component along `or_vec`.

    `or_vec` is normalized under `inner_prod` and used as a one-vector basis
    for ``proj_orthonormal_basis``.
    """
    unit_normal = normalize(inner_prod, or_vec)
    return proj_orthonormal_basis(inner_prod, vec, [unit_normal])

In [None]:
# Checks with the Euclidean inner product: the normalized vector should have norm 1.
inn = np.dot
vec = np.random.randn(10)
unit_vec = normalize(inn, vec)
np.linalg.norm(unit_vec)

In [None]:
# proj is a multiple of unit_vec, so |<proj, unit_vec>| should equal the product of the norms.
proj = proj_line(inn, vec, unit_vec)
np.abs(inn(proj, unit_vec)), np.linalg.norm(proj) * np.linalg.norm(unit_vec)

In [None]:
# Gram matrix of the QR-orthonormalized basis: expected to be close to the identity.
vecs = [np.random.randn(10) for _ in range(3)]
orthon_basis = canonical_orthonormalize(vecs)
gram_matrix(inn, orthon_basis)

In [None]:
# The residual should have (numerically) zero inner product with every basis vector.
proj = proj_orthonormal_basis(inn, vec, orthon_basis)
[inn(proj, vec_) for vec_ in orthon_basis]

In [None]:
# Same check with the Gram-Schmidt basis: Gram matrix expected close to the identity.
orthon_basis_2 = gram_schmidt_process(inn, vecs)
gram_matrix(inn, orthon_basis_2)

In [None]:
# Element-wise difference between the QR and Gram-Schmidt bases
# (presumably equal up to a sign per vector -- TODO confirm).
[vec_1 - vec_2 for vec_1, vec_2 in zip(orthon_basis, orthon_basis_2)]

In [None]:
# Residual computed from the Gram-Schmidt basis, tested against the QR basis.
proj_2 = proj_orthonormal_basis(inn, vec, orthon_basis_2)
[inn(proj_2, vec_) for vec_ in orthon_basis]

In [None]:
# Difference between the residuals obtained from the two bases.
proj - proj_2

In [None]:
# ortho_proj takes its QR branch here because np.dot is named 'dot'.
proj_3 = ortho_proj(inn, vec, vecs)
proj_3 - proj_2

In [None]:
# Same checks with the off-diagonal entry of np.cov as the "inner product".
# np.linalg.norm below is still the Euclidean norm, so it need not be 1 here.
inn = lambda x, y: np.cov(x, y)[0][1]
vec = np.random.randn(10)
unit_vec = normalize(inn, vec)
np.linalg.norm(unit_vec)

In [None]:
# |inn(proj, unit_vec)| next to the product of Euclidean norms (different geometries now).
proj = proj_line(inn, vec, unit_vec)
np.abs(inn(proj, unit_vec)), np.linalg.norm(proj) * np.linalg.norm(unit_vec)

In [None]:
# canonical_orthonormalize does not take `inn` (it is QR-based), so this Gram
# matrix under `inn` need not be the identity.
vecs = [np.random.randn(10) for _ in range(3)]
orthon_basis = canonical_orthonormalize(vecs)
gram_matrix(inn, orthon_basis)

In [None]:
# Residual w.r.t. the QR basis, tested with `inn`.
proj = proj_orthonormal_basis(inn, vec, orthon_basis)
[inn(proj, vec_) for vec_ in orthon_basis]

In [None]:
# Gram-Schmidt under `inn`: this Gram matrix is expected to be close to the identity.
orthon_basis_2 = gram_schmidt_process(inn, vecs)
gram_matrix(inn, orthon_basis_2)

In [None]:
# Element-wise difference between the QR basis and the Gram-Schmidt basis built under `inn`.
[vec_1 - vec_2 for vec_1, vec_2 in zip(orthon_basis, orthon_basis_2)]

In [None]:
# Residual from the Gram-Schmidt basis; note it is tested against the QR basis (orthon_basis).
proj_2 = proj_orthonormal_basis(inn, vec, orthon_basis_2)
[inn(proj_2, vec_) for vec_ in orthon_basis]

In [None]:
# Difference between the residuals obtained from the two bases.
proj - proj_2

In [None]:
# `inn` is a lambda (not named 'dot'), so ortho_proj takes its Gram-Schmidt branch;
# the difference with proj_2 should be (numerically) zero.
proj_3 = ortho_proj(inn, vec, vecs)
proj_3 - proj_2

In [None]:
# After proj_hyperplane, inn(proj, or_vec) should be (numerically) zero.
or_vec = np.random.randn(10)
proj = proj_hyperplane(inn, vec, or_vec)
inn(proj, or_vec)

## Neutralizer (single factor)

In [None]:
def neutralize(inn, weights, betas, membership):
    """Remove from each row of `weights` its component along the same row of `betas`.

    Both frames are masked by `membership`, with non-members set to 0, and
    transposed so that each original row becomes a column. The columns are
    then paired with ``DataFrame.combine`` and passed to ``proj_hyperplane``.
    The result is transposed back and masked by `membership` again.

    Parameters
    ----------
    inn : callable
        Two-argument inner product passed to ``proj_hyperplane``.
    weights, betas, membership : pandas.DataFrame
        `membership` is used as a boolean mask on the other two.
    """
    def drop_beta_component(weights_col, betas_col):
        return proj_hyperplane(inn, weights_col, betas_col)

    weights_t = weights.where(membership).fillna(0).T
    betas_t = betas.where(membership).fillna(0).T
    neutral_t = weights_t.combine(betas_t, drop_beta_component)
    return neutral_t.T.where(membership)

In [None]:
# Neutralize both transformed weight frames against `betas` with the Euclidean inner product.
neutralized_standardized = neutralize(np.dot, standardized, betas, membership)
neutralized_uniformized = neutralize(np.dot, uniformized, betas, membership)

In [None]:
# Per-row sum of weights * betas, before and after neutralization.
standardized.mul(betas).sum(axis=1).plot()
neutralized_standardized.mul(betas).sum(axis=1).plot()

In [None]:
# Per-row summary statistics of the neutralized standardized weights.
plot_desc_pfo_matrix(neutralized_standardized)

In [None]:
# Histogram of row IND before (faded) and after neutralization.
standardized.iloc[IND].hist(bins=100, alpha=0.3)
neutralized_standardized.iloc[IND].hist(bins=100)

In [None]:
# Per-row sum of weights * betas, before and after neutralization (rank-transformed weights).
uniformized.mul(betas).sum(axis=1).plot()
neutralized_uniformized.mul(betas).sum(axis=1).plot()

In [None]:
# Per-row summary statistics of the neutralized rank-transformed weights.
plot_desc_pfo_matrix(neutralized_uniformized)

In [None]:
# Histogram of row IND before (faded) and after neutralization.
uniformized.iloc[IND].hist(bins=100, alpha=0.3)
neutralized_uniformized.iloc[IND].hist(bins=100)

## Neutralizers (several factors, parallel over dates)

In [None]:
def neutralize_multi(select_date, membership):
    """Apply `select_date` to every index label of `membership` in a process pool.

    Parameters
    ----------
    select_date : callable
        Takes one index label and returns a ``(label, values)`` pair; it is
        sent to ``multiprocessing.Pool`` workers.
    membership : pandas.DataFrame
        Its index provides the labels to process; it is also used to mask the
        assembled result.

    Returns
    -------
    pandas.DataFrame
        One row per label (results arrive unordered, hence the final
        ``sort_index``), masked by `membership`.
    """
    with Pool() as pool:
        # The iterator must be consumed before the pool is closed.
        label_value_pairs = pool.imap_unordered(select_date, membership.index)
        residuals = dict(label_value_pairs)
    by_label = pd.DataFrame(residuals).T
    return by_label.where(membership).sort_index()

# Five random factor frames sharing the same membership mask.
factors = [gen_data(membership) for _ in range(5)]
# Module-level inputs read by `select_date`; non-members are set to 0.
kwargs = {
    'weights_0': standardized.where(membership).fillna(0),
    'factors_0': [factor.where(membership).fillna(0) for factor in factors],
    # "Inner product" = off-diagonal entry of the 2x2 np.cov matrix.
    'inn': lambda x, y: np.cov(x, y)[0][1],
    # inn = lambda x, y: np.ma.cov(np.ma.masked_invalid(x), np.ma.masked_invalid(y))[0][1]
}

In [None]:
%%time
def select_date(date):
    """Neutralize the weights of one date against that date's factor rows.

    Reads the module-level ``kwargs`` dict (keys ``'weights_0'``,
    ``'factors_0'`` and ``'inn'``) and returns ``(date, residual)``, where the
    residual comes from ``ortho_proj``.
    """
    inner_prod = kwargs['inn']
    weights_row = kwargs['weights_0'].loc[date, :]
    factor_rows = [factor.loc[date, :] for factor in kwargs['factors_0']]
    return date, ortho_proj(inner_prod, weights_row, factor_rows)

# First run uses kwargs['weights_0'] as initially set (standardized weights).
neutralized_standardized = neutralize_multi(select_date, membership)

# Swap the weights in the shared dict, then run again on the rank-transformed weights.
# NOTE(review): this relies on the pool workers seeing the updated `kwargs`
# (a new Pool is created inside each neutralize_multi call) -- confirm for the
# process start method in use.
kwargs['weights_0'] = uniformized.where(membership).fillna(0)
neutralized_uniformized = neutralize_multi(select_date, membership)

In [None]:
%%time
# Row-wise correlation between the neutralized weights and each factor (one column per factor).
corrs_neutralized_standardized = pd.concat([
    neutralized_standardized.corrwith(factor, axis=1) for factor in factors],
    axis=1)
corrs_neutralized_standardized.plot(grid=True)

In [None]:
# Per-row summary statistics after multi-factor neutralization (standardized weights).
plot_desc_pfo_matrix(neutralized_standardized)

In [None]:
# Histogram of row IND before (faded) and after multi-factor neutralization.
standardized.iloc[IND].hist(bins=100, alpha=0.3)
neutralized_standardized.iloc[IND].hist(bins=100)

In [None]:
%%time
# Row-wise correlation between the neutralized rank-transformed weights and each factor.
corrs_neutralized_uniformized = pd.concat([
    neutralized_uniformized.corrwith(factor, axis=1) for factor in factors],
    axis=1)
corrs_neutralized_uniformized.plot(grid=True)

In [None]:
# Per-row summary statistics after multi-factor neutralization (rank-transformed weights).
plot_desc_pfo_matrix(neutralized_uniformized)

In [None]:
# Histogram of row IND before (faded) and after multi-factor neutralization.
uniformized.iloc[IND].hist(bins=100, alpha=0.3)
neutralized_uniformized.iloc[IND].hist(bins=100)

# Clustering correlation matrix

In [None]:
# source: https://gmarti.gitlab.io/qfin/2020/03/22/herc-part-i-implementation.html

In [None]:
def sort_corr(corr_df):
    """Hierarchically cluster a correlation matrix and reorder it by leaf order.

    The dissimilarity ``1 - correlation`` is taken on the strict upper
    triangle (condensed form), clustered with Ward linkage, and the leaves are
    ordered with ``optimal_leaf_ordering``.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Square correlation matrix; its column labels name the items.

    Returns
    -------
    link : ndarray
        The linkage as returned by ``linkage`` (not the leaf-reordered one).
    perm : ndarray
        Leaf order of the optimally ordered linkage.
    sorted_corr_df : pandas.DataFrame
        `corr_df` with rows and columns permuted by ``perm``.
    """
    labels = np.array(list(corr_df))
    corr = corr_df.values
    upper_triangle = np.triu_indices(len(corr_df), k=1)
    condensed = (1 - corr)[upper_triangle]
    link = linkage(condensed, method='ward')
    ordered_link = optimal_leaf_ordering(link, condensed)
    perm = leaves_list(ordered_link)
    permuted_labels = labels[perm]
    sorted_corr_df = pd.DataFrame(
        data=corr[perm, :][:, perm], index=permuted_labels, columns=permuted_labels)
    return link, perm, sorted_corr_df

def cut_linkage(link, n_clusters):
    """Cut a linkage into at most `n_clusters` flat clusters and count their sizes.

    Returns a list of ``(cluster_id, size)`` pairs sorted by cluster id, where
    the ids are those assigned by ``fcluster(..., criterion='maxclust')``.
    """
    cluster_ids = fcluster(link, n_clusters, criterion='maxclust')
    size_by_id = Counter(cluster_ids)
    return [(c_id, size_by_id[c_id]) for c_id in sorted(size_by_id)]

def plot_clusters(sorted_corr_df, clusters_sizes):
    """Draw the sorted correlation matrix with a red square around each cluster.

    Consecutive blocks along the diagonal are outlined, their sizes taken from
    `clusters_sizes` in the order given; the labels falling in each block are
    printed as a list of lists.

    NOTE(review): this assumes `clusters_sizes` lists the clusters in the same
    order as they appear along the axes of `sorted_corr_df` -- confirm that the
    cluster ids it was built from follow that leaf ordering.

    Parameters
    ----------
    sorted_corr_df : pandas.DataFrame
        Square matrix already permuted into cluster order.
    clusters_sizes : sequence of (cluster_id, size) pairs
    """
    plt.figure(figsize=(8, 8))
    plt.pcolormesh(sorted_corr_df)
    bounds = np.cumsum([0] + [size for _, size in clusters_sizes])
    dim = len(sorted_corr_df)
    for start, stop in zip(bounds[:-1], bounds[1:]):
        # axvline/axhline take their extent as a fraction of the axis.
        start_frac, stop_frac = start / dim, stop / dim
        for edge in (start, stop):
            plt.axvline(x=edge, ymin=start_frac, ymax=stop_frac, color='r')
        for edge in (start, stop):
            plt.axhline(y=edge, xmin=start_frac, xmax=stop_frac, color='r')
    remaining_labels = iter(list(sorted_corr_df))
    print([list(islice(remaining_labels, size)) for _, size in clusters_sizes])
    plt.show()

In [None]:
%%time
# Universe of tickers to cluster.
yf_tickers = [
    'nio', 'dwac', 'edu', 'didi', 'gme', 'aapl', 'tsla', 'amc', 'pg', 'f', 'snap', 'amzn',
    'dis', 'msft', 'ge', 'rivn', 'bros', 'goog', 'googl', 'ccl', 'amd', 'nvda']
# Long-format prices -> wide (date x ticker) frame -> simple returns.
rets = (
    load_close_prices(yf_tickers)
    .pivot(index='date', columns='ticker', values='price')
    .pct_change()
#     .dropna()
)

In [None]:
%%time
# Missing returns are replaced by 0 before computing correlations.
corr_df = rets.fillna(0).corr()  # TODO: handle nan more properly...
link, perm, sorted_corr_df = sort_corr(corr_df)

In [None]:
%%time
# Cut the hierarchy into at most n_clusters flat clusters and outline them on the sorted matrix.
n_clusters = 6
sizes = cut_linkage(link, n_clusters)
plot_clusters(sorted_corr_df, sizes)