In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Business-day calendar used as the row index of every simulated frame.
dates = pd.bdate_range(start='2000', end='2020')
n_dates = dates.shape[0]  # number of rows in each simulated frame
xs_len = 4000             # number of columns in each simulated frame
IND = 150                 # row position inspected by the histogram cells

def plot_desc(weights):
    """Plot per-row summary statistics of ``weights`` as three line charts.

    ``DataFrame.describe`` summarises columns, so the frame is transposed
    before the call and the result transposed back: every statistic ends up
    as a column indexed like the rows of ``weights``.

    One figure is drawn per group of statistics: the count, the mean and
    standard deviation, and the min / quartiles / max.

    Parameters
    ----------
    weights : pandas.DataFrame
        Frame whose rows are summarised.

    Returns
    -------
    None
        The figures are shown as a side effect.
    """
    descs = weights.T.describe().T
    stat_groups = [['count'], ['mean', 'std'], ['min', '25%', '50%', '75%', 'max']]
    for cols in stat_groups:
        # DataFrame.plot opens a figure of its own unless it is handed an
        # Axes, so a bare plt.figure() would leave a blank figure behind.
        _, ax = plt.subplots()
        descs.loc[:, cols].plot(ax=ax)
        plt.show()

In [None]:
# Seed NumPy's global generator so the simulated frames, and every figure
# derived from them, are identical on each Restart & Run All.
np.random.seed(0)

# Boolean mask: a cell is True when its uniform draw exceeds 0.75.
membership = pd.DataFrame(
    data=np.random.rand(n_dates, xs_len) > 0.75,
    index=dates)
membership.sum(axis=1).plot()

# Standard-normal draws kept only where the mask is True (NaN elsewhere).
weights = pd.DataFrame(
    data=np.random.randn(n_dates, xs_len),
    index=dates).where(membership)
weights.count(axis=1).plot()

# Independent standard-normal draws, masked the same way as the weights.
betas = pd.DataFrame(
    data=np.random.randn(n_dates, xs_len),
    index=dates).where(membership)
betas.count(axis=1).plot()

In [None]:
# Baseline: per-date summary statistics of the raw weights.
plot_desc(weights)

In [None]:
%%time

def standard_scaler(weights):
    """Z-score every row of ``weights``.

    Each row has its own mean subtracted and is then divided by its own
    standard deviation, both computed with the pandas defaults.

    Parameters
    ----------
    weights : pandas.DataFrame
        Frame to standardize row by row.

    Returns
    -------
    pandas.DataFrame
        Frame with the same shape and labels as ``weights``.
    """
    row_mean = weights.mean(axis=1)
    row_std = weights.std(axis=1)
    centered = weights.sub(row_mean, axis=0)
    return centered.div(row_std, axis=0)

# Row-wise z-scores of the raw weights.
standardized = standard_scaler(weights)

In [None]:
# Per-date summary statistics after standardization.
plot_desc(standardized)

In [None]:
# Histograms of one row (position IND) before and after standardization;
# the raw values are drawn translucent.
weights.iloc[IND].hist(bins=100, alpha=0.3)
standardized.iloc[IND].hist(bins=100)

In [None]:
%%time

def uniform_ranks_scaler(weights):
    """Replace each row by its ranks, centred on zero and scaled into [-1, 1].

    Values are ranked within each row (pandas defaults), the row's mean rank
    is removed, and the result is divided by the larger of the distances
    from that mean to the row's smallest and largest rank.

    Parameters
    ----------
    weights : pandas.DataFrame
        Frame to transform row by row.

    Returns
    -------
    pandas.DataFrame
        Frame with the same shape and labels as ``weights``.
    """
    ranks = weights.rank(axis=1)
    centre = ranks.mean(axis=1)

    # Distance from the mean rank to each extreme rank of the row.
    below = (ranks.min(axis=1) - centre).abs()
    above = (ranks.max(axis=1) - centre).abs()
    half_range = below.where(below >= above, above)

    scale = 1 / half_range
    # Same arithmetic form as scale * ranks + (-scale * centre).
    return ranks.mul(scale, axis=0).add(-scale * centre, axis=0)

# Centred, rescaled row-wise ranks of the raw weights.
uniformized = uniform_ranks_scaler(weights)

In [None]:
# Per-date summary statistics after the rank transform.
plot_desc(uniformized)

In [None]:
# Histograms of one row (position IND) before and after the rank transform;
# the raw values are drawn translucent.
weights.iloc[IND].hist(bins=100, alpha=0.3)
uniformized.iloc[IND].hist(bins=100)

In [None]:
%%time

def proj_hyperplane(weights, betas):
    """Remove from ``weights`` its component along ``betas``.

    The result is ``weights`` minus ``(weights . betas / betas . betas)``
    times ``betas``, so its dot product with ``betas`` is zero up to
    rounding.

    Parameters
    ----------
    weights, betas : 1-D array-like
        Vectors of equal length accepted by ``np.dot``.

    Returns
    -------
    Same kind of object as ``weights - betas``.

    Raises
    ------
    AssertionError
        If the squared norm of ``betas`` is not greater than 1e-6.
    """
    beta_norm_sq = np.dot(betas, betas)
    assert beta_norm_sq > 1e-6, 'betas is too close to 0'
    # Coefficient of weights along betas.
    coeff = np.dot(weights, betas) / beta_norm_sq
    return weights - coeff * betas

def neutralize(weights, betas, membership):
    """Project each row of ``weights`` orthogonally to the same row of ``betas``.

    Only cells where ``membership`` is true take part: other cells are set
    to zero before the projection and are masked out (NaN) in the result.

    Parameters
    ----------
    weights, betas, membership : pandas.DataFrame
        Identically labelled frames; ``membership`` is used as the
        condition of ``DataFrame.where``.

    Returns
    -------
    pandas.DataFrame
        Projected weights in the orientation of ``weights``.

    Raises
    ------
    AssertionError
        Propagated from ``proj_hyperplane`` when a row of masked betas is
        too close to zero.
    """
    def _members_by_column(frame):
        # Zero out non-members, then transpose so each original row becomes
        # a column, which is the unit DataFrame.combine operates on.
        return frame.where(membership).fillna(0).T

    projected = _members_by_column(weights).combine(
        _members_by_column(betas), proj_hyperplane)
    # Restore the original orientation and re-apply the membership mask.
    return projected.T.where(membership)

# Remove the betas component from both scaled variants of the weights.
neutralized_standardized = neutralize(standardized, betas, membership)
neutralized_uniformized = neutralize(uniformized, betas, membership)

In [None]:
# Row-wise sum of weights * betas, before and after neutralization.
standardized.mul(betas).sum(axis=1).plot()
neutralized_standardized.mul(betas).sum(axis=1).plot()

In [None]:
# Per-date summary statistics of the neutralized standardized weights.
plot_desc(neutralized_standardized)

In [None]:
# Histograms of one row (position IND) before and after neutralization;
# the pre-neutralization values are drawn translucent.
standardized.iloc[IND].hist(bins=100, alpha=0.3)
neutralized_standardized.iloc[IND].hist(bins=100)

In [None]:
# Row-wise sum of weights * betas, before and after neutralization.
uniformized.mul(betas).sum(axis=1).plot()
neutralized_uniformized.mul(betas).sum(axis=1).plot()

In [None]:
# Per-date summary statistics of the neutralized rank-scaled weights.
plot_desc(neutralized_uniformized)

In [None]:
# Histograms of one row (position IND) before and after neutralization;
# the pre-neutralization values are drawn translucent.
uniformized.iloc[IND].hist(bins=100, alpha=0.3)
neutralized_uniformized.iloc[IND].hist(bins=100)