In [None]:
import numpy as np
# import pandas as pd
import matplotlib.pyplot as plt

# Utils

In [None]:
def cdf(rands):
    """Return the empirical CDF of a 1-D sample as ``(sorted_vals, probs)``.

    Parameters
    ----------
    rands : array-like
        One-dimensional collection of sample values.

    Returns
    -------
    sorted_vals : numpy.ndarray
        The sample values in ascending order.
    probs : numpy.ndarray
        Cumulative probabilities ``1/n, 2/n, ..., 1`` paired element-wise with
        ``sorted_vals``, where ``n`` is the sample size.

    Both arrays are empty when ``rands`` is empty.
    """
    # np.sort works on the array buffer directly; the builtin sorted() would
    # box every element into a Python object first, which is very slow for
    # large samples.
    sorted_vals = np.sort(np.asarray(rands))
    n_vals = sorted_vals.size
    if n_vals == 0:
        # An empty sample has no steps; this also avoids evaluating 1 / 0.
        return sorted_vals, np.empty(0)
    return sorted_vals, np.linspace(1 / n_vals, 1, n_vals)

def interpolate(sorted_vals_1, probs_1, sorted_vals_2, probs_2):
    """Evaluate two tabulated CDFs on one shared grid inside their common range.

    The grid consists of every value from either input that lies in the
    closed interval where the two value ranges overlap, in ascending order
    (duplicates are kept). Each CDF is linearly interpolated onto that grid
    with ``np.interp``.

    Parameters
    ----------
    sorted_vals_1, sorted_vals_2 : array-like
        Ascending x-coordinates of each CDF (``np.interp`` requires
        increasing sample points).
    probs_1, probs_2 : array-like
        CDF values paired element-wise with the corresponding ``sorted_vals``.

    Returns
    -------
    xxx : numpy.ndarray
        The shared, ascending grid.
    yyy_1, yyy_2 : numpy.ndarray
        The two CDFs evaluated on ``xxx``.

    All three arrays are empty when the two ranges do not overlap.

    Raises
    ------
    ValueError
        If either value array is empty.
    """
    sorted_vals_1 = np.asarray(sorted_vals_1)
    sorted_vals_2 = np.asarray(sorted_vals_2)

    # Vectorised reductions instead of the builtin min()/max(), which iterate
    # over the arrays element by element in Python.
    left = max(np.min(sorted_vals_1), np.min(sorted_vals_2))
    right = min(np.max(sorted_vals_1), np.max(sorted_vals_2))

    pooled = np.concatenate([sorted_vals_1, sorted_vals_2])
    in_overlap = (pooled >= left) & (pooled <= right)
    # np.sort keeps the grid an ndarray (like yyy_1 / yyy_2) and avoids the
    # Python-level sort of the builtin sorted().
    xxx = np.sort(pooled[in_overlap])

    yyy_1 = np.interp(xxx, sorted_vals_1, probs_1)
    yyy_2 = np.interp(xxx, sorted_vals_2, probs_2)
    return xxx, yyy_1, yyy_2

def ks_dist(sorted_vals_1, probs_1, sorted_vals_2, probs_2):
    """Return the largest absolute gap between two tabulated CDFs.

    Both CDFs are put on a shared grid by ``interpolate`` and the maximum of
    ``|yyy_1 - yyy_2|`` over that grid is returned. ``interpolate`` only
    covers the interval where the two value ranges overlap, so gaps outside
    that interval are not examined.

    Parameters
    ----------
    sorted_vals_1, sorted_vals_2 : array-like
        Ascending x-coordinates of each CDF.
    probs_1, probs_2 : array-like
        CDF values paired element-wise with the corresponding ``sorted_vals``.

    Returns
    -------
    float
        The maximum absolute difference on the shared grid, or ``1.0`` when
        the value ranges do not overlap at all.
    """
    _, yyy_1, yyy_2 = interpolate(sorted_vals_1, probs_1, sorted_vals_2, probs_2)
    if len(yyy_1) == 0:
        # Disjoint ranges leave nothing to compare, and .max() on an empty
        # array raises ValueError. Report the largest value a difference of
        # two CDFs can take instead.
        return 1.0
    return np.abs(yyy_1 - yyy_2).max()

# Examples

In [None]:
# Number of points in each small example sample.
N_PTS = 10
# Seeded generator, so a fresh "Restart & Run All" reproduces the same draws.
rng = np.random.default_rng(seed=0)

In [None]:
# Two standard-normal samples of N_PTS points each (rands_1 is drawn first).
rands_1 = rng.standard_normal(N_PTS)
rands_2 = rng.standard_normal(N_PTS)

# Scatter the samples against each other, paired by draw order.
fig, ax = plt.subplots()
ax.scatter(rands_1, rands_2)
ax.set(xlabel="rands_1", ylabel="rands_2", title="Two standard-normal samples")
ax.grid(True)

## Some plots

In [None]:
# Sorting both samples pairs the k-th smallest value of one with the k-th
# smallest value of the other (an empirical Q-Q plot).
fig, ax = plt.subplots()
ax.scatter(np.sort(rands_1), np.sort(rands_2))
ax.set(
    xlabel="sorted rands_1",
    ylabel="sorted rands_2",
    title="Sorted samples against each other",
)
ax.grid(True)

In [None]:
# Overlaid histograms of the two samples; the second is translucent so the
# first stays visible underneath.
fig, ax = plt.subplots()
ax.hist(rands_1, bins=5, label="rands_1")
ax.hist(rands_2, bins=5, alpha=0.3, label="rands_2")
ax.set(xlabel="value", ylabel="count", title="Histograms of the two samples")
ax.legend()
ax.grid(True)

## CDF

In [None]:
# Empirical CDF of each sample: sorted values and their cumulative probabilities.
vals_1, probs_1 = cdf(rands_1)
vals_2, probs_2 = cdf(rands_2)

# Both CDFs on the same axes for direct comparison.
fig, ax = plt.subplots()
ax.plot(vals_1, probs_1, label="rands_1")
ax.plot(vals_2, probs_2, label="rands_2")
ax.set(xlabel="value", ylabel="cumulative probability", title="Empirical CDFs")
ax.legend()
ax.grid(True)

## Interpolate

In [None]:
# Evaluate both CDFs on one shared grid restricted to their common range.
xxx, yyy_1, yyy_2 = interpolate(
    sorted_vals_1=vals_1,
    probs_1=probs_1,
    sorted_vals_2=vals_2,
    probs_2=probs_2,
)

In [None]:
# Interpolated CDF 1 (line) against the points it was interpolated from.
fig, ax = plt.subplots()
ax.plot(xxx, yyy_1, label="interpolated on shared grid")
ax.plot(vals_1, probs_1, 'o', label="original CDF points")
ax.set(
    xlabel="value",
    ylabel="cumulative probability",
    title="CDF 1: interpolation vs. original points",
)
ax.legend()
ax.grid(True)

In [None]:
# Interpolated CDF 2 (line) against the points it was interpolated from.
fig, ax = plt.subplots()
ax.plot(xxx, yyy_2, label="interpolated on shared grid")
ax.plot(vals_2, probs_2, 'o', label="original CDF points")
ax.set(
    xlabel="value",
    ylabel="cumulative probability",
    title="CDF 2: interpolation vs. original points",
)
ax.legend()
ax.grid(True)

In [None]:
# Both interpolated CDFs on the shared grid; their vertical gap is what
# ks_dist measures.
fig, ax = plt.subplots()
ax.plot(xxx, yyy_1, label="CDF 1")
ax.plot(xxx, yyy_2, label="CDF 2")
ax.set(
    xlabel="value",
    ylabel="cumulative probability",
    title="Interpolated CDFs on the shared grid",
)
ax.legend()
ax.grid(True)

## KS distance

In [None]:
# Distance between the two example CDFs; kept as the cell's last expression
# so the notebook displays the value.
ks_dist(
    sorted_vals_1=vals_1,
    probs_1=probs_1,
    sorted_vals_2=vals_2,
    probs_2=probs_2,
)

In [None]:
%%time
n_pts = np.logspace(1, 6, 20).astype(int)
dists = [ks_dist(*cdf(rng.standard_normal(n_pt)), *cdf(rng.standard_normal(n_pt))) for n_pt in n_pts]
plt.loglog(n_pts, dists)
plt.grid()