In [None]:
import sys
sys.path.append('../..')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats as st
from random import random
from datetime import datetime
from tqdm import tqdm
from exploration.config import mongo_inst
from mlpp.data_collection import curve_utils
from mlpp.data_collection.sample import ScoresSubset
from mlpp.data_collection.sample_func import displacement_err, SampleConfig, SampleFunctionGenerator as SFG

In [None]:
# --- Sampling configuration --------------------------------------------------
# These three constants are bundled into SAMPLE_CONFIG, which is handed to the
# generators and to every `simulate` call further down.
NUM_BINS = 200
MAX_PP = 7000
DATE_LIMIT = datetime(2019, 1, 1)
SAMPLE_CONFIG = SampleConfig(
    date_limit=DATE_LIMIT,
    max_pp=MAX_PP,
    n_bins=NUM_BINS,
)

# --- Data sources ------------------------------------------------------------
# Each ScoresSubset is built from two collections looked up on `osu_random_db`
# (going by the collection names, a scores collection and a users collection).
osu_random_db = mongo_inst['osu_random_db']
osu_subset = ScoresSubset(
    osu_random_db['osu_scores_high'],
    osu_random_db['osu_user_stats'],
)
subset_3k = ScoresSubset(
    osu_random_db['scores_sample_3k'],
    osu_random_db['users_sample_3k'],
)

# Sample-function generator over the main subset.
generator = SFG(osu_subset, SAMPLE_CONFIG)

In [None]:
# Run the greedy search on `generator`; its three return values are unpacked below.
# NOTE(review): 0.05 presumably matches the "5_pcnt" in the file name the result
# is saved under — confirm against SampleFunctionGenerator.greedy.
(
    greedy_sample_func,
    predicted_uniformity_arr,
    bins_hist,
) = generator.greedy(0.05)

In [None]:
# Persist the raw greedy sample function (the un-fitted output of generator.greedy).
np.save(file="greedy_sample_func_5_pcnt.npy", arr=greedy_sample_func)

In [None]:
# NUM_BINS + 1 integer bin edges spanning [0, MAX_PP]; bins[:-1] therefore holds
# one value (the left edge) per bin.
bins = np.linspace(0, MAX_PP, num=NUM_BINS + 1, dtype=int)
# Grid on which the fitted curves are evaluated. Derived from MAX_PP instead of
# a literal 7000 so it cannot drift out of step with `bins` if the limit changes.
x1 = np.arange(0, MAX_PP)

# Rolling window sizes to compare.
windows = [3, 5, 10, 15]
# One entry per window size; each entry is later unpacked as an (x, y) pair
# and passed to the curve-fitting helpers.
rolling_coords = [curve_utils.roll(bins[:-1], greedy_sample_func, w) for w in windows]

In [None]:
# Fit each smoothed curve with the linear-exponential helper and evaluate the
# resulting callable on the x1 grid (one fitted curve per window size).
linear_fits = [
    curve_utils.linear_expon_fit(xs, ys)(x1)
    for xs, ys in rolling_coords
]

plt.figure(figsize=(10, 6))

# Un-fitted greedy sample function as a light reference curve.
plt.plot(bins[:-1], greedy_sample_func, color='lightgray', label='original')
# linear_fits is ordered like `windows`, so the two can be walked in lockstep.
for window, curve in zip(windows, linear_fits):
    plt.plot(curve, linestyle='dashed', label=f'w-size: {window}')

plt.ylim((0, .5))
plt.title("Linear Exponential fits")
plt.legend(loc='upper left')

In [None]:
# Initial parameter guess handed to the optimizer-based fit.
p0 = (1.68881617e-03, 1.00011615e+00, 6.36903398e+00) # from experimentation

# Fit each smoothed curve starting from p0 and evaluate the resulting callable
# on the x1 grid (one fitted curve per window size).
optimize_fits = [
    curve_utils.optimize_expon_fit(xs, ys, p0)(x1)
    for xs, ys in rolling_coords
]

plt.figure(figsize=(10, 6))
# Un-fitted greedy sample function as a light reference curve.
plt.plot(bins[:-1], greedy_sample_func, color='lightgray', label='original')
# optimize_fits is ordered like `windows`, so the two can be walked in lockstep.
for window, curve in zip(windows, optimize_fits):
    plt.plot(curve, label=f'w-size: {window}')

plt.ylim((0, .5))
plt.title("Optimize Exponential fits")
plt.legend(loc='upper left')

In [None]:
# Side-by-side comparison of the two fitting methods, one panel per window size.
# The first window (index 0) is not compared here.
# NOTE(review): the reason for skipping it is not visible in this notebook.
compare_windows = windows[1:]
n_panels = len(compare_windows)

# Panel count and figure width follow the number of compared windows
# (8 inches per panel, i.e. 24 for the three panels used today).
fig, axs = plt.subplots(1, n_panels, figsize=(8 * n_panels, 6), squeeze=False)
axs = axs[0]  # squeeze=False always yields a 2-D array; keep its single row

for ax, window, linear, optimized in zip(axs, compare_windows, linear_fits[1:], optimize_fits[1:]):
    # Window sizes are plain counts, labelled the same way as in the fit plots
    # ("w-size: N"), so no unit suffix is appended.
    ax.set_title(f"w-size {window}: Linear vs Optimize")
    ax.plot(linear, label='linear')
    ax.plot(optimized, label='optimize')
    ax.legend(loc='upper left')

In [None]:
# Results table seed: one label per fit — all 'optimize' fits first, then all
# 'linear' fits, each in `windows` order. The trial loop simulates the fits in
# this same order (optimize_fits + linear_fits), so rows line up.
greedy_trials = {
    'fit': [
        f'{method} w-{w}'
        for method in ('optimize', 'linear')
        for w in windows
    ],
}

In [None]:
# Four trials. Each trial simulates one sample per fit (optimize fits first,
# then linear fits — the same order as the 'fit' labels) and stores one
# displacement error per fit under 'trial1' .. 'trial4'.
all_fits = optimize_fits + linear_fits
for trial_no in range(1, 5):
    samples_pp = [osu_subset.simulate(fit, SAMPLE_CONFIG) for fit in tqdm(all_fits)]
    greedy_trials[f'trial{trial_no}'] = [displacement_err(sample) for sample in samples_pp]

In [None]:
# One row per fit, one column per trial, plus the row-wise mean over the trials.
df = pd.DataFrame(greedy_trials)
col = df.loc[:, 'trial1': 'trial4']
df['mean'] = col.mean(axis=1)

# Highlight the smallest error in each numeric column only. Without `subset`,
# the string-valued 'fit' column is included as well and its lexicographically
# smallest label gets highlighted, which carries no meaning.
error_columns = [name for name in df.columns if name != 'fit']
df.style.highlight_min(subset=error_columns, color='blue', axis=0)

In [None]:
# Index 2 of linear_fits is the linear fit for window size 10 (windows[2]).
# NOTE(review): presumably picked from the trial table above — confirm it is
# still the best row after re-running the trials.
greedy_best_fit = linear_fits[2]

# Second generator, built over the 3k subset with the same sampling config.
generator_3k = SFG(subset_3k, SAMPLE_CONFIG)
# NOTE(review): the meaning of the 0.01 / 0.77 ratio is not visible here —
# confirm against SampleFunctionGenerator.pdf.
pdf_fit = generator_3k.pdf(st.recipinvgauss, 0.01 / 0.77)

In [None]:
# Overlay the two candidate sample functions for a visual comparison.
for curve, name in ((greedy_best_fit, "greedy"), (pdf_fit, "pdf")):
    plt.plot(curve, label=name)
plt.legend(loc="upper left")

In [None]:
def test_fit(fit, subset = osu_subset, cnt = 10):
    """Simulate `fit` on `subset` `cnt` times and print the averaged results.

    Every round draws one sample via ``subset.simulate(fit, SAMPLE_CONFIG)``
    and accumulates both its ``displacement_err`` and its length.

    Args:
        fit: sample function forwarded to ``subset.simulate``.
        subset: object exposing ``simulate``; defaults to ``osu_subset`` as it
            was bound when this function was defined.
        cnt: number of simulation rounds to average over.

    Returns:
        None. The average error and the average sample size are printed.
    """
    total_err = 0
    total_size = 0
    for _ in tqdm(range(cnt)):
        sample_pp = subset.simulate(fit, SAMPLE_CONFIG)
        total_err += displacement_err(sample_pp)
        total_size += len(sample_pp)

    mean_err = total_err / cnt
    mean_size = total_size / cnt
    print(f'\navg error: {mean_err}')
    print(f'\navg sample size: {mean_size}')

In [None]:
# Greedy fit on the default subset with the default number of rounds.
test_fit(fit=greedy_best_fit)

In [None]:
# PDF-based fit on the default subset with the default number of rounds.
test_fit(fit=pdf_fit)

In [None]:
# Density histograms of one simulated sample per sample function, drawn from
# the main subset.
_ = plt.hist(osu_subset.simulate(greedy_best_fit, SAMPLE_CONFIG), bins=50, density=True, alpha=0.7, label='greedy')
_ = plt.hist(osu_subset.simulate(pdf_fit, SAMPLE_CONFIG), bins=50, density=True, alpha=0.7, label='pdf')
# Reference line at the constant density 1 / MAX_PP, i.e. a uniform
# distribution over [0, MAX_PP). Uses MAX_PP rather than a literal 7000 so it
# follows the configured limit, and is labelled so it appears in the legend.
_ = plt.plot(np.arange(0, MAX_PP), np.full(MAX_PP, 1 / MAX_PP), label='uniform')
plt.legend(loc='upper right')

In [None]:
 val_random_db = mongo_inst['val_random_db']
 val_subset = ScoresSubset(val_random_db['osu_scores_high'], val_random_db['osu_user_stats'])

In [None]:
# Greedy fit on the validation subset, averaged over 50 rounds.
test_fit(greedy_best_fit, subset=val_subset, cnt=50)

In [None]:
# PDF-based fit on the validation subset, averaged over 50 rounds.
test_fit(pdf_fit, subset=val_subset, cnt=50)

In [None]:
# Density histograms of one simulated sample per sample function, drawn from
# the validation subset.
# `simulate` is given SAMPLE_CONFIG as its second argument, as at every other
# call site in this notebook; both generators were themselves built from that
# same config, so it is the configuration in effect for either fit.
# NOTE(review): simulate's signature is not visible here — confirm it expects a
# SampleConfig and not a SampleFunctionGenerator.
_ = plt.hist(val_subset.simulate(greedy_best_fit, SAMPLE_CONFIG), bins=50, density=True, alpha=0.7, label='greedy')
_ = plt.hist(val_subset.simulate(pdf_fit, SAMPLE_CONFIG), bins=50, density=True, alpha=0.7, label='pdf')
# Reference line at the constant density 1 / MAX_PP, i.e. a uniform
# distribution over [0, MAX_PP). Uses MAX_PP rather than a literal 7000 so it
# follows the configured limit, and is labelled so it appears in the legend.
_ = plt.plot(np.arange(0, MAX_PP), np.full(MAX_PP, 1 / MAX_PP), label='uniform')
plt.legend(loc='upper right')

In [None]:
# Persist the two final sample functions, one .npy file each.
for out_path, sample_func in (
    ("greedy_sample_func.npy", greedy_best_fit),
    ("pdf_sample_func.npy", pdf_fit),
):
    np.save(out_path, sample_func)