In [None]:
import sys
sys.path.append('../..')
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from scipy import stats
from scipy.interpolate import UnivariateSpline
from scipy.optimize import curve_fit
from tqdm import tqdm
from random import random
import math
from datetime import datetime
from exploration.config import mongo_inst
from mlpp.data_collection.sample import osuDumpSampler
from mlpp.third_party.curve_fit import best_fit_distribution

In [None]:
# Histogram resolution and pp ceiling shared by the sampling and plotting cells.
NUM_BINS = 200
MAX_PP = 7000
# Lower date bound applied with '$gt' when scores are queried further down.
DATE_LIMIT = datetime(year=2019, month=1, day=1)

# Database handle used for every user/score query in this notebook.
osu_random_db = mongo_inst["osu_random_db"]

In [None]:
def bin_cnt_pipeline(score_filter, bin_width):
    """Build an aggregation pipeline that counts documents per pp bin.

    Args:
        score_filter: Iterable of pipeline stages placed first, unchanged.
        bin_width: Divisor applied to '$mlpp.est_user_pp' to get the bin index.

    Returns:
        A new list: the stages of score_filter followed by a stage that stores
        floor(est_user_pp / bin_width) in 'range_i', a stage that counts
        documents per 'range_i', and a stage that sorts by ascending bin index.
    """
    # Tag every document with the index of the bin its est_user_pp falls in.
    assign_bin = {
        '$set': {
            'range_i': {'$floor': {'$divide': ['$mlpp.est_user_pp', bin_width]}}
        }
    }
    # One output document per bin index, carrying the number of members.
    count_per_bin = {'$group': {'_id': '$range_i', 'count': {'$sum': 1}}}
    order_by_bin = {'$sort': {'_id': 1}}

    return list(score_filter) + [assign_bin, count_per_bin, order_by_bin]

In [None]:

def compute_sampling_func(prop = .05, date_limit = datetime(2019,1,1), max_pp = 7000, n_bins = 200):
    """Greedily derive a per-pp-bin user sampling proportion.

    The pp axis [0, max_pp) is cut into n_bins equal-width bins, which are
    visited from the highest bin down to the lowest. When the visited bin is
    still below its score quota, the users whose 'mlpp.est_current_pp' lies in
    that bin are looked up, their scores dated after date_limit are counted
    per 'mlpp.est_user_pp' bin, and a proportion of those users is chosen.
    The scaled counts are then added to the running per-bin totals.

    Args:
        prop: Fraction of the documents in 'osu_scores_high' targeted overall;
            the per-bin quota is ceil(n_scores * prop / n_bins).
        date_limit: Only scores with 'date' strictly greater than this count.
        max_pp: Upper end of the binned pp axis.
        n_bins: Number of equal-width bins.

    Returns:
        A tuple (sample_func, uniformity, bins_hist) where
        sample_func is an array of length n_bins holding the proportion chosen
        for each bin (0 for bins already at or above quota when visited),
        uniformity is the final per-bin running total divided by the quota, and
        bins_hist is a list with one copy of the running totals per visited
        bin, in visiting order (highest bin first).
    """
    scores = osu_random_db['osu_scores_high']
    user_stats = osu_random_db['osu_user_stats']

    # Collection.count() no longer exists in PyMongo 4; for an unfiltered
    # count the metadata-based estimated_document_count() is the replacement.
    n_scores = scores.estimated_document_count()

    bins_score_cnt, sample_func = np.zeros(n_bins), np.zeros(n_bins)
    bin_width = max_pp / n_bins
    bin_cap = math.ceil(n_scores * prop / n_bins)
    bins_hist = []

    for i in tqdm(range(n_bins - 1, -1, -1)):
        # Scores still missing in this bin before it reaches the quota.
        diff = bin_cap - bins_score_cnt[i]
        pp_floor, pp_ceil = i * bin_width, (i + 1) * bin_width

        if diff > 0:
            user_ids_in_range = [
                u['_id'] for u in user_stats.find(
                    {
                        'mlpp.est_current_pp': {
                            '$gte': pp_floor,
                            '$lt': pp_ceil
                        }
                    },
                    {
                        '_id': 1
                    }
                )
            ]

            score_filter = [
                {
                    '$match': {
                        'user_id': {
                            '$in': user_ids_in_range
                        },
                        'date': {
                            '$gt': date_limit
                        }
                    }
                }
            ]

            u_bins_score_cnt = np.zeros(n_bins)
            for bin_cnt in scores.aggregate(bin_cnt_pipeline(score_filter, bin_width)):
                bin_idx = bin_cnt['_id']
                # A missing est_user_pp yields a null bin id, and values outside
                # [0, max_pp) yield ids that would raise or silently wrap around
                # as negative indexes; such scores lie outside the histogram.
                if bin_idx is None or not 0 <= bin_idx < n_bins:
                    continue
                u_bins_score_cnt[int(bin_idx)] = bin_cnt['count']

            curr_range_scores = u_bins_score_cnt[i]
            should_resize = curr_range_scores > diff

            # When these users alone would overshoot the quota, scale them
            # down. The extra .5 is a hard-coded damping factor.
            # NOTE(review): its rationale is not visible in this notebook.
            u_prop = (diff / curr_range_scores) * .5 if should_resize else 1

            bins_score_cnt += u_bins_score_cnt * u_prop

            sample_func[i] = u_prop

        # Snapshot after every visited bin, whether or not it was sampled.
        bins_hist.append(bins_score_cnt.copy())

    return sample_func, bins_score_cnt / bin_cap, bins_hist

In [None]:
# Run the greedy pass with prop=.01; the other arguments keep their defaults.
greedy_sample_func, predicted_uniformity_arr, bins_hist = compute_sampling_func(prop=.01)

In [None]:
# Window-size constant read as a global by the fitting helpers below.
WINDOW_SIZE = 5
# NUM_BINS + 1 integer bin edges spanning 0..MAX_PP.
bins = np.linspace(0, MAX_PP, num=NUM_BINS + 1, dtype=int)
# Every integer pp value below MAX_PP; x-axis on which fitted curves are evaluated.
x1 = np.arange(MAX_PP)

def roll(x, y, window_size):
    """Smooth y with a moving average and return the matching x coordinates.

    Args:
        x: Sequence of x coordinates, same length as y.
        y: Sequence of values to smooth.
        window_size: Number of consecutive samples averaged; must be at least
            2 and no larger than len(y).

    Returns:
        A tuple (x1, y1) where y1 is the moving average of y over full
        windows only (length len(y) - window_size + 1) and x1 is the slice of
        x starting window_size // 2 samples in, with the same length as y1.
        Returns None, after printing "Invalid params", when the arguments are
        invalid.
    """
    # A window longer than the data would make np.convolve swap its operands
    # and return something that is not a moving average of y.
    if window_size < 2 or len(x) != len(y) or window_size > len(y):
        print("Invalid params")
        return None

    def moving_average(values, w):
        return np.convolve(values, np.ones(w), 'valid') / w

    y1 = moving_average(y, window_size)
    # Shift by half of the window actually used (not the global WINDOW_SIZE)
    # so every average is paired with the x near the middle of its window.
    shift = window_size // 2
    x1 = x[shift : shift + len(y1)]

    return x1, y1

In [None]:
# Smoothing-spline fit of the rolled greedy sampling curve.
# NOTE(review): x and y were not defined in any visible cell, so this line
# raised NameError on a fresh kernel. They are assumed here to be the sampling
# curve smoothed with WINDOW_SIZE -- confirm this matches the original intent.
x, y = roll(bins[:-1], greedy_sample_func, WINDOW_SIZE)
spline_fit = UnivariateSpline(x, y, s=.2)

In [None]:
# Initial guess (a, b, c) handed to curve_fit for the model a * b**(c * x)
# fitted by get_optimize_expon_fit below.
p0 = (1.68881617e-03, 1.00011615e+00, 6.36903398e+00)

def get_linear_expon_fit(x, y):
    """Fit an exponential through (x, y) by a weighted line fit on log(y).

    A degree-1 polynomial is fitted to (x, log(y)) with weights sqrt(y).

    Returns:
        A function of x evaluating exp(intercept + slope * (x + WINDOW_SIZE));
        the global WINDOW_SIZE is read each time that function is called.
    """
    slope, intercept = np.polyfit(x, np.log(y), 1, w = np.sqrt(y))

    def fitted(x):
        return np.exp(intercept + slope * (x + WINDOW_SIZE))

    return fitted

def get_optimize_expon_fit(x, y, p0):
    """Fit a * b**(c * x) to (x, y) with scipy's curve_fit.

    Args:
        x: Sample positions.
        y: Sample values.
        p0: Initial guess (a, b, c) passed to curve_fit.

    Returns:
        A function of x evaluating the model with the fitted parameters.
    """
    def model(t, a, b, c):
        return a * np.power(b, (c * t))

    # Only the optimal parameters are needed; the covariance is discarded.
    fitted_params = curve_fit(model, x, y, p0 = p0)[0]

    def fitted(x):
        return model(x, *fitted_params)

    return fitted

# Smooth the greedy sampling curve with several window sizes, fit the
# exponential model to each smoothed curve, and overlay the fitted curves.
rolling = [3, 5, 10, 15]
rolling_coords = [roll(bins[:-1], greedy_sample_func, r) for r in rolling]
optimize_fits = [
    get_optimize_expon_fit(rolled_x, rolled_y, p0)
    for rolled_x, rolled_y in rolling_coords
]

for window, fit in zip(rolling, optimize_fits):
    plt.plot(fit(x1), linestyle = 'dashed', label = f'w-size: {window}')

plt.legend(loc='upper left')

In [None]:
# Draw a random subset of users, keeping each with probability ys[rank_score],
# then collect the est_user_pp of their scores dated after DATE_LIMIT.
# NOTE(review): `ys` is not defined in any visible cell; it is indexed by
# int(rank_score), so it presumably holds one sampling probability per integer
# pp value below MAX_PP -- confirm where it comes from.
users = list(osu_random_db['osu_user_stats'].find({}, {'_id': 1, 'rank_score': 1}))

sampled_users = [
    user for user in users
    if user['rank_score'] < MAX_PP and random() < ys[int(user['rank_score'])]
]
sampled_user_ids = [u['_id'] for u in sampled_users]

score_query = {
    'user_id': {'$in': sampled_user_ids},
    'date': {'$gt': DATE_LIMIT},
}
sampled_scores = list(osu_random_db['osu_scores_high'].find(score_query))

score_pp_greedy = [s['mlpp']['est_user_pp'] for s in sampled_scores]

In [None]:
# Load a previously saved array from the working directory. It is plotted
# against the greedy sample below under the label 'pdf'.
# NOTE(review): presumably pp values of scores drawn by a PDF-based sampler in
# another notebook -- confirm provenance.
score_pp_pdf = np.load("scores_pdf.npy")

In [None]:
def err_displaced_uniform(data, n_bins = 50):
    """Fraction of samples that must move bins to make the histogram uniform.

    The data is histogrammed into n_bins equal-width bins over its own range
    and compared with the uniform histogram holding the same number of
    samples. Half of the summed absolute difference is the number of samples
    that would have to be moved; it is returned as a fraction of the sample
    count.

    Args:
        data: Array-like of numeric samples; flattened before binning.
        n_bins: Number of histogram bins (default 50, the value that used to
            be hard-coded).

    Returns:
        A float in [0, 1); 0 means perfectly uniform. NaN for empty input.
    """
    values = np.asarray(data).ravel()
    total = values.size
    if total == 0:
        # The ratio is undefined without samples; return NaN explicitly
        # instead of triggering a 0/0 runtime warning.
        return float('nan')

    uni = np.full(n_bins, total / n_bins)
    binned, _ = np.histogram(values, n_bins)
    # Each misplaced sample is counted once where it is in excess and once
    # where it is missing, hence the division by two.
    displacement = np.sum(np.abs(binned - uni)) / 2
    return displacement / total

In [None]:
# Overlay the pp densities of both samples and the flat density 1 / MAX_PP.
hist_style = dict(bins = 50, density = True, alpha=0.8)
_ = plt.hist(score_pp_greedy, label='greedy', **hist_style)
_ = plt.hist(score_pp_pdf, label='pdf', **hist_style)
_ = plt.plot(np.arange(0, MAX_PP), np.full(MAX_PP, 1 / MAX_PP))
plt.legend(loc='upper right')

In [None]:
# Persist the per-bin sampling proportions returned by compute_sampling_func.
# `sample_func` only exists as a local inside that function; the notebook-level
# name is `greedy_sample_func`.
np.save("greedy_sample_func.npy", greedy_sample_func)

In [None]:
%matplotlib notebook

def animate(i):
    """Draw frame i of the greedy-sampling animation on the current axes.

    Reads the notebook globals bins (bin edges), bins_hist (one array of
    per-bin counts per frame), NUM_BINS and MAX_PP.

    Args:
        i: Frame index, used to pick bins_hist[i] and to compute the pp range
            shown in the annotation.
    """
    plt.cla()
    plt.title("Greedy Sampling on decreasing PP ranges")
    plt.xlabel("Score est pp")
    plt.ylabel("Count included")
    # bins holds NUM_BINS + 1 edges while each snapshot holds NUM_BINS counts,
    # so passing bins itself as the samples made the weights length mismatch.
    # Use one sample per bin (its left edge) weighted by that bin's count.
    plt.hist(bins[:-1], bins, weights = bins_hist[i])
    plt.ylim([0,4000])
    # Bins are processed from the top of the pp axis downwards, one per frame.
    high = int(MAX_PP * (1 - i / NUM_BINS))
    low = int(high - MAX_PP / NUM_BINS)
    plt.gca().annotate(f'range={low}-{high}', [5000, 3800])

# Figure that animate() draws into through the pyplot current-axes state.
fig = plt.figure()

# NUM_BINS frames, played once; keep a reference so the animation stays alive.
ani = animation.FuncAnimation(
    fig,
    animate,
    frames=NUM_BINS,
    save_count=NUM_BINS,
    repeat=False,
)
# ani.save("greedy_sampling_progression.gif", fps = 30)