# Naive sampling function: sample scores uniformly over estimated user pp
**Contributors:** Victor Lin

In [None]:
import sys
sys.path.append('../..')
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from exploration.config import mongo_inst
from mlpp.data_collection.sample import use_random_sample, get_custom_user_ids
from mlpp.data_collection.sample_func import sampleFuncGenerator
from mlpp.data_collection.distributions import best_fit_distribution

import logging
# Grab the root logger (getLogger() with no name) and lower its threshold to
# DEBUG so debug-level records are not filtered out at the root.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [None]:
# All collections used in this notebook live in the 'osu_random_db' database.
osu_random_db = mongo_inst['osu_random_db']

# Source collections, exposed both individually and as a (scores, user stats) pair.
osu_scores_high = osu_random_db['osu_scores_high']
osu_user_stats = osu_random_db['osu_user_stats']
osu_dump = (osu_scores_high, osu_user_stats)

# Sample collections, exposed both individually and as a (scores, users) pair.
pdf_scores_sample = osu_random_db['scores_sample_3k']
pdf_users_sample = osu_random_db['users_sample_3k']
pdf_dump = (pdf_scores_sample, pdf_users_sample)

# Cut-off date: the queries below only keep scores dated after this.
DATE_LIMIT = datetime(2019, 1, 1)

# Shared sample-function generator configured with the same cut-off date.
generator = sampleFuncGenerator(date_limit=DATE_LIMIT)

In [None]:
# Call use_random_sample with the source pair (scores, user stats), the sample
# pair (scores, users) and a size argument of 3000.
# NOTE(review): presumably 3000 is the number of users to draw and the sample
# collections are populated as a side effect — confirm against
# mlpp.data_collection.sample.
user_ids = use_random_sample(*osu_dump, *pdf_dump, 3000)

In [None]:
# Fetch only the 'mlpp' sub-document of every sampled score dated after DATE_LIMIT.
scores = list(
    pdf_scores_sample.find(
        {'date': {'$gt': DATE_LIMIT}},
        {'mlpp': 1, '_id': 0},
    )
)

# Two parallel series taken from each score's 'mlpp' sub-document.
pp_data_raw = [doc['mlpp']['est_user_raw_pp'] for doc in scores]
pp_data = [doc['mlpp']['est_user_pp'] for doc in scores]

In [None]:
# Side-by-side histograms: raw estimate on the left, final estimate on the right.
fig, axs = plt.subplots(1, 2)
fig.set_figwidth(15)
for ax, series in zip(axs, (pp_data_raw, pp_data)):
    ax.hist(series, bins=200)

In [None]:
# The automatic search over candidate distributions is left disabled here;
# the reciprocal inverse Gaussian is used directly instead.
# best_dist, best_params = best_fit_distribution(pp_data)

best_dist = st.recipinvgauss
# fit() returns the estimated parameters as a tuple: shape parameter(s)
# followed by loc and scale.
best_params = best_dist.fit(pp_data)

In [None]:
# Split the fitted parameters: everything but the last two entries are shape
# parameters, the last two are loc and scale.
arg = best_params[:-2]
loc, scale = best_params[-2], best_params[-1]


def pdf(i):
    """Evaluate the fitted distribution's density at ``i``."""
    return best_dist.pdf(i, *arg, loc=loc, scale=scale)

In [None]:
plt.figure(figsize=(10, 6))

# Evaluate the fitted density on the integer pp grid 1..6999.
pp_grid = np.arange(1, 7000)
est_pp_pdf = best_dist.pdf(pp_grid, *arg, loc=loc, scale=scale)

_ = plt.hist(pp_data, bins=200, density=True)
# Pass the grid as x explicitly: plt.plot(est_pp_pdf) alone would use the
# indices 0..N-1 as x and draw the curve one pp to the left of the points it
# was evaluated at.
_ = plt.plot(pp_grid, est_pp_pdf)

In [None]:
# Target sampling proportions, one sample function is built per entry.
SAMPLE_PROPORTIONS = np.asarray([.01, .02, .05, .1])
# Integer percent labels for plots/prints. Round before int(): float error can
# leave prop * 100 just below the intended integer (0.29 * 100 ==
# 28.999999999999996), which plain int() would truncate to the wrong label.
pcnts = [int(round(prop * 100)) for prop in SAMPLE_PROPORTIONS]

sample_funcs = [generator.pdf(pdf_scores_sample, st.recipinvgauss, prop) for prop in SAMPLE_PROPORTIONS]

In [None]:
# One curve per sample function, labelled with its nominal percentage.
for pcnt, sample_func in zip(pcnts, sample_funcs):
    plt.plot(sample_func, label=f'{pcnt}%')

_ = plt.legend(loc='upper left')

In [None]:
def test_pcnt_sampled(sample_func):
    """Return the fraction of scores covered by users picked via ``sample_func``.

    Users are selected with ``get_custom_user_ids(osu_user_stats, sample_func)``.
    The result is the number of their scores in ``osu_scores_high`` dated after
    ``DATE_LIMIT`` divided by the total number of documents in
    ``osu_scores_high``.

    NOTE(review): the numerator is date-filtered while the denominator is
    not — confirm this is the intended definition of the sampled proportion.
    """
    sampled_users = get_custom_user_ids(osu_user_stats, sample_func)

    sampled_filter = {
        'user_id': {'$in': sampled_users},
        'date': {'$gt': DATE_LIMIT},
    }

    # Cursor.count() / Collection.count() are deprecated since PyMongo 3.7 and
    # removed in 4.0. count_documents() replaces the filtered count;
    # estimated_document_count() replaces the unfiltered collection count.
    n_sampled = osu_scores_high.count_documents(sampled_filter)
    n_total = osu_scores_high.estimated_document_count()

    return n_sampled / n_total

In [None]:
from tqdm import tqdm


def _mean_ratio_to_expected(sample_func, expected, n_trials=10):
    """Average over ``n_trials`` runs of (sampled fraction / ``expected``)."""
    ratios = (test_pcnt_sampled(sample_func) / expected for _ in tqdm(range(n_trials)))
    return sum(ratios) / n_trials


# A value of 1.00 means the sample function hits its nominal proportion on average.
pcnt_1_avg_of_expected = _mean_ratio_to_expected(sample_funcs[0], .01)
pcnt_2_avg_of_expected = _mean_ratio_to_expected(sample_funcs[1], .02)

print(f'\n\nProportion of expected 1%: {pcnt_1_avg_of_expected:.2f}')
print(f'Proportion of expected 2%: {pcnt_2_avg_of_expected:.2f}')

In [None]:
# Correction factor: inverse of the measured/expected ratio observed for the
# 1% sample function.
PROP_BONUS_FACTOR = 1 / pcnt_1_avg_of_expected
# NOTE: this scales the array in place, so re-running this cell without
# re-creating SAMPLE_PROPORTIONS applies the factor again.
SAMPLE_PROPORTIONS *= PROP_BONUS_FACTOR

# Rebuild the sample functions with the shared, date-limited generator so they
# are produced with the same configuration as the first set and as the
# generator.test_sample_func calls that evaluate them.
sample_funcs = [generator.pdf(pdf_scores_sample, st.recipinvgauss, prop) for prop in SAMPLE_PROPORTIONS]

In [None]:
samples = []

# Total collection size, fetched once instead of on every iteration.
# Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
# estimated_document_count() is its replacement for an unfiltered count.
# NOTE(review): assumes generator.test_sample_func does not modify
# osu_scores_high — confirm.
total_scores = osu_scores_high.estimated_document_count()

for pcnt, sample_func in zip(pcnts, sample_funcs):
    samples.append(generator.test_sample_func(*osu_dump, sample_func))

    # Each sample is indexed as (scores, users); report the achieved share.
    scores = samples[-1][0]
    pcnt_scores = 100 * len(scores) / total_scores
    print(f"{pcnt}% Sampling: {pcnt_scores:.2f}% sampled")

# Per-sample list of estimated user pp values, one inner list per sample function.
score_pp = [[s['mlpp']['est_user_pp'] for s in sc] for sc, _users in samples]

In [None]:
# One histogram per sample; size the grid from the data rather than a
# hard-coded 4 (4 samples still gives the original 6 x 18 figure).
# squeeze=False keeps axs two-dimensional even for a single subplot.
n_plots = len(score_pp)
fig, axs = plt.subplots(n_plots, figsize=(6, 4.5 * n_plots), squeeze=False)

for ax, pcnt, pp_values in zip(axs[:, 0], pcnts, score_pp):
    ax.hist(pp_values, bins=50, label=f'{pcnt}%', density=True)
    # Reference line: the uniform density over [0, 7000].
    ax.plot([0, 7000], [1/7000, 1/7000])
    # The histogram's x axis carries the pp values and the y axis the
    # normalised frequency; the original labels were swapped.
    ax.set(xlabel="Score est pp", ylabel="Proportion")
    ax.set_title(f'{pcnt}% Sample')

_ = plt.tight_layout()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f93d0822-db5a-47ef-9a78-57b8adfbeb20' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>