# Naive sampling function - sample users so that estimated user pp scores are approximately uniform
**Contributors:** Victor Lin

In [None]:
# IPython magics: load the autoreload extension and select mode 2 so that
# edited project modules are re-imported without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('../..')
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from exploration.config import mongo_inst
from mlpp.data_collection.sample import osuDumpSampler
from random import random

In [None]:
# Notebook-wide parameters: the sample collection to read, and the exclusive
# lower bound on score dates used by the queries further down.
SAMPLE_NAME = 'sample_scores_3k'
DATE_LIMIT = datetime(year=2019, month=1, day=1)

# Handle to the 'osu_random_db' database and a sampler bound to it.
osu_random_db = mongo_inst['osu_random_db']
sampler = osuDumpSampler(osu_random_db)

In [None]:
# Select the random sample stored under SAMPLE_NAME.
# NOTE(review): 3000 is presumably the sample size, and the call presumably
# returns the sampled user ids - confirm against osuDumpSampler.
user_ids = sampler.use_random_sample(SAMPLE_NAME, 3000)

In [None]:
# Fetch only the 'mlpp' sub-document of every score in the sample collection
# whose date is strictly after DATE_LIMIT.
scores = list(
    osu_random_db[SAMPLE_NAME].find(
        {'date': {'$gt': DATE_LIMIT}},
        {'mlpp': 1, '_id': 0},
    )
)

# Pull the two estimated-pp fields out into parallel lists.
pp_data_raw = [entry['mlpp']['est_user_raw_pp'] for entry in scores]
pp_data = [entry['mlpp']['est_user_pp'] for entry in scores]

In [None]:
# Two histograms side by side: raw estimate on the left, est_user_pp on the right.
fig, axs = plt.subplots(nrows=1, ncols=2)
fig.set_figwidth(15)
# Assign to `_` so the notebook does not echo the hist() return value.
_ = axs[0].hist(pp_data_raw, bins=200)
_ = axs[1].hist(pp_data, bins=200)

In [None]:
# Fit a reciprocal inverse Gaussian to the estimated pp values; 200 is passed
# positionally to fit() as the starting value for the shape parameter.
best_params = st.recipinvgauss.fit(pp_data, 200)

# The last two entries are loc and scale; everything before them is kept as
# the shape-argument tuple.
arg, (loc, scale) = best_params[:-2], best_params[-2:]

In [None]:
# Fitted density evaluated at integer pp values 1..6999.
est_pp_pdf = st.recipinvgauss.pdf(np.arange(1, 7000), *arg, loc=loc, scale=scale)

plt.figure(figsize=(12, 8))
# Normalised histogram of the data, with a flat 1/7000 reference line over 0..6999.
_ = plt.hist(pp_data, density=True, bins=200)
_ = plt.plot(np.arange(7000), np.full(7000, 1 / 7000))
# Optional overlay of the fitted pdf (left disabled):
# plt.plot(est_pp_pdf)

In [None]:
def pdf_proportion(t, max_pp=7000):
    """Return the total density that remains after capping the fitted pdf at ``t``.

    The fitted reciprocal inverse Gaussian pdf (module-level ``arg``, ``loc``
    and ``scale``) is evaluated at every integer pp value from 1 to ``max_pp``
    inclusive, each value is clipped to at most ``t``, and the clipped values
    are summed.

    Args:
        t: Density cap applied to every pdf value.
        max_pp: Largest integer pp value to evaluate (inclusive). Defaults to
            7000, the range that used to be hard-coded.

    Returns:
        The sum of ``min(pdf(i), t)`` for ``i`` in ``1..max_pp``.
    """
    # A single vectorised scipy call replaces one scalar call per pp value.
    pp_values = np.arange(1, max_pp + 1)
    density = st.recipinvgauss.pdf(pp_values, *arg, loc=loc, scale=scale)
    return np.minimum(density, t).sum()

In [None]:
def pdf_sample(t, max_pp=7000):
    """Build per-pp acceptance probabilities that cap the fitted density at ``t``.

    The fitted reciprocal inverse Gaussian pdf (module-level ``arg``, ``loc``
    and ``scale``) is evaluated at every integer pp value from 1 to ``max_pp``
    inclusive. Where the density ``p`` exceeds ``t`` the acceptance
    probability is ``t / p``; everywhere else it is 1.

    Args:
        t: Density cap.
        max_pp: Largest integer pp value to evaluate (inclusive). Defaults to
            7000, the range that used to be hard-coded.

    Returns:
        A list of ``max_pp`` floats; element ``k`` is the acceptance
        probability for pp value ``k + 1``.
    """
    # A single vectorised scipy call replaces one scalar call per pp value.
    pp_values = np.arange(1, max_pp + 1)
    density = st.recipinvgauss.pdf(pp_values, *arg, loc=loc, scale=scale)

    # Start from "always accept" and only divide where the density is above
    # the cap, so zero-density points never enter a division.
    accept = np.ones(density.shape)
    over_cap = density > t
    accept[over_cap] = t / density[over_cap]
    return accept.tolist()

In [None]:
# Acceptance probabilities for a density cap of 2e-6.
sample_func = pdf_sample(2e-6)
# Bare trailing expression so the notebook displays the capped total for the same cap.
pdf_proportion(2e-6)

In [None]:
# Load every user's id and rank_score, then keep each user below 7000 with the
# probability stored in sample_func at the integer part of their rank_score.
# NOTE(review): sample_func[k] was built from the pdf at pp = k + 1, so the
# lookup is shifted by one pp relative to int(rank_score) - confirm intended.
users = list(osu_random_db['osu_user_stats'].find({}, {'_id': 1, 'rank_score': 1}))
sampled_users = [
    candidate
    for candidate in users
    if candidate['rank_score'] < 7000
    and random() < sample_func[int(candidate['rank_score'])]
]

In [None]:
# Ids of the users that survived the acceptance step.
sampled_user_ids = [kept['_id'] for kept in sampled_users]

# All 'osu_scores_high' documents that belong to those users and are dated
# strictly after DATE_LIMIT.
sampled_scores = list(
    osu_random_db['osu_scores_high'].find(
        {'user_id': {'$in': sampled_user_ids}, 'date': {'$gt': DATE_LIMIT}}
    )
)

In [None]:
# Estimated user pp attached to each sampled score.
score_pp = [score['mlpp']['est_user_pp'] for score in sampled_scores]

In [None]:
# Persist the sampled pp values as a .npy file in the working directory.
np.save(file="scores_pdf.npy", arr=score_pp)

In [None]:
# Normalised histogram of the pp values after sampling.
_ = plt.hist(score_pp, density=True, bins=200)