In [None]:
import numpy as np
from exploration.config import sql_inst, mongo_inst

In [None]:
 # Database handle looked up by name on the project's `mongo_inst`.
 val_random_db = mongo_inst['val_random_db']
 # Pair of collections, in the order (scores, user stats). Later cells index it
 # positionally (val_dump[0] / val_dump[1]) and unpack it into generator.simulate.
 val_dump = tuple(
     val_random_db[coll_name]
     for coll_name in ('osu_scores_high', 'osu_user_stats')
 )

In [None]:
# Load the two saved sampling-function arrays; the names on the left are bound
# in the same order as the paths listed below (pdf first, greedy second).
pdf_func, greedy_func = (
    np.load(npy_path)
    for npy_path in (
        "exploration/skill_biased_sampling_function/pdf_sample_func.npy",
        "exploration/skill_biased_sampling_function/greedy_sample_func.npy",
    )
)

In [None]:
# Rows for the sample_func table: (index, value) pairs that line up with its
# (user_pp, probability) columns.
#
# The original line enumerated `_func`, a name that is never defined in this
# notebook, so it raised NameError on a fresh kernel. `greedy_func` is used
# because it is the array the simulation cell further down evaluates.
# NOTE(review): confirm greedy_func (rather than pdf_func) is the one to upload.
#
# .tolist() turns numpy scalars into built-in floats, which DB-API drivers can
# bind without numpy-specific converters. Assumes a 1-D array — TODO confirm.
values = list(enumerate(greedy_func.tolist()))

In [None]:
# Statements that drop and recreate the sample_func lookup table
# (user_pp -> probability), sent to the server as a single string.
create_table_sql = '''
            DROP TABLE IF EXISTS sample_func;
            CREATE TABLE sample_func  (user_pp INT PRIMARY KEY, probability FLOAT NOT NULL);
            ALTER TABLE sample_func AUTO_INCREMENT=100;
            '''

# Parameterised insert, executed once per (user_pp, probability) pair in `values`.
insert_row_sql = '''
            INSERT INTO sample_func VALUES
            (%s, %s)
            '''

with sql_inst('osu_random_2021_02') as conn:
    # Step 1: rebuild the table and commit before any rows are written.
    with conn.cursor() as cursor:
        cursor.execute(create_table_sql)
    conn.commit()

    # Step 2: load every row through a fresh cursor, then commit the inserts.
    with conn.cursor() as cursor:
        cursor.executemany(insert_row_sql, values)
    conn.commit()

In [None]:
from datetime import datetime
from mlpp.data_collection.sample_func import sampleFuncGenerator

# Server-side biased draw: a user with rank_score < 7000 is kept when RAND() is
# at most the probability stored in sample_func for FLOOR(rank_score).
biased_sample_sql = '''
            SELECT * FROM osu_user_stats
            WHERE rank_score < 7000 AND RAND() <= (
                SELECT probability FROM sample_func
                WHERE user_pp = FLOOR(rank_score)
                LIMIT 1
            )
            '''

with sql_inst('osu_random_2021_02') as conn:
    with conn.cursor() as cursor:
        cursor.execute(biased_sample_sql)

        # First column of each returned row — presumably the user id; verify
        # against the osu_user_stats column order.
        sampled_users = [row[0] for row in cursor]
        print(sampled_users)

        # Scores of the sampled users dated after 2019-01-01, fetching only the
        # estimated-user-pp field.
        score_filter = {
            'user_id': {
                '$in': sampled_users
            },
            'date': {
                '$gt': datetime(2019, 1, 1)
            }
        }
        est_pp_only = {'mlpp.est_user_pp': 1}
        sampled_scores = list(
            mongo_inst['val_random_db']['osu_scores_high'].find(score_filter, est_pp_only)
        )

        data = [score['mlpp']['est_user_pp'] for score in sampled_scores]

        print(sampleFuncGenerator.prop_displaced(data))
    

In [None]:
# Settings handed to the sampling-function generator.
# NOTE(review): their meaning is inferred from the keyword names only
# (date_limit / max_pp / n_bins) — check sampleFuncGenerator for exact semantics.
DATE_LIMIT = datetime(2019, 1, 1)
MAX_PP = 7000
NUM_BINS = 200

generator = sampleFuncGenerator(
    n_bins=NUM_BINS,
    max_pp=MAX_PP,
    date_limit=DATE_LIMIT,
)

In [None]:
def simulate_fit(fit, dump=None):
    """Simulate sampling with `fit` and return the est. user pp of each sampled score.

    Args:
        fit: Sampling function, forwarded as the last positional argument of
            `generator.simulate`.
        dump: Sequence unpacked into the leading positional arguments of
            `generator.simulate`. When omitted, the notebook-level `val_dump`
            is used, looked up at call time.

    Returns:
        list: `score['mlpp']['est_user_pp']` for every score in the first
        element of the pair returned by `generator.simulate`.
    """
    # The previous default was `osu_dump`, a name this notebook never defines.
    # Default values are evaluated when the `def` statement runs, so the cell
    # raised NameError on a fresh kernel. A None sentinel defers the lookup.
    if dump is None:
        dump = val_dump
    scores, _ = generator.simulate(*dump, fit)
    return [score['mlpp']['est_user_pp'] for score in scores]

In [None]:
# Simulated draw over the validation collections using the greedy sampling function.
sample = simulate_fit(fit=greedy_func, dump=val_dump)
# Height of the horizontal reference line on the histograms below: the count
# each bin would hold if `sample` were split evenly across 50 bins.
cap = len(sample) / 50

In [None]:
# Share of the validation score collection that ended up in `sample`.
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# estimated_document_count() is its metadata-based replacement for an
# unfiltered count and avoids scanning the whole collection.
# NOTE(review): assumes val_dump[0] is a PyMongo Collection — confirm.
len(sample) / val_dump[0].estimated_document_count()

In [None]:
# Ids of every user-stats document whose rank_score is above 7000; only the
# _id field is requested from the server.
users_7k_up = [
    user_doc['_id']
    for user_doc in val_dump[1].find({'rank_score': {'$gt': 7000}}, {'_id': 1})
]

In [None]:
# Aggregation pipeline for the unbiased baseline: a random draw with the same
# number of scores as the biased `sample`.
random_scores_pipeline = [
    # Keep scores dated after 2019-01-01 whose user is not in users_7k_up.
    {
        '$match': {
            'date': {'$gt': datetime(2019, 1, 1)},
            'user_id': {'$nin': users_7k_up},
        },
    },
    # Draw len(sample) of the remaining documents at random.
    {'$sample': {'size': len(sample)}},
    # Return only the nested mlpp.est_user_pp field.
    {'$project': {'mlpp': {'est_user_pp': 1}}},
]

In [None]:
# Run the baseline pipeline on the scores collection and keep each result's
# estimated user pp.
random_sample = [
    score_doc['mlpp']['est_user_pp']
    for score_doc in val_dump[0].aggregate(random_scores_pipeline)
]

In [None]:
# Evaluate prop_displaced on the simulated sample; as the cell's last
# expression, its value is displayed as the cell output.
# NOTE(review): presumably the source of the hard-coded "Error" percentage
# annotated on the sampling-function histogram below — confirm.
sampleFuncGenerator.prop_displaced(sample)

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of est. user pp: unbiased random draw (left) versus
# the sampling-function draw (right). Both get a horizontal line at `cap`.
fig, axs = plt.subplots(1, 2, figsize=(20, 8))

# One entry per panel: (axis, data, title, error label, label position in data coords).
# NOTE(review): the error percentages are hard-coded from an earlier run. The
# baseline comes from a random $sample stage, so they can go stale — consider
# deriving them from sampleFuncGenerator.prop_displaced once its return format
# is confirmed.
panels = [
    (axs[0], random_sample, 'Random 1% sample', 'Error: 40.6%', [5500, 5000]),
    (axs[1], sample, 'Sampling function 1% sample', 'Error: 12.4%', [5500, 1500]),
]

# The panels previously repeated the same five calls by hand; a single loop
# keeps bin count, cap line and axis labels identical on both by construction.
for ax, est_pp, title, error_label, label_xy in panels:
    ax.hist(est_pp, bins=50)  # 50 bins matches the divisor used to compute `cap`
    ax.plot([0, 7000], [cap, cap])
    ax.set_title(title)
    ax.annotate(error_label, label_xy, fontsize=20)
    # Ending the cell on the loop also stops the list of Text objects returned
    # by Axes.set from being echoed as cell output.
    ax.set(xlabel='Score est user PP', ylabel='Count')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=f93d0822-db5a-47ef-9a78-57b8adfbeb20' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>