In [None]:
import sys
sys.path.append('../..')
import pandas as pd
from pymongo import UpdateOne
from pymongo import MongoClient
from tqdm import tqdm
import numpy as np
from exploration.config import mongo_inst
from mlpp.data_collection.sample import osuDumpSampler
import datetime
from datetime import datetime
import pprint
import matplotlib.pyplot as plt
import pickle 
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Retrieval

### Find users that have more than 50 data points and create a new list of their user ids

In [None]:
# Direct client on the default local port; the queries below go through `top_db`,
# which is taken from the project's `mongo_inst` handle.
client = MongoClient(port=27017)
top_db = mongo_inst["osu_top_db"]
# Gather the `_id` of every document in `osu_user_stats`.
user_ids = [stats_doc['_id'] for stats_doc in top_db['osu_user_stats'].find({}, {})]

In [None]:
# Keep only the users that have enough `osu_track_updates` documents to form a
# usable time series.
MIN_TRACK_UPDATES = 50  # arbitrary threshold: a user needs MORE than this many updates

big_user_ids = [
    user_id
    for user_id in user_ids
    # count_documents lets the server do the counting instead of transferring
    # every matching document just to take len() of the resulting list.
    if top_db.osu_track_updates.count_documents({"user_id": user_id}) > MIN_TRACK_UPDATES
]


### Find the date & real_pp and then sort by date


In [None]:
# Per user: the (timestamp, pp_raw) documents from `osu_track_updates`,
# ordered chronologically by "timestamp".
real_pp = {
    user_id: sorted(
        top_db.osu_track_updates.find({"user_id": user_id}, {"_id":0, "timestamp": 1, "pp_raw":1}),
        key=lambda doc: doc["timestamp"],
    )
    for user_id in big_user_ids
}

### Find the date and est_user_pp and then sort by date

In [None]:
# Per user: the (date, mlpp.est_user_pp) documents from `osu_scores_high`,
# ordered chronologically by "date".
est_pp = {
    user_id: sorted(
        top_db.osu_scores_high.find({"user_id": user_id}, {"_id":0, "date": 1, "mlpp.est_user_pp":1}),
        key=lambda doc: doc["date"],
    )
    for user_id in big_user_ids
}

### Find the overlap time periods

In [None]:
# For every user, compute the time window covered by BOTH series:
#   start = the later of the two first observations
#   end   = the earlier of the two last observations
time_comparison = {}
for user_id in big_user_ids:
    est_rows = est_pp[user_id]
    real_rows = real_pp[user_id]

    if not est_rows or not real_rows:
        # One of the series is empty, so indexing [0] / [-1] would raise
        # IndexError. Record an inverted window instead: it satisfies the
        # `start >= end` no-overlap check that follows in this notebook, so the
        # user is dropped there together with the other non-overlapping users.
        time_comparison[user_id] = {"start": datetime.max, "end": datetime.min}
        continue

    # Both lists are already sorted chronologically, so the first and last
    # entries are the earliest and latest observations.
    start = max(est_rows[0]["date"], real_rows[0]["timestamp"])
    end = min(est_rows[-1]["date"], real_rows[-1]["timestamp"])
    time_comparison[user_id] = {"start": start, "end": end}

### Clean the data for users who do not have time overlap between real and est

In [None]:
# A user is "bad" when the shared window is empty or inverted, i.e. its start
# is not strictly before its end.
bad_users = [
    user
    for user, window in time_comparison.items()
    if window["start"] >= window["end"]
]

# Drop those users from the working id list ...
big_user_ids = [user_id for user_id in big_user_ids if user_id not in bad_users]

# ... and from every per-user lookup built so far.
for user in bad_users:
    for per_user_store in (real_pp, est_pp, time_comparison):
        del per_user_store[user]

### Create tables to store the data

In [None]:
def CREATE_REAL_USER_TABLE(one_user_id):
    """Build a two-column table of one user's recorded pp history.

    Reads the user's entries from the module-level ``real_pp`` mapping.

    Args:
        one_user_id: key into ``real_pp``.

    Returns:
        A 2-D numpy array with one row per entry: column 0 holds the entry's
        "timestamp" value, column 1 its "pp_raw" value.
    """
    records = real_pp[one_user_id]
    timestamps = [record["timestamp"] for record in records]
    pp_values = [record["pp_raw"] for record in records]
    return np.column_stack((timestamps, pp_values))

In [None]:
# One (timestamp, pp_raw) table per retained user.
real_table_for_all = {user: CREATE_REAL_USER_TABLE(user) for user in big_user_ids}

In [None]:
def CREATE_EST_USER_TABLE(one_user_id):
    """Build a two-column table of one user's estimated pp history.

    Reads the user's entries from the module-level ``est_pp`` mapping.

    Args:
        one_user_id: key into ``est_pp``.

    Returns:
        A 2-D numpy array with one row per entry: column 0 holds the entry's
        "date" value, column 1 its nested ``["mlpp"]["est_user_pp"]`` value.
    """
    records = est_pp[one_user_id]
    dates = [record["date"] for record in records]
    estimates = [record["mlpp"]["est_user_pp"] for record in records]
    return np.column_stack((dates, estimates))

In [None]:
# One (date, est_user_pp) table per retained user.
est_table_for_all = {user: CREATE_EST_USER_TABLE(user) for user in big_user_ids}

# Calculate error

In [None]:
def GET_REAL_NPOINTS(n, user):
    """Resample a user's recorded pp curve at ``n`` evenly spaced times.

    The sample times are ``start + i * (end - start) / n`` for ``i = 1..n``,
    where ``start`` and ``end`` come from ``time_comparison[user]``. Building
    the grid from that shared window (rather than from this series' own first
    and last in-window points) means sample ``i`` refers to the same instant
    as sample ``i`` of any other series resampled over the same window, which
    is what an element-wise comparison needs.

    Values are linearly interpolated from the user's complete table in
    ``real_table_for_all``, so a sample between the window edge and the first
    point inside it is interpolated from the neighbouring point outside the
    window, and a window that contains no recorded point no longer raises.

    Args:
        n: number of samples to produce; must be positive.
        user: key into ``real_table_for_all`` and ``time_comparison``.

    Returns:
        1-D numpy float array of length ``n``.

    Raises:
        ValueError: if the user's table has no rows.
    """
    window = time_comparison[user]
    start_ts = window["start"].timestamp()
    end_ts = window["end"].timestamp()
    step = (end_ts - start_ts) / n
    sample_times = start_ts + step * np.arange(1, n + 1)

    table = real_table_for_all[user]
    if len(table) == 0:
        raise ValueError(f"no recorded pp data for user {user!r}")

    # Column 0 holds datetime objects; np.interp needs plain floats (epoch seconds).
    known_times = np.array([moment.timestamp() for moment in table[:, 0]], dtype=float)
    known_pp = table[:, 1].astype(float)
    return np.interp(sample_times, known_times, known_pp)

In [None]:
def GET_EST_NPOINTS(n, user):
    """Resample a user's estimated pp curve at ``n`` evenly spaced times.

    The sample times are ``start + i * (end - start) / n`` for ``i = 1..n``,
    where ``start`` and ``end`` come from ``time_comparison[user]``. Building
    the grid from that shared window (rather than from this series' own first
    and last in-window points) means sample ``i`` refers to the same instant
    as sample ``i`` of any other series resampled over the same window, which
    is what an element-wise comparison needs.

    Values are linearly interpolated from the user's complete table in
    ``est_table_for_all``, so a sample between the window edge and the first
    point inside it is interpolated from the neighbouring point outside the
    window, and a window that contains no estimated point no longer raises.

    Args:
        n: number of samples to produce; must be positive.
        user: key into ``est_table_for_all`` and ``time_comparison``.

    Returns:
        1-D numpy float array of length ``n``.

    Raises:
        ValueError: if the user's table has no rows.
    """
    window = time_comparison[user]
    start_ts = window["start"].timestamp()
    end_ts = window["end"].timestamp()
    step = (end_ts - start_ts) / n
    sample_times = start_ts + step * np.arange(1, n + 1)

    table = est_table_for_all[user]
    if len(table) == 0:
        raise ValueError(f"no estimated pp data for user {user!r}")

    # Column 0 holds datetime objects; np.interp needs plain floats (epoch seconds).
    known_times = np.array([moment.timestamp() for moment in table[:, 0]], dtype=float)
    known_pp = table[:, 1].astype(float)
    return np.interp(sample_times, known_times, known_pp)

### MSE

In [None]:
def GET_MSE(n, user):
    """Mean squared error between a user's real and estimated pp samples.

    Args:
        n: number of samples requested from each resampling helper.
        user: user key forwarded to ``GET_REAL_NPOINTS`` / ``GET_EST_NPOINTS``.

    Returns:
        The mean of the element-wise squared differences.
    """
    residuals = GET_REAL_NPOINTS(n, user) - GET_EST_NPOINTS(n, user)
    return np.square(residuals).mean()

In [None]:
# MSE per retained user, using 50 resampled points per curve.
mse_for_all = {user: GET_MSE(50, user) for user in big_user_ids}

### Difference in area

In [None]:
def GET_AREA(n, user, intervals):
    """Approximate the area between a user's real and estimated pp curves.

    The area is a Riemann sum over the ``n`` resampled points: each sample
    contributes ``|real - est|`` times the width of one sample step. The
    result is divided by 86400 to express the time axis in days.

    The step width is ``(end - start) / n``. It was previously derived from
    ``intervals`` while the sum still ran over ``n`` samples, which scaled the
    result by ``n / intervals`` whenever the two differed.

    Args:
        n: number of samples (and therefore rectangles) in the sum.
        user: key into ``time_comparison``; also forwarded to
            ``GET_REAL_NPOINTS`` / ``GET_EST_NPOINTS``.
        intervals: ignored; kept only so existing calls keep working.

    Returns:
        The summed absolute difference times the step width, in pp * days.
    """
    seconds_per_day = 86400
    window = time_comparison[user]
    span_seconds = window["end"].timestamp() - window["start"].timestamp()
    # One rectangle per sample, so the width must come from n.
    width = span_seconds / n

    gaps = np.abs(GET_REAL_NPOINTS(n, user) - GET_EST_NPOINTS(n, user))
    return gaps.sum() * width / seconds_per_day

In [None]:
# Area between the curves per retained user (n=10000 samples, intervals=1000).
area_for_all = {user: GET_AREA(10000, user, 1000) for user in big_user_ids}

### Dynamic Time Warping

In [None]:
def GET_DTW(user):
    """Dynamic-time-warping distance between a user's real and estimated pp.

    Both series are restricted to the rows whose time lies inside
    ``time_comparison[user]`` (inclusive on both ends) and then compared with
    ``fastdtw`` using the Euclidean distance.

    Args:
        user: key into ``real_table_for_all``, ``est_table_for_all`` and
            ``time_comparison``.

    Returns:
        The distance component (first element) of the ``fastdtw`` result.
    """
    window = time_comparison[user]

    def pp_within_window(table):
        # Select the pp column for rows inside the shared window.
        times = table[:, 0]
        inside = (times >= window["start"]) & (times <= window["end"])
        # Shape (k, 1): each observation becomes a 1-element vector. `euclidean`
        # is then called on 1-D vectors rather than bare scalars, which recent
        # SciPy releases reject; the distance itself is still |a - b|.
        return table[inside, 1].astype(float).reshape(-1, 1)

    real_series = pp_within_window(real_table_for_all[user])
    est_series = pp_within_window(est_table_for_all[user])
    return fastdtw(real_series, est_series, dist=euclidean)[0]

In [None]:
# DTW distance per retained user.
dtw_for_all = {user: GET_DTW(user) for user in big_user_ids}

### Merge the errors calculated by the three methods into one dataframe

In [None]:
# One column per error metric; rows are aligned on the user ids.
error_columns = {
    'mse_for_all': pd.Series(mse_for_all),
    'area_for_all': pd.Series(area_for_all),
    'dtw_for_all': pd.Series(dtw_for_all),
}
error_df = pd.DataFrame(error_columns)
error_df