In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append('../..')
from pymongo import UpdateOne
from pymongo import MongoClient
from tqdm import tqdm
import numpy as np
from exploration.config import mongo_inst
from mlpp.data_collection.sample import osuDumpSampler
import datetime
from datetime import datetime
import pprint
import matplotlib.pyplot as plt
import pickle 
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from scipy import stats

In [None]:
# Setup step: runs `pipenv install seaborn` in a shell so that the
# `import seaborn as sns` cell that follows can succeed.
# NOTE(review): no version is pinned here — confirm whether one should be.
!pipenv install seaborn

In [None]:
import seaborn as sns

# Preparation

In [None]:
# Connect to MongoDB and collect the `_id` of every document in osu_user_stats.
# NOTE(review): `client` is not used anywhere in this cell (queries go through
# `mongo_inst`) — confirm whether it is still needed.
client = MongoClient(port=27017)
top_db = mongo_inst["osu_top_db"]
user_ids = [doc['_id'] for doc in top_db['osu_user_stats'].find({}, {})]

# Load the precomputed pickles. Context managers close the file handles as soon
# as loading finishes instead of leaking them for the lifetime of the kernel.
# NOTE: pickle.load can execute arbitrary code — only load files produced by
# this project.
with open("big_user_file.obj", "rb") as big_user_file:
    big_user_ids = pickle.load(big_user_file)
with open("overlap_time_big_user_file.obj", "rb") as overlap_time_file:
    time_comparison = pickle.load(overlap_time_file)

In [None]:
# Identify the users that don't have the time overlap between real pp and est pp,
# i.e. whose recorded overlap "start" is not strictly before its "end".
bad_users = [
    user
    for user, window in time_comparison.items()
    if window["start"] >= window["end"]
]

# Membership tests against a set keep this filter linear in the number of users
# (the list-based check was quadratic). `bad_users` itself stays a list.
bad_user_set = set(bad_users)
big_user_ids = [ele for ele in big_user_ids if ele not in bad_user_set]

In [None]:
# Read the per-user error comparison table and give its columns fixed names.
df = pd.read_csv("error_df")
df.columns = ["user_id", "mse", "area", "dtw"]
# sqrt_mse is the element-wise square root of the mse column.
df["sqrt_mse"] = np.sqrt(df["mse"])

In [None]:
# Get rank score
# Fetch every user's rank_score in a single query instead of issuing one
# round trip to MongoDB per user id.
rank_score_by_id = {
    doc["_id"]: doc["rank_score"]
    for doc in top_db.osu_user_stats.find(
        {"_id": {"$in": list(big_user_ids)}},
        {"rank_score": 1},  # `_id` is included by default
    )
}

# Rebuild the mapping in big_user_ids order. A user id with no stats document
# raises KeyError here, so missing data still fails loudly rather than being
# silently dropped.
user_rank_score = {i: rank_score_by_id[i] for i in big_user_ids}

rank_score_df = pd.DataFrame({
    "user_id": list(user_rank_score.keys()),
    "rank_score": list(user_rank_score.values()),
})

# Calculate the means & z-scores 

In [None]:
# Mean of each raw error metric (one value per column).
col_mean = df.loc[:, ["mse", "area", "dtw", "sqrt_mse"]].mean()
col_mean

In [None]:
# Normalize errors: add a `<metric>_zscore` column for each raw error metric.
# The metric list is explicit (rather than "every column except user_id") so
# that re-running this cell, or running it after other columns have been added
# to df, never z-scores an existing *_zscore column or an unrelated column.
cols = ["mse", "area", "dtw", "sqrt_mse"]

for col in cols:
    col_zscore = col + '_zscore'
    # ddof=0 -> population standard deviation
    df[col_zscore] = (df[col] - df[col].mean()) / df[col].std(ddof=0)

# Preview only the first rows instead of dumping the whole frame.
df.head()

# Advanced Five Number Summary

In [None]:
# Summary statistics for every column except the id and the z-score columns.
subset = list(df.columns)
for excluded in ("user_id", "mse_zscore", "area_zscore", "dtw_zscore", "sqrt_mse_zscore"):
    # list.remove raises ValueError if the column is absent
    subset.remove(excluded)
df[subset].describe()

# Visualization

In [None]:
# add rank_score
# Drop any rank_score column left over from a previous run of this cell first:
# merging twice would otherwise produce rank_score_x / rank_score_y and break
# every later cell that reads df['rank_score'].
df = df.drop(columns="rank_score", errors="ignore").merge(rank_score_df, on="user_id")

In [None]:
# Density of zscores of the errors using three methods
fig, axes = plt.subplots(1, 3, figsize=(20,8))

# One (column, x-axis label, title) entry per panel, left to right.
# The first title says "Sqrt MSE" because that panel plots sqrt_mse_zscore,
# not the raw MSE.
hist_panels = [
    ("sqrt_mse_zscore", 'Sqrt MSE Error Z-score', 'Histogram of Sqrt MSE'),
    ("area_zscore", 'Area Error Z-score', 'Histogram of Area Error'),
    ("dtw_zscore", 'DTW Error Z-score', 'Histogram of DTW'),
]

for ax, (column, xlabel, title) in zip(axes, hist_panels):
    sns.histplot(ax=ax, data=df, x=column, kde=True)
    # Same x-range on every panel so the three distributions are comparable.
    ax.set(xlim=(-1, 5), xlabel=xlabel, ylabel='Count', title=title)
    # Vertical reference line at z = 0 (the mean error).
    ax.axvline(0, ymax=0.9, color="#ee6f57")

# Render the figure without echoing the last artist's repr.
plt.show()

All three histograms are strongly right-skewed, with the large majority of the z-scores lying near 0.0. This suggests the error between the real pp curve and the estimated pp curve is fairly consistent across users, since many users have around the same amount of error. By looking at the graph of the first user, we can see there's a negative vertical shift before 2016-12 and a positive vertical shift after that. The DTW error z-scores have a narrower range than the other two.

Many of the z-scores are negative, which indicates that a lot of the raw errors are below the mean error. There are also outliers in the right tail whose errors are much greater than the mean error.

In [None]:
# Scatter each error z-score against rank_score, one panel per metric.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(15,6))

zscore_columns = ("sqrt_mse_zscore", "area_zscore", "dtw_zscore")
for panel, zscore_column in zip((ax1, ax2, ax3), zscore_columns):
    panel.scatter(df['rank_score'], df[zscore_column], marker="x")
    panel.set_title(zscore_column + " vs rank_score")
    panel.set_xlabel('rank_score')
    panel.set_ylabel(zscore_column)
    panel.grid(True)

From the scatter plots, we can see there are many more data points at the lower end of the rank score. As the rank score increases, there are fewer data points. We can see the greatest outlier lies in the middle of the rank_score range.

All three scatter plots show that the error between the real pp and the estimated pp stays mostly consistent as rank score increases, so rank doesn't seem to have an effect on the typical error. However, the outliers with large errors seem to occur more often among users with low rank scores.