In [None]:
# Do not run this notebook while the server is running.
# https://stackoverflow.com/questions/59119396/how-to-use-django-3-0-orm-in-a-jupyter-notebook-without-triggering-the-async-con
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as sm
from django.db.models import Count
from IPython.display import display

from backend.models import UserPreferences, VideoRating
from backend.rating_fields import VIDEO_FIELDS

In [None]:
# Count the related 'expertrating' rows for each UserPreferences entry and
# list the users from the highest to the lowest count.
# `Count` is django.db.models.Count; it must be imported or this cell raises NameError.
ratings_per_user = (
    UserPreferences.objects
    .annotate(n_ratings=Count('expertrating'))
    .values('n_ratings', 'user__username')
)
df = pd.DataFrame(list(ratings_per_user))
df.sort_values('n_ratings', ascending=False)

In [None]:
# Select one user's ratings: every rating field plus metadata of the rated video.
username = 'lenhoang'
video_metadata_fields = [
    'video__video_id',
    'video__views',
    'video__uploader',
    'video__language',
    'video__duration',
    'video__publication_date',
    'video__name',
    'video__description',
]
ratings = (
    VideoRating.objects
    .filter(user__user__username=username)
    .values(*VIDEO_FIELDS, *video_metadata_fields)
)

In [None]:
# Evaluate the queryset once and turn its rows (one dict per rating) into a frame.
rating_rows = list(ratings)
df = pd.DataFrame(rating_rows)

In [None]:
# Distinct values taken by the two fields that are removed in the next step.
tuple(np.unique(df[column]) for column in ('diversity_inclusion', 'layman_friendly'))

In [None]:
# Remove the diversity_inclusion and layman_friendly columns.
# errors='ignore' keeps the cell re-runnable: `del df[col]` raises KeyError
# when the cell is executed a second time on the same frame.
df = df.drop(columns=['diversity_inclusion', 'layman_friendly'], errors='ignore')

In [None]:
# Rating fields that are still columns of df, de-duplicated and kept in
# VIDEO_FIELDS order. A list (not a set) is used because pandas >= 2.0 rejects
# a set as a column indexer, and because a fixed order makes the plots and
# regression loops below reproducible from run to run.
VF_ACTIVE = [field for field in dict.fromkeys(VIDEO_FIELDS) if field in df.columns]

In [None]:
# Interactive docstring lookup for df.describe (IPython `?` help syntax, not plain Python).
?df.describe

In [None]:
# Summary statistics for every column of df, not only the numeric ones.
df.describe(include='all')

In [None]:
# Per-uploader mean of every numeric column.
# numeric_only=True is required because df also holds text and date columns:
# pandas >= 2.0 raises TypeError when asked to average them instead of
# silently dropping them as older versions did.
df_by_uploader = df.groupby('video__uploader').mean(numeric_only=True)

In [None]:
# For each active rating field, show the uploaders ranked by their mean score
# on that field (highest first).
for field in VF_ACTIVE:
    print(field)
    ranked = df_by_uploader.sort_values(by=field, ascending=False)
    display(ranked)

In [None]:
# Pairwise view of the active rating fields: scatter plots above the diagonal,
# density estimates below it and on the diagonal.
# list(...) is needed because pandas >= 2.0 refuses a set as a column indexer.
g = sns.PairGrid(df[list(VF_ACTIVE)], diag_sharey=False)
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot)

In [None]:
# Encode each video's language as the position of its string form in the
# sorted list of distinct languages (missing values become the string 'None').
language_labels = [str(language) for language in df.video__language]
langs = sorted(np.unique(language_labels))
lang_to_id = {label: position for position, label in enumerate(langs)}
df['video__language__id'] = [lang_to_id[label] for label in language_labels]

In [None]:
# Video duration in seconds and its base-10 logarithm.
# pd.to_timedelta turns a missing duration into NaT (-> NaN seconds) instead of
# crashing on `None.total_seconds()`.
duration_s = pd.to_timedelta(df['video__duration']).dt.total_seconds()
df['video__duration__s'] = duration_s
# Non-positive durations are masked to NaN so the log never yields -inf,
# which the OLS fits further down cannot handle.
df['video__duration__s__log'] = np.log10(duration_s.where(duration_s > 0))

In [None]:
# Age of each video in days and its base-10 logarithm.
# `today` is computed once so every row is measured against the same date.
today = datetime.date.today()
days_ago = pd.Series(
    # A missing publication date is treated as 1 day old.
    [(today - published).days if published else 1 for published in df['video__publication_date']],
    index=df.index,
    dtype='int64',
)
df['video__publication_date__days_ago'] = days_ago
# Videos published today (0 days) or with a future date are masked to NaN so
# the log never yields -inf or an invalid-value warning.
df['video__publication_date__days_ago__log'] = np.log10(days_ago.where(days_ago > 0))

In [None]:
# Base-10 logarithm of the view count.
# Videos with zero views are masked to NaN first: log10(0) is -inf, which
# makes the OLS fits further down fail, whereas NaN rows are simply dropped.
views = pd.to_numeric(df['video__views'])
df['video__views__log'] = np.log10(views.where(views > 0))

In [None]:
# Axes for the metadata-vs-rating grid: derived video metadata columns on x,
# the active rating fields on y.
x_vars = [
    'video__views__log',
    'video__language__id',
    'video__duration__s__log',
    'video__publication_date__days_ago__log',
]
y_vars = VF_ACTIVE

In [None]:
# Density plot of every rating field (rows) against every metadata column (columns).
# sns.scatterplot can be mapped instead of sns.kdeplot to see the raw points.
g = sns.PairGrid(data=df, x_vars=x_vars, y_vars=y_vars)
g.map(sns.kdeplot)

In [None]:
# Legend for video__language__id: (id, language) pairs.
list(enumerate(langs))

In [None]:
# Regress each active rating field on the video metadata features and show the fit.
# NOTE(review): video__language__id is an alphabetical index entered as a plain
# numeric regressor; consider C(video__language__id) if a categorical effect is intended.
metadata_rhs = ('video__views__log + video__language__id +'
                'video__duration__s__log + video__publication_date__days_ago__log')
for f in VF_ACTIVE:
    print(f)
    model = sm.ols(formula=f'{f} ~ {metadata_rhs}', data=df)
    r = model.fit()
    display(r.summary())

In [None]:
# Regress each active rating field on all the other active rating fields.
for f in VF_ACTIVE:
    print(f)
    # Sorted so the predictor order (and hence the row order of the summary
    # table) does not depend on set iteration order, which varies per kernel.
    others = '+'.join(sorted(set(VF_ACTIVE).difference([f])))
    r = sm.ols(formula=f'{f} ~ {others}', data=df).fit()
    display(r.summary())

In [None]:
# Correlation matrix of the active rating fields, drawn as a heatmap.
# list(...) is needed because pandas >= 2.0 refuses a set as a column indexer.
rating_corr = df[list(VF_ACTIVE)].corr()
sns.heatmap(rating_corr)