In [1]:
# Do not run this notebook while the server is running (DJANGO_ALLOW_ASYNC_UNSAFE is enabled below)
# https://stackoverflow.com/questions/59119396/how-to-use-django-3-0-orm-in-a-jupyter-notebook-without-triggering-the-async-con
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import json
import pickle
import numpy as np
from django.contrib.auth.models import User as DjangoUser
from backend.models import UserPreferences, Video, VideoRating
from matplotlib import pyplot as plt
from backend.rating_fields import VIDEO_FIELDS
from tqdm.auto import tqdm
from IPython.display import HTML, display
import pandas as pd

# Loading results from `ray`

In [2]:
# Directory that holds the `experiment_*` sub-directories read by load_results()
results_dir = '/root/ray_results_softplus_loss/featureless_tournesol_lambda'
# Pickle file used to cache the output of load_results() inside results_dir
fn_pickle = results_dir + '/results.pkl'
# Usernames passed to load_results() as `usernames_set`
usernames_to_export = ['__aggregate_expert__', 'le_science4all', 'aidjango', 'sergei']

In [3]:
def load_results(results_dir, usernames_set=None):
    """Load the results from all experiments in a directory.

    Scans ``results_dir`` for entries whose name starts with ``experiment_``.
    An experiment is skipped unless it has both ``params.json`` and
    ``checkpoint_50000/learner_ckpt.pkl``.

    Args:
        results_dir: directory containing the ``experiment_*`` sub-directories.
        usernames_set: iterable of usernames whose scores should be exported
            (``None`` is treated as an empty set). The last entry of each
            checkpoint's expert list is always exported as well.

    Returns:
        dict mapping the experiment directory name to a dict with the keys
        ``params``, ``objects``, ``features``, ``experts``, ``experts_export``
        (names of the exported experts as strings) and, per exported expert,
        ``scores:<name>``: a float32 array with one row per object and one
        column per feature, where missing scores are NaN.
    """
    exps = sorted(x for x in os.listdir(results_dir) if x.startswith('experiment_'))
    result = {}

    usernames_set = set() if usernames_set is None else set(usernames_set)

    # UserPreferences id -> username, restricted to the requested usernames
    id_to_username = {x.id: x.user.username for x in UserPreferences.objects.all()
                      if x.user.username in usernames_set}

    for exp in tqdm(exps):
        params_path = os.path.join(results_dir, exp, 'params.json')

        if not os.path.isfile(params_path):
            continue

        with open(params_path, 'r') as f:
            params = json.load(f)

        ckpt_path = os.path.join(results_dir, exp, 'checkpoint_50000', 'learner_ckpt.pkl')

        if not os.path.isfile(ckpt_path):
            continue

        # NOTE: pickle.load can execute arbitrary code; only point this at
        # checkpoints from a trusted source.
        with open(ckpt_path, 'rb') as f:
            ckpt = pickle.load(f)

        all_ratings = ckpt['aggregator']['ratings']
        idx = all_ratings['layer']['idx']
        data = all_ratings['data']
        objects = all_ratings['objects']
        features = all_ratings['features']
        experts = all_ratings['experts']

        # the last expert in the list is always exported, even without a username
        common_expert = experts[-1]

        # Position lookup tables built once per experiment instead of calling
        # list.index() for every (video, expert) pair. setdefault keeps the
        # FIRST position of a value, which is what list.index() returns.
        object_pos = {}
        for pos, vid in enumerate(objects):
            object_pos.setdefault(vid, pos)
        expert_pos = {}
        for pos, expert in enumerate(experts):
            expert_pos.setdefault(expert, pos)

        def get_video_scores(video_id, expert):
            """Return the float32 score vector (one entry per feature) of a video for an expert."""
            video_pos = object_pos[video_id]
            expert_id = expert_pos[expert]
            video_scores = []
            for feature_id, _ in enumerate(features):
                try:
                    idx_v = idx.get_key((expert_id, video_pos, feature_id))
                    score_v = data[idx_v]
                except KeyError:
                    # no score stored for this (expert, video, feature) triple
                    score_v = None
                video_scores.append(score_v)
            # None entries become NaN in the float32 array
            return np.array(video_scores, dtype=np.float32)

        result[exp] = {'params': params,
                       'objects': objects,
                       'features': features,
                       'experts': experts}

        experts_export = []
        for expert in tqdm(experts, leave=False):
            if (expert not in id_to_username) and (expert != common_expert):
                continue
            # experts without a known username keep their raw identifier as name
            expert_name = id_to_username.get(expert, expert)
            scores = np.array([get_video_scores(vid, expert=expert) for vid in objects])
            result[exp]['scores:' + str(expert_name)] = scores
            experts_export.append(str(expert_name))

        result[exp]['experts_export'] = experts_export

    return result

In [7]:
# computing results and saving the pickle file
# load_results() reads every experiment under results_dir; the result is cached
# in fn_pickle so that it can be reloaded without recomputing.
results = load_results(results_dir, usernames_set=usernames_to_export)
with open(fn_pickle, 'wb') as f:
    pickle.dump(results, f)

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2104.0), HTML(value='')))




In [4]:
# Reload the cached results written by the pickle.dump cell.
# NOTE: pickle.load can execute arbitrary code; only load a file you produced yourself.
with open(fn_pickle, 'rb') as f:
    results = pickle.load(f)

# Analyze via top-5 and bottom-5 tables for hyperparameters

In [9]:
def write_result_to_html(result, top_bot_n=5):
    """Get processed results into .html for lambda and mu.

    For every field in ``VIDEO_FIELDS`` and every exported expert of the first
    run, writes ``results_<field>_<username>.html`` into the current working
    directory. Each file holds a grid (rows: lambda, columns: mu) whose cells
    list the ``top_bot_n`` best and ``top_bot_n`` worst scored videos of the
    run with these hyperparameters.

    Args:
        result: dict as returned by ``load_results``.
        top_bot_n: number of videos shown at the top and at the bottom.

    Returns:
        list of the written file names (empty when ``result`` is empty).

    Note:
        The page header also prints the module-level ``results_dir``.
    """
    # Function-scope import: the file-level imports do not provide an HTML escaper.
    from html import escape

    out_files = []

    # nothing to render; avoids indexing into an empty list of runs below
    if not result:
        return out_files

    video_to_info = {v.video_id: {'name': v.name, 'uploader': v.uploader}
                     for v in Video.objects.all()}

    # all runs must share the same videos and the same features
    all_objects = [x['objects'] for x in result.values()]
    assert all([all_objects[0] == o for o in all_objects])
    all_features = [x['features'] for x in result.values()]
    assert all([all_features[0] == o for o in all_features])

    def get_title(vid):
        """Return (uploader, name) of a video, or (None, None) when unknown."""
        if vid not in video_to_info or video_to_info[vid]['uploader'] is None:
            return None, None
        return video_to_info[vid]['uploader'], video_to_info[vid]['name']

    def cell_text(value, max_len=None):
        """Return HTML-escaped cell text; None/NaN become an empty string."""
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return ''
        text = str(value)
        if max_len is not None:
            # truncate before escaping so that an entity is never cut in half
            text = text[:max_len]
        return escape(text)

    # video metadata is identical for every run, so it is looked up only once
    object_ids = list(all_objects[0])
    titles = [get_title(vid) for vid in object_ids]
    uploaders = [uploader for uploader, _ in titles]
    names = [name for _, name in titles]

    first_run = list(result.keys())[0]

    for f in tqdm(VIDEO_FIELDS):

        for username in result[first_run]['experts_export']:

            mus = set()
            lambdas = set()

            # (lam, mu) -> (table with top rows, separator row, bottom rows; number of top rows)
            df_by_param = {}

            for res in result.values():

                params = res['params']
                scores = res[f'scores:{username}']

                lam = params['_gin__lam__grid_search']
                mu = params['_gin__mu__grid_search']

                mus.add(mu)
                lambdas.add(lam)

                df = pd.DataFrame(scores, columns=all_features[0], index=object_ids)
                df['name'] = names
                df['uploader'] = uploaders
                df['video_id'] = list(df.index)

                # only videos that have a score for this feature
                df = df[~pd.isna(df[f])]

                # head/tail (unlike [-n:]) also behave for top_bot_n == 0
                df_sorted = df.sort_values(f, ascending=False)
                df_top = df_sorted.head(top_bot_n)
                df_bot = df_sorted.tail(top_bot_n)
                separator = pd.DataFrame([{'video_id': "",
                                           'name': "",
                                           'uploader': ""}], columns=df_top.columns)
                df_by_param[(lam, mu)] = (pd.concat([df_top, separator, df_bot]), len(df_top))

            page = ''
            page += f'<h3>username: {username} feature={f}</h3><hr />'
            page += f"<p>runs: {len(result)} first run: {first_run}</p>"
            page += f"<p>directory: {results_dir}</p>"
            page += '<table>'

            page += '<tr><th></th>'
            for mu in sorted(mus):
                page += f'<th>mu={mu}</th>'
            page += '</tr>'

            for lam in sorted(lambdas):
                page += "<tr>"

                page += f"<th>lam={lam}</th>"

                for mu in sorted(mus):
                    key = (lam, mu)
                    if key not in df_by_param:
                        page += "<td>--</td>"
                        continue

                    table, n_top = df_by_param[key]

                    df_html = f'<table border="1"><tr><th>vid id</th><th>Uploader</th><th>Video title</th><th>{f}</th></tr>'

                    for i, (_, row) in enumerate(table.iterrows()):
                        # rows before the separator are the top videos, rows after it the bottom ones
                        if i < n_top:
                            color = 'green'
                        elif i == n_top:
                            color = ''
                        else:
                            color = 'red'

                        video_id = row['video_id']
                        df_html += '<tr>'
                        df_html += (f'<td><a href="https://www.youtube.com/watch?v={cell_text(video_id)}">'
                                    f'{cell_text(video_id, max_len=5)}</a></td>')
                        df_html += f'<td>{cell_text(row["uploader"])}</td>'
                        df_html += f'<td>{cell_text(row["name"], max_len=20)}</td>'
                        df_html += f'<td><font color="{color}">{row[f]:.3e}</font></td>'
                        df_html += '</tr>'

                    df_html += '</table>'

                    # custom pandas html
                    page += f"<td>{df_html}</td>"

                page += "</tr>"

            page += '</table>'

            fn = f'results_{f}_{username}.html'

            # video titles may contain non-ASCII characters
            with open(fn, 'w', encoding='utf-8') as f_out:
                f_out.write(page)

            out_files.append(fn)

    return out_files

In [117]:
# Writes the results_<feature>_<username>.html files into the current working
# directory; the returned list of file names is displayed as the cell output.
write_result_to_html(results)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




['results_largely_recommended_sergei.html',
 'results_largely_recommended_le_science4all.html',
 'results_largely_recommended_aidjango.html',
 'results_largely_recommended___aggregate_expert__.html',
 'results_reliability_sergei.html',
 'results_reliability_le_science4all.html',
 'results_reliability_aidjango.html',
 'results_reliability___aggregate_expert__.html',
 'results_importance_sergei.html',
 'results_importance_le_science4all.html',
 'results_importance_aidjango.html',
 'results_importance___aggregate_expert__.html',
 'results_engaging_sergei.html',
 'results_engaging_le_science4all.html',
 'results_engaging_aidjango.html',
 'results_engaging___aggregate_expert__.html',
 'results_pedagogy_sergei.html',
 'results_pedagogy_le_science4all.html',
 'results_pedagogy_aidjango.html',
 'results_pedagogy___aggregate_expert__.html',
 'results_layman_friendly_sergei.html',
 'results_layman_friendly_le_science4all.html',
 'results_layman_friendly_aidjango.html',
 'results_layman_friendly_

# Analyzing via reference point videos

In [5]:
import plotly.io as pio
import plotly.express as px

# Configure plotly's orca settings: use xvfb and an explicit path to the
# orca executable.
pio.orca.config.use_xvfb = True
pio.orca.config.executable = '/root/miniconda3/bin/orca'

# Save the orca configuration set above
pio.orca.config.save() 

In [6]:
def scores_as_dict(result, hparam_run, feature, username):
    """Return a mapping video_id -> score of one feature for one user in one run.

    Videos whose score for ``feature`` is missing (NaN) are left out.
    """
    run = result[hparam_run]
    feature_names = run['features']
    video_ids = run['objects']
    user_scores = run[f'scores:{username}']

    # rows are videos, columns are features
    table = pd.DataFrame(user_scores, columns=feature_names, index=video_ids)

    has_score = ~pd.isna(table[feature])
    return dict(table[has_score][feature])

def describe_hparams(hparam_run_dct):
    """Build a compact ``name=value/name=value`` label for the hyperparameters of a run.

    Only entries of ``hparam_run_dct['params']`` whose key starts with ``_gin``
    are used. The label shows the second ``__``-separated part of the key and
    the value formatted with one decimal, zero-padded to four characters.
    Entries are joined in alphabetical order of the short names.
    """
    formatted = {}
    for gin_name, value in hparam_run_dct['params'].items():
        # only keeping hypers
        if not gin_name.startswith('_gin'):
            continue
        short_name = gin_name.split('__')[1]
        formatted[short_name] = ("%.01f" % value).zfill(4)

    return '/'.join(f'{name}={formatted[name]}' for name in sorted(formatted))

def get_ranking(score_dct):
    """Turn video_id -> score into video_id -> quantile (0=best, 1=worst).

    The quantiles are evenly spaced between 0 and 1; the highest score gets 0.
    """
    # video ids in a fixed (sorted) order, with their scores aligned to it
    ordered_ids = sorted(score_dct)
    ordered_scores = [score_dct[vid] for vid in ordered_ids]

    # positions in ordered_ids, from the best score to the worst one
    best_to_worst = np.argsort(ordered_scores)[::-1]

    # one quantile per video
    quantiles = np.linspace(0.0, 1.0, len(ordered_ids))

    return {ordered_ids[pos]: quantile for pos, quantile in zip(best_to_worst, quantiles)}

def filter_selected_videos(ranking_by_hparam, selected_ids):
    """Restrict every per-hyperparameter ranking to the videos in ``selected_ids``.

    When ``selected_ids`` is None the input is returned unchanged (same object).
    """
    if selected_ids is None:
        return ranking_by_hparam

    filtered = {}
    for hp, ranks in ranking_by_hparam.items():
        filtered[hp] = {vid: ranks[vid] for vid in ranks if vid in selected_ids}
    return filtered

In [7]:
def get_ranking_by_hparam_df(result, feature, username, videos_output):
    """Get the dataframe videos x ranks in different hparams.

    Args:
        result: dict as returned by ``load_results``.
        feature: name of the feature whose scores are ranked.
        username: exported expert name (the ``scores:<username>`` entry is used).
        videos_output: collection of video ids to keep, or None to keep all.

    Returns:
        DataFrame with one column per hyperparameter description (see
        ``describe_hparams``) and one row per kept video; the values are the
        ranking quantiles produced by ``get_ranking``.

    Raises:
        AssertionError: if the runs differ in their objects or features, or if
            they do not have scores for the same set of videos.
    """
    # all runs must share the same videos and the same features
    all_objects = [x['objects'] for x in result.values()]
    assert all([all_objects[0] == o for o in all_objects])
    all_features = [x['features'] for x in result.values()]
    assert all([all_features[0] == o for o in all_features])

    # NOTE(review): runs whose hyperparameters round to the same description
    # would overwrite each other in this dict -- confirm the grid is coarse enough.
    scores_by_hparam = {describe_hparams(result[x]): scores_as_dict(result, x, feature, username)
                        for x in result.keys()}

    video_id_sets = [set(scores.keys()) for scores in scores_by_hparam.values()]

    # checking that have scores for the same videos
    # (all(...) is required: asserting the bare list is true for any non-empty list)
    assert all(video_id_sets[0] == s for s in video_id_sets), \
        "runs have scores for different sets of videos"

    # Duplicate video ids cannot occur here: the ids are dictionary keys.

    ranking_by_hparam = {hp: get_ranking(score) for hp, score in scores_by_hparam.items()}

    df_ranking_parallel_coords = pd.DataFrame(filter_selected_videos(ranking_by_hparam, videos_output))

    return df_ranking_parallel_coords

In [8]:
def plot_ranking_px(results, feature, username, videos_output):
    """Write a parallel-coordinates PDF of video rankings across hyperparameters.

    The figure is saved as ``results_pxpc_<username>_<feature>_<N>vid.pdf``
    (``all-vid`` instead of ``<N>vid`` when ``videos_output`` is None). When the
    file already exists nothing is plotted and None is returned; otherwise the
    plotly figure is returned.
    """
    if videos_output is None:
        videos_descr = 'all-vid'
    else:
        videos_descr = f'{len(videos_output)}vid'
    fn_out = f"results_pxpc_{username}_{feature}_{videos_descr}.pdf"

    # existing figures are never regenerated
    if os.path.isfile(fn_out):
        print(fn_out, 'already exists')
        return

    # negated quantiles: best = 0.0, worst = -1.0
    # (presumably so that the best video ends up at the top of each axis)
    ranks = -get_ranking_by_hparam_df(results, feature, username, videos_output)

    hparam_columns = [col for col in ranks.columns if col != 'color']

    # artificial first and last rows that pin both ends of every axis
    best_row = pd.DataFrame({col: 0.0 for col in hparam_columns}, index=['best'])
    worst_row = pd.DataFrame({col: -1.0 for col in hparam_columns}, index=['worst'])
    plot_df = pd.concat([best_row, ranks, worst_row])

    # every row gets its own value on the color scale
    n_rows = len(plot_df)
    plot_df['color'] = np.linspace(1, n_rows, n_rows)

    fig = px.parallel_coordinates(
        plot_df,
        color="color",
        dimensions=sorted(hparam_columns),
        color_continuous_scale=px.colors.sequential.Turbo,
        width=1200,
        height=500,
    )

    # the colorbar is labelled with the row index, skipping the two artificial rows
    colorbar = dict(
        title=f"Videos by {feature}",
        tickvals=np.arange(1, n_rows + 1)[1:-1],
        ticktext=plot_df.index[1:-1],
        lenmode="pixels", len=400,
    )
    fig.update_layout(coloraxis_colorbar=colorbar)

    fig.write_image(fn_out)

    return fig

In [9]:
# Example selection: the feature at position 4 of VIDEO_FIELDS
feature = VIDEO_FIELDS[4]
# NOTE(review): this binds the list [1], not a username string -- possibly a
# leftover from indexing a list of usernames; confirm the intended value.
username = [1]
# list of videos to display
videos_output = "EhAemz1v7dQ I5-dI74zxPg lG4VkPoG3ko rStL7niR7gs gPHgRp70H8o".split()
print(feature, username)

pedagogy [1]


In [10]:
# writing all figures...
# One figure per (feature, exported expert, video selection). Each pair gets two
# plots: one restricted to `videos_output` and one with all videos (None).
# The loops rebind the module-level names `feature` and `username`.
for feature in tqdm(VIDEO_FIELDS, desc="feature", leave=False):
    for username in tqdm(list(results.values())[0]['experts_export'], desc="username", leave=False):
        for vids in tqdm([videos_output, None], desc="type", leave=False):
            plot_ranking_px(results, feature, username, videos_output=vids)

HBox(children=(FloatProgress(value=0.0, description='feature', max=10.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_sergei_largely_recommended_5vid.pdf already exists
results_pxpc_sergei_largely_recommended_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_le_science4all_largely_recommended_5vid.pdf already exists
results_pxpc_le_science4all_largely_recommended_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_aidjango_largely_recommended_5vid.pdf already exists
results_pxpc_aidjango_largely_recommended_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc___aggregate_expert___largely_recommended_5vid.pdf already exists
results_pxpc___aggregate_expert___largely_recommended_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_sergei_reliability_5vid.pdf already exists
results_pxpc_sergei_reliability_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_le_science4all_reliability_5vid.pdf already exists
results_pxpc_le_science4all_reliability_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_aidjango_reliability_5vid.pdf already exists
results_pxpc_aidjango_reliability_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc___aggregate_expert___reliability_5vid.pdf already exists
results_pxpc___aggregate_expert___reliability_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_sergei_importance_5vid.pdf already exists
results_pxpc_sergei_importance_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_le_science4all_importance_5vid.pdf already exists
results_pxpc_le_science4all_importance_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_aidjango_importance_5vid.pdf already exists
results_pxpc_aidjango_importance_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc___aggregate_expert___importance_5vid.pdf already exists
results_pxpc___aggregate_expert___importance_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_sergei_engaging_5vid.pdf already exists
results_pxpc_sergei_engaging_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_le_science4all_engaging_5vid.pdf already exists
results_pxpc_le_science4all_engaging_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_aidjango_engaging_5vid.pdf already exists
results_pxpc_aidjango_engaging_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc___aggregate_expert___engaging_5vid.pdf already exists
results_pxpc___aggregate_expert___engaging_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_sergei_pedagogy_5vid.pdf already exists
results_pxpc_sergei_pedagogy_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_le_science4all_pedagogy_5vid.pdf already exists
results_pxpc_le_science4all_pedagogy_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_aidjango_pedagogy_5vid.pdf already exists
results_pxpc_aidjango_pedagogy_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc___aggregate_expert___pedagogy_5vid.pdf already exists
results_pxpc___aggregate_expert___pedagogy_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_sergei_layman_friendly_5vid.pdf already exists
results_pxpc_sergei_layman_friendly_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_le_science4all_layman_friendly_5vid.pdf already exists
results_pxpc_le_science4all_layman_friendly_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc_aidjango_layman_friendly_5vid.pdf already exists
results_pxpc_aidjango_layman_friendly_all-vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

results_pxpc___aggregate_expert___layman_friendly_5vid.pdf already exists


HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='username', max=4.0, style=ProgressStyle(description_width…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

HBox(children=(FloatProgress(value=0.0, description='type', max=2.0, style=ProgressStyle(description_width='in…

NameError: name 'df' is not defined