Wikidata entity completion - item / fr
===

In [None]:
from argparse import ArgumentParser
from gzip import GzipFile
import json
import pickle

import bokeh.io
import bokeh.plotting
from IPython.display import Markdown as md
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm_notebook

import relforge.cli.tf_autocomplete as ac
import relforge.tf_optimizer as opt

# Configure bokeh to render figures inline in the notebook; the plotting
# cells below rely on this when they call bokeh.io.show / bokeh.plotting.show.
bokeh.io.output_notebook()

In [None]:
# Suffix that selects which pair of report files is loaded below.
variant = 'item_fr'

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point these paths at report files that come from a trusted source.
model_path = '/home/ebernhardson/tf-ltr-data/model/model.{}.pkl.gz'.format(variant)
sensitivity_path = '/home/ebernhardson/tf-ltr-data/sensitivity/sensitivity.{}.pkl.gz'.format(variant)

with GzipFile(model_path, 'rb') as f:
    min_report = pickle.load(f)
with GzipFile(sensitivity_path, 'rb') as f:
    sens_report = pickle.load(f)

# Make sure every entry of report.scores is an opt.EvaluationScores instance,
# wrapping any value that is not one already.
for report in min_report.evaluation_reports + [min_report.initial_report]:
    report.scores = dict(
        (k, v if isinstance(v, opt.EvaluationScores) else opt.EvaluationScores(v))
        for k, v in report.scores.items())

In [None]:
# gaussian_kde is imported from the public scipy.stats namespace: the
# scipy.stats.kde submodule path is deprecated in newer SciPy releases.
from scipy.stats import gaussian_kde
from collections import defaultdict

def ridge(bucket, data, scale):
    """Pair each value of ``scale * data`` with the bucket name.

    Returns a list of ``(bucket, value)`` tuples holding at most
    ``len(data)`` entries.
    """
    scaled = scale * data
    # zip against range(len(data)) so the result never outgrows the input.
    return [(bucket, value) for _, value in zip(range(len(data)), scaled)]

def plot_distribution(title, buckets, data, colors):
    """Show a ridge plot with one kernel density estimate per bucket.

    Parameters
    ----------
    title : title of the figure.
    buckets : bucket names; sorted in reverse to form the categorical y range.
    data : dict mapping bucket -> ``(bounds, raw)``. ``raw`` holds the values
        the density is estimated from. ``bounds``, when truthy, provides the
        whisker ends: its first element is the lower end and its last
        element the upper end.
    colors : dict mapping bucket -> fill color of that bucket's ridge.

    The figure is displayed with ``bokeh.io.show``; nothing is returned.
    """
    raw_by_bucket = {bucket: raw for bucket, (_, raw) in data.items()}
    min_x = min(np.min(raw) for raw in raw_by_bucket.values())
    max_x = max(np.max(raw) for raw in raw_by_bucket.values())

    grid = np.linspace(min_x, max_x, 500)
    # The patch glyph needs the outline to start and end at y=0, so the
    # first and last x values are duplicated here; the matching y values
    # are forced to zero inside the loop below.
    x = np.concatenate(([grid[0]], grid, [grid[-1]]))
    source = bokeh.models.ColumnDataSource(data=dict(x=x))
    p = bokeh.plotting.figure(
        title=title, y_range=sorted(buckets, reverse=True),
        plot_width=700, plot_height=75 * len(buckets),
        x_range=(min_x, max_x),
        toolbar_location=None)

    # Evaluate every bucket's density on the shared x grid.
    ys = {bucket: gaussian_kde(raw)(x) for bucket, raw in raw_by_bucket.items()}
    # One common scale: the tallest density peak reaches 0.8 in ridge units.
    max_y = max(np.max(density) for density in ys.values())
    scale = 0.8 / max_y

    bounds_data = defaultdict(list)
    for bucket in sorted(data, reverse=True):
        bounds, _ = data[bucket]
        density = ys[bucket]
        # Close the polygon along its baseline.
        density[0] = density[-1] = 0
        source.add(ridge(bucket, density, scale=scale), bucket)
        p.patch(
            'x', bucket, source=source,
            color=colors[bucket], line_color="black", alpha=0.6)
        if bounds:
            bounds_data['buckets'].append(bucket)
            bounds_data['upper'].append(bounds[-1])
            bounds_data['lower'].append(bounds[0])

    if bounds_data:
        whisker = bokeh.models.Whisker(
            source=bokeh.models.ColumnDataSource(bounds_data),
            base="buckets", upper="upper", lower="lower",
            dimension="width", line_color="black")
        p.add_layout(whisker)

    p.y_range.range_padding = 0.4
    bokeh.io.show(p)

def ci(values, rounds=5000, alpha=0.05, n=None, agg=lambda x: x.mean(axis=1), seed=None):
    """Bootstrap a confidence interval for an aggregate of ``values``.

    ``rounds`` resamples of ``n`` values each are drawn with replacement,
    every resample is reduced to a single score by ``agg``, and the sorted
    scores are read at the ``alpha/2``, ``1/2`` and ``1 - alpha/2`` positions.

    Parameters
    ----------
    values : 1-D sequence of observations to resample.
    rounds : number of bootstrap resamples.
    alpha : total tail mass left outside the interval (0.05 gives a 95%
        interval). Must lie in ``[0, 1]``.
    n : number of values drawn per resample; defaults to ``len(values)``.
    agg : callable reducing the ``(rounds, n)`` sample matrix to one score
        per row. The default takes the mean of each row.
    seed : optional seed for a private ``numpy.random.RandomState``. When
        ``None`` (the default) the global ``numpy.random`` state is used,
        exactly as before this parameter existed.

    Returns
    -------
    ``((low, mid, high), scores)`` where ``scores`` is the sorted array of
    bootstrap scores and ``low`` / ``mid`` / ``high`` are its elements at the
    lower bound, the midpoint and the upper bound.

    Raises
    ------
    ValueError : if ``alpha`` is outside ``[0, 1]``.
    """
    if not 0 <= alpha <= 1:
        raise ValueError('alpha must be within [0, 1], got {!r}'.format(alpha))
    if n is None:
        n = len(values)
    # Only build a private generator when asked to; otherwise keep drawing
    # from numpy's global state so unseeded callers see no change.
    rng = np.random if seed is None else np.random.RandomState(seed)
    samples = rng.choice(values, size=n * rounds, replace=True).reshape(rounds, -1)
    scores = np.sort(agg(samples))
    last = rounds - 1
    # Clamp to the last valid index: with alpha == 0 the raw upper position
    # equals ``rounds``, which is one past the end of ``scores``.
    low = min(int(rounds * (alpha/2)), last)
    mid = min(int(rounds / 2), last)
    high = min(int(rounds * (1 - alpha/2)), last)
    return (scores[low], scores[mid], scores[high]), scores

def plot_ci(title, df, colors, extract, rounds=1000):
    """Bootstrap each bucket of ``df`` and plot the resulting distributions.

    Parameters
    ----------
    title : title handed to ``plot_distribution``.
    df : frame with a ``'bucket'`` column naming the bucket of each row.
    colors : dict mapping bucket -> color, handed to ``plot_distribution``.
    extract : callable that receives the rows of one bucket and returns the
        values passed to ``ci``.
    rounds : number of bootstrap rounds passed to ``ci``.

    Returns
    -------
    dict mapping each bucket to the value ``ci`` returned for it.
    """
    buckets = df['bucket'].unique()
    data = {
        bucket: ci(extract(df[df['bucket'] == bucket]), rounds=rounds)
        for bucket in sorted(buckets)
    }
    plot_distribution(title, buckets, data, colors)
    return data

Metric used
===========

The metric used here to compare different tunings of the autocomplete algorithm represents the probabilistic number of characters typed by a typical user before selecting their desired result from the autocomplete drop-down. This metric first looks at real user sessions to estimate how likely a user is to continue typing even when their result is presented in the autocomplete, conditioned on the position the result is displayed at. Individual user sessions, represented in the data as (prefix_typed, page_id_clicked), are then simulated with prefixes from length 1 to the full prefix typed. From this simulation we determine the expected number of characters typed for an individual search clickthrough.

The dataset used contains 50k clickthroughs from Oct 7 - Dec 20 in the training set, and another 50k clickthroughs from Dec 20 - Jan 7 in the test set.

The graph below shows bootstrapped probability densities for each bucket. The whiskers mark the lower and upper bounds of the 95% confidence interval.

In [None]:
import pandas as pd
import numpy as np

# Scores of the 'test' entry of the initial report and of the best report.
initial = min_report.initial_report['test'].scores
best = min_report.best_report['test'].scores

# Long-format frame: one row per score, labelled with the bucket it came from.
bucket_labels = ['initial'] * len(initial) + ['best'] * len(best)
df = pd.DataFrame({'bucket': bucket_labels, 'value': np.append(initial, best)})

colors = {'initial': 'blue', 'best': 'orange'}

# Bootstrap each bucket, plot the densities and keep the intervals for later cells.
confidence = plot_ci(
    'Mean characters typed probability density', df, colors,
    lambda x: x['value'])

In [None]:
# Upper bound of the bootstrapped interval of the 'best' bucket
# (last element of the bounds tuple returned by ci).
best_upper_bound = confidence['best'][0][-1]

# Evaluation runs whose mean test score lies below that bound.
nearby_reports = [
    report for report in min_report.evaluation_reports
    if report.scores['test'].mean < best_upper_bound
]

Mean characters typed by percentile
==============================
The following graphs show the before and after effects of tuning. This shows strong improvement, up to a full character, from percentiles 0-65. Tail queries show some decline in performance, but only slightly.

In [None]:
# Percentiles of the test scores before tuning and for the best run.
initial_percentiles = min_report.initial_report['test'].percentiles
best_percentiles = min_report.best_report['test'].percentiles

# reset_index() exposes the row index as an 'index' column, which is used
# as the y coordinate of both lines.
df = pd.DataFrame({
    'initial': initial_percentiles,
    'best': best_percentiles,
}).reset_index()

p = bokeh.plotting.figure()
p.xaxis.axis_label = 'expected characters typed'
p.yaxis.axis_label = 'session percentile'
p.line('initial', 'index', source=df, legend='initial score')
p.line('best', 'index', source=df, legend='best score', line_color='orange')
bokeh.plotting.show(p)

Per-session delta in expected characters typed
======================================

This shows, on a per-session basis, the change in the number of characters typed between the baseline scoring and the scoring after parameter tuning. This suggests up to 20% of sessions save between 1 and 6 characters typed. Another 35% save a fractional character, and 20% see no impact. Around 20% of sessions are impacted negatively; further inspection of what makes these sessions different may be useful for investigating new scoring signals.

The orange line shows the training run with the best score. The faint blue lines show other training runs that have a mean score less than the upper 95% CI of the best score.

In [None]:
percentiles = np.arange(0, 101)
best_report = min_report.best_report
# Test scores of the initial report; every delta below is taken against them.
baseline_scores = min_report.initial_report['test'].scores

p = bokeh.plotting.figure(title='expected characters typed delta')
p.xaxis.axis_label = 'expected change in characters typed'
p.yaxis.axis_label = 'session percentile'

# Faint background lines: one per nearby run, skipping the best run, which
# gets its own highlighted line below.
for report in nearby_reports:
    if report == best_report:
        continue
    score_delta = report.scores['test'].scores - baseline_scores
    score_delta_percentile = np.percentile(score_delta, percentiles)
    p.line('delta', 'percentile', alpha=0.01,
           source={'delta': score_delta_percentile, 'percentile': percentiles})

# Highlighted line for the best run.
score_delta = best_report.scores['test'].scores - baseline_scores
score_delta_percentile = np.percentile(score_delta, percentiles)
p.line('delta', 'percentile',
       color='orange', line_width=2, legend='best score',
       source={'delta': score_delta_percentile, 'percentile': percentiles})

bokeh.plotting.show(p)


Final Tuned Values
================

In [None]:
# Print every tuned variable of the best report, ordered by variable name.
tuned_variables = min_report.summary['best_report']['variables']
for k in sorted(tuned_variables):
    v = tuned_variables[k]
    if 'statement_keywords' in k:
        # not worth tuning, these are simply deboosts that should be strong enough to do the job
        continue
    print('{score:10.2f} : {name}'.format(score=v, name=k))

In [None]:
# Mean of the best report's test scores.
# NOTE(review): neither best_mean nor top_reports is referenced by any other
# visible cell -- confirm they are still needed.
best_mean = min_report.best_report['test'].mean
# Evaluation runs whose mean test score is at or below the last element of
# the 'best' bucket's bootstrapped bounds.
top_reports = [x for x in min_report.evaluation_reports if x['test'].mean <= confidence['best'][0][-1]]
# Left disabled: uncomment to list the mean test score of each selected run.
# [x['test'].mean for x in top_reports]

In [None]:
# One row per evaluation run: its mean test score plus the value that run
# used for every variable present in the best report.
evaluation_reports = min_report.evaluation_reports
columns = {'score': [x['test'].mean for x in evaluation_reports]}
for var_name in min_report.best_report.variables.keys():
    columns[var_name] = [x.variables[var_name] for x in evaluation_reports]
df = pd.DataFrame(columns)

Sensitivity of chosen parameters
===========================

To get an idea of how much influence individual parameters have on the final score, and to estimate how sensitive those variables are to small changes, the graphs below plot the sensitivity of individual parameters. This is performed by holding all variables except one as a constant, and sweeping a set of values around the chosen point. The graphs then show how the final score changes based on changes to that variable. Dots on the graphs are additionally colored by their score. A graph of a single color suggests the variable in question has a relatively small influence on the final output.

In [None]:
# bokeh.layouts is imported explicitly: gridplot() is used below, and without
# this import the cell only works if another bokeh import happens to have
# loaded the submodule already.
import bokeh.layouts
import bokeh.models
from bokeh.palettes import Viridis256

# Color scale shared by all plots: from the lowest score of the evaluation
# runs up to one standard deviation above their mean.
high = df['score'].mean() + df['score'].std()
cmap = bokeh.models.LinearColorMapper(palette=Viridis256, low=np.min(df['score']), high=high)

# One scatter plot per swept variable, ordered by variable name.
plots = []
for var_name, reports in sorted(sens_report.variable_reports.items(), key=lambda x: x[0]):
    # Value tried for this variable in each report, and that report's mean test score.
    tested_values = [r.variables[var_name] for r in reports]
    scores = [r['test'].mean for r in reports]

    p = bokeh.plotting.figure(title=var_name, height=250)
    p.circle(x='value', y='score', size=8, alpha=0.5,
             fill_color={'field': 'score', 'transform': cmap},
             source={'value': tested_values, 'score': scores})
    # Each plot is its own row, so the grid is a single column.
    plots.append([p])
grid = bokeh.layouts.gridplot(plots)
bokeh.plotting.show(grid)
