In [None]:
import sys
sys.path.append('../implementation/')
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special as sp
from tqdm import tqdm
import time
from monadjemi_competing_models import CompetingModels
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the STL Crimes underlying data (indexed by its `id` column) and the
# recorded user interaction data.
underlying_data = pd.read_csv('../data/stl_crimes/dots.csv').set_index('id')
# Where the all-sessions results are pickled.
output_file_path = '../output/stl/stl_map_results_competing_models.pkl'
# Values of k used for the top-k next-click prediction accuracy.
ks = [1, 5, 10, 20, 50, 100]
interaction_data = pd.read_csv('../data/stl_crimes/stl_combined_interactions.csv')
# The two session columns are read from the CSV as strings holding Python
# literals; parse them back into Python objects. Series.map parses the column
# directly instead of materialising a full row Series per record as
# DataFrame.apply(axis=1) does.
interaction_data['interaction_session'] = interaction_data['interaction_session'].map(ast.literal_eval)
interaction_data['interaction_type_session'] = interaction_data['interaction_type_session'].map(ast.literal_eval)

In [None]:
# Replay one session through the competing-models framework. After each observed
# interaction, record for every k in `ks` (defined in the data-loading cell)
# whether the actual next click is among the k most probable predicted points.
interaction_index = 35  # positional index (iloc) of the session to inspect

competing_models = CompetingModels(underlying_data, [['x','y']], ['type'])
session = interaction_data.iloc[interaction_index].interaction_session
prediction_records = []
for i, interaction in enumerate(tqdm(session)):
    competing_models.update(interaction)

    # The final interaction has no successor, so there is nothing to score.
    if i < len(session) - 1:
        probability_of_next_point = competing_models.predict()
        next_point = session[i + 1]
        prediction_records.append({
            k: next_point in probability_of_next_point.nlargest(k).index.values
            for k in ks
        })

# Build the frame once from the collected records: DataFrame.append was removed
# in pandas 2.0, and appending row by row copies the whole frame on every step.
predicted = pd.DataFrame(prediction_records)
    

In [None]:
# Plot the model posterior tracked by `competing_models` for the inspected
# session, using the session's task as the figure title.
competing_models.get_model_posterior().plot(
    title=f'{interaction_data.iloc[interaction_index].task}',
    alpha=0.5,
    lw=4,
)

In [None]:
# Plot the per-attribute bias tracked by `competing_models` for the inspected
# session, using the session's task as the figure title.
competing_models.get_attribute_bias().plot(
    title=f'{interaction_data.iloc[interaction_index].task}',
    alpha=0.5,
    lw=4,
)

In [None]:
# Success rate of next-click prediction: for each k, the share of recorded steps
# in which the actual next click was among the top-k predicted points.
ncp = predicted.sum().div(len(predicted))
ncp

In [None]:
# This block runs competing models on all sessions and records the outputs.
# The results are saved to a file, so this block only needs to be run when new results are required.
#
# For every session it stores: the participant id and task, the 1-based rank of
# each actual next click among the predicted probabilities, the top-k success
# rate for each k in `ks`, and the bias / model-posterior columns reported by
# the model after the whole session has been observed.

session_results = []
for participant_index, row in interaction_data.iterrows():
    print(f'Processing user {row.user} task {row.task}')
    results = {'participant_id': row.user, 'task': row.task}
    competing_models = CompetingModels(underlying_data, [['x','y']], ['type'])
    # Use the row yielded by iterrows() directly. Looking it up again through
    # .iloc[participant_index] treats an index *label* as a *position*, which is
    # only correct while interaction_data has a default RangeIndex.
    session = row.interaction_session
    prediction_records = []
    rank_predicted = []

    for i, interaction in enumerate(tqdm(session)):
        competing_models.update(interaction)

        # The final interaction has no successor, so there is nothing to score.
        if i < len(session) - 1:
            probability_of_next_point = competing_models.predict()
            next_point = session[i + 1]
            prediction_records.append({
                k: next_point in probability_of_next_point.nlargest(k).index.values
                for k in ks
            })
            # 1-based position of the actual next click when the predictions
            # are sorted from most to least probable.
            sorted_prob = probability_of_next_point.sort_values(ascending=False)
            rank, = np.where(sorted_prob.index.values == next_point)
            rank_predicted.append(rank[0] + 1)

    # Build the frame once per session: DataFrame.append was removed in
    # pandas 2.0, and appending row by row copies the frame on every step.
    predicted = pd.DataFrame(prediction_records)
    ncp = predicted.sum()/len(predicted)
    results['rank'] = rank_predicted

    for col in ncp.index:
        results[f'ncp-{col}'] = ncp[col]

    bias = competing_models.get_attribute_bias()
    for col in bias.columns:
        results[f'bias-{col}'] = bias[col].to_numpy()

    posterior = competing_models.get_model_posterior()
    for col in posterior.columns:
        results[f'posterior-{col}'] = posterior[col].to_numpy()

    session_results.append(results)

# One row per session; list/array values are kept as single object cells.
stl_map_results = pd.DataFrame(session_results)
stl_map_results.to_pickle(output_file_path)

In [None]:
# Load results if already saved.
# Read from `output_file_path`, the same path the results are written to. The
# previously hard-coded '../output/stl_map_results_competing_models.pkl' lacked
# the 'stl/' sub-directory and so did not point at the saved file.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.

stl_map_results = pd.read_pickle(output_file_path)

In [None]:
# Aggregate bias detection: for each task, plot the mean of the task's bias
# metric across sessions against the number of interactions observed, with a
# band of +/- 2 standard errors.

# Set the font sizes before creating the axes, so the figure does not depend on
# rcParams left over from an earlier run (axis-label text is sized when the axes
# are created). The original dict listed 'xtick.labelsize' twice; the second
# entry is taken to be 'ytick.labelsize'.
plt.rcParams.update({'axes.titlesize': 15, 'axes.labelsize': 15, 'xtick.labelsize': 12, 'ytick.labelsize': 12})
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(4*6.4, 4.8))
for ax in axs:
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_color('black')
    ax.spines['bottom'].set_color('black')
    ax.set(xlabel='Interactions Observed', ylabel='Avg. Bias')
    ax.set_ylim((0, 1.05))

# Results column used as the bias signal for each task.
bias_metric_per_task = {'geo-based': 'bias-x___y', 'type-based': 'bias-type', 'mixed': 'posterior-competing_model__x___y___type'}
for ai, t in enumerate(['geo-based', 'type-based', 'mixed']):
    task_metric_values = stl_map_results.loc[stl_map_results.task == t, bias_metric_per_task[t]]
    # One column per session, built in a single step instead of concatenating
    # inside the loop. Sessions of different lengths are aligned on the
    # interaction number and padded with NaN.
    bias_over_time = pd.DataFrame({
        column: pd.Series(values) for column, values in enumerate(task_metric_values)
    })
    # Standard error over the sessions that are still running at each step.
    sems = bias_over_time.std(axis=1) / np.sqrt(bias_over_time.count(axis=1))
    mean = bias_over_time.mean(axis=1)
    mean.plot(ax=axs[ai], title=f'Aggregate Bias Detection for {t} Task', label='Competing Models', color='#d95f02')
    axs[ai].fill_between(mean.index, mean - 2*sems, mean + 2*sems, color='#d95f02', alpha=0.3, zorder=100)
    
    

In [None]:
# Aggregate next-click prediction: for each task, bar-plot the mean top-k
# success rate across sessions for every k, with standard-error bars.

# Set the font sizes before creating the axes, so the figure does not depend on
# rcParams left over from an earlier run (axis-label text is sized when the axes
# are created). The original dict listed 'xtick.labelsize' twice; the second
# entry is taken to be 'ytick.labelsize'.
plt.rcParams.update({'axes.titlesize': 15, 'axes.labelsize': 15, 'xtick.labelsize': 12, 'ytick.labelsize': 12})
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(4*6.4, 4.8))
for ax in axs:
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_color('black')
    ax.spines['bottom'].set_color('black')
    ax.set(xlabel='k', ylabel='Avg. Accuracy')
    ax.set_ylim((0, 1.05))

ks = [1, 5, 10, 20, 50, 100]
for ai, t in enumerate(['geo-based', 'type-based', 'mixed']):
    df_temp = stl_map_results.loc[stl_map_results.task == t, [f'ncp-{k}' for k in ks]]
    # Standard error of the mean. Divide by the number of non-null sessions per
    # column (same as len(df_temp) when nothing is missing), matching how the
    # bias plot computes its standard error.
    err = df_temp.std() / np.sqrt(df_temp.count())
    df_temp.mean().plot.bar(yerr=err, ax=axs[ai], color='#d95f02', alpha=0.5, title=f'Aggregate Next Click Prediction for {t} Task')

    

In [None]:
# Display the results table (computed or loaded from the pickle in the cells above).
stl_map_results