In [1]:
import sys
# Extend the module search path; presumably this is where weighted_k_nearest_neighbors lives (verify).
sys.path.append('../implementation/')
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special as sp
from tqdm import tqdm
import time
from weighted_k_nearest_neighbors import WeightedKNN
import warnings
# NOTE(review): this suppresses every warning for the whole notebook, including
# library deprecation warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the STL Crimes point data (indexed by the 'id' column) and the recorded user interaction sessions.
underlying_data = pd.read_csv('../data/stl_crimes/dots.csv').set_index('id', drop=True)
output_file_path = '../output/stl/stl_map_results_knn_uniform_weight.pkl'

# Cut-offs used for the top-k next-click accuracy metric.
ks = [1, 5, 10, 20, 50, 100]
interaction_data = pd.read_csv('../data/stl_crimes/stl_combined_interactions.csv')
# Both session columns are read from the CSV as text; ast.literal_eval parses each
# cell back into the Python literal it encodes.
interaction_data['interaction_session'] = interaction_data['interaction_session'].map(ast.literal_eval)
interaction_data['interaction_type_session'] = interaction_data['interaction_type_session'].map(ast.literal_eval)

In [7]:
# Run the model over a single session and report its top-k next-click accuracy.
interaction_index = 35
# Look the session up once instead of re-indexing the frame on every loop iteration.
session = interaction_data.iloc[interaction_index].interaction_session

model = WeightedKNN(underlying_data, [['x','y']], ['type'], k=20)
# One record per prediction step: {k: was the actual next point among the k most probable?}
top_k_hits = []
for i in tqdm(range(len(session))):
    model.update(session[i])

    # The last interaction has no successor, so there is nothing to score after it.
    if i < len(session) - 1:
        probability_of_next_point = model.predict()
        next_point = session[i+1]
        top_k_hits.append({
            k: (next_point in probability_of_next_point.nlargest(k).index.values)
            for k in ks
        })

# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every call.
predicted = pd.DataFrame(top_k_hits)

# success rate for predicting next click in the top-k
ncp = predicted.sum()/len(predicted)
print(ncp)

print(model.get_attribute_bias())

Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 96/96 [00:01<00:00, 69.28it/s]

1      0.000000
5      0.042105
10     0.073684
20     0.178947
50     0.610526
100    0.905263
dtype: float64
    x___y  type
0     1.0   1.0
1     1.0   1.0
2     1.0   1.0
3     1.0   1.0
4     1.0   1.0
..    ...   ...
91    1.0   1.0
92    1.0   1.0
93    1.0   1.0
94    1.0   1.0
95    1.0   1.0

[96 rows x 2 columns]





In [None]:
# Plot the attribute-bias table returned by the single-session `model` defined in the cell above.
model.get_attribute_bias().plot()

In [3]:
# Evaluate a fresh WeightedKNN model on every recorded session and collect one
# result record per session (top-k accuracy, rank of the true next point, bias traces).
session_records = []

for participant_index, row in interaction_data.iterrows():
    print(f'Processing user {row.user} task {row.task}')
    results = {'participant_id': row.user, 'task': row.task}
    knn_model = WeightedKNN(underlying_data, [['x', 'y']], ['type'])
    # Read the session from the row that iterrows() yields. iterrows() returns index
    # *labels*, so feeding them to positional .iloc is only correct for a default
    # 0..n-1 index.
    session = row.interaction_session
    # One record per prediction step: {k: was the actual next point among the k most probable?}
    top_k_hits = []
    rank_predicted = []

    for i in tqdm(range(len(session))):
        knn_model.update(session[i])

        # The last interaction has no successor, so there is nothing to score after it.
        if i < len(session) - 1:
            probability_of_next_point = knn_model.predict()
            next_point = session[i+1]
            top_k_hits.append({
                k: (next_point in probability_of_next_point.nlargest(k).index.values)
                for k in ks
            })
            # 1-based position of the true next point when candidates are sorted by
            # descending probability.
            sorted_prob = probability_of_next_point.sort_values(ascending=False)
            rank, = np.where(sorted_prob.index.values == next_point)
            rank_predicted.append(rank[0] + 1)

    # Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole frame on every call.
    predicted = pd.DataFrame(top_k_hits)
    ncp = predicted.sum()/len(predicted)

    results['rank'] = rank_predicted

    for col in ncp.index:
        results[f'ncp-{col}'] = ncp[col]

    bias = knn_model.get_attribute_bias()
    for col in bias.columns:
        results[f'bias-{col}'] = bias[col].to_numpy()
    # Element-wise product of the two per-attribute bias traces.
    results['bias-mixed'] = results['bias-x___y'] * results['bias-type']

    session_records.append(results)

# One row per session; list- and array-valued entries are stored as object cells.
stl_map_results = pd.DataFrame(session_records)
stl_map_results.to_pickle(output_file_path)

Processing user 15 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 54/54 [00:01<00:00, 35.43it/s]


Processing user 14 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 46/46 [00:00<00:00, 46.86it/s]


Processing user 28 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 47/47 [00:00<00:00, 48.01it/s]


Processing user 16 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 44/44 [00:01<00:00, 41.19it/s]


Processing user 17 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 50/50 [00:01<00:00, 44.66it/s]


Processing user 13 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 31/31 [00:00<00:00, 43.85it/s]


Processing user 12 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 34/34 [00:00<00:00, 53.38it/s]


Processing user 10 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 34/34 [00:00<00:00, 41.15it/s]


Processing user 11 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 28/28 [00:00<00:00, 52.24it/s]


Processing user 9 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 29/29 [00:00<00:00, 41.40it/s]


Processing user 8 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 41/41 [00:00<00:00, 53.86it/s]


Processing user 5 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 22/22 [00:00<00:00, 56.06it/s]


Processing user 4 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 32/32 [00:00<00:00, 56.34it/s]


Processing user 6 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 31/31 [00:00<00:00, 47.37it/s]


Processing user 7 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 35/35 [00:00<00:00, 54.30it/s]


Processing user 3 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 32/32 [00:00<00:00, 49.55it/s]


Processing user 2 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 33/33 [00:00<00:00, 46.19it/s]


Processing user 1 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 28/28 [00:00<00:00, 47.30it/s]


Processing user 20 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 46/46 [00:01<00:00, 45.07it/s]


Processing user 21 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 50/50 [00:01<00:00, 42.65it/s]


Processing user 23 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 47/47 [00:01<00:00, 44.70it/s]


Processing user 22 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 49/49 [00:01<00:00, 43.11it/s]


Processing user 26 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 46/46 [00:01<00:00, 44.96it/s]


Processing user 27 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 46/46 [00:01<00:00, 34.41it/s]


Processing user 25 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 57/57 [00:01<00:00, 39.58it/s]


Processing user 19 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 47/47 [00:01<00:00, 39.00it/s]


Processing user 18 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 50/50 [00:01<00:00, 39.88it/s]


Processing user 24 task geo-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 48/48 [00:01<00:00, 39.82it/s]


Processing user 15 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 34/34 [00:00<00:00, 39.30it/s]


Processing user 14 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 39/39 [00:00<00:00, 57.28it/s]


Processing user 16 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 38/38 [00:00<00:00, 45.29it/s]


Processing user 17 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 35/35 [00:00<00:00, 58.65it/s]


Processing user 13 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 57/57 [00:00<00:00, 62.84it/s]


Processing user 12 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 121/121 [00:02<00:00, 60.26it/s]


Processing user 10 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 99/99 [00:01<00:00, 57.79it/s]


Processing user 11 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 96/96 [00:01<00:00, 62.51it/s]


Processing user 9 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 89/89 [00:01<00:00, 55.59it/s]


Processing user 8 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 94/94 [00:01<00:00, 61.73it/s]


Processing user 5 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 101/101 [00:01<00:00, 55.03it/s]


Processing user 4 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 118/118 [00:01<00:00, 62.11it/s]


Processing user 6 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 99/99 [00:01<00:00, 56.36it/s]


Processing user 7 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 108/108 [00:01<00:00, 61.64it/s]


Processing user 3 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 87/87 [00:01<00:00, 61.88it/s]


Processing user 2 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 90/90 [00:01<00:00, 61.55it/s]


Processing user 1 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 75/75 [00:01<00:00, 61.90it/s]


Processing user 20 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 47/47 [00:00<00:00, 62.31it/s]


Processing user 21 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 43/43 [00:00<00:00, 51.01it/s]


Processing user 23 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 17/17 [00:00<00:00, 62.62it/s]


Processing user 22 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 37/37 [00:00<00:00, 60.40it/s]


Processing user 26 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 38/38 [00:00<00:00, 62.33it/s]


Processing user 27 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 43/43 [00:00<00:00, 62.04it/s]


Processing user 25 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 39/39 [00:00<00:00, 59.97it/s]


Processing user 19 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 38/38 [00:00<00:00, 62.43it/s]


Processing user 18 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 40/40 [00:00<00:00, 42.80it/s]


Processing user 24 task mixed
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 48/48 [00:00<00:00, 57.29it/s]


Processing user 15 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 10/10 [00:00<00:00, 49.59it/s]


Processing user 14 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 6/6 [00:00<00:00, 61.25it/s]


Processing user 16 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 19/19 [00:00<00:00, 56.90it/s]


Processing user 17 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 11/11 [00:00<00:00, 63.85it/s]


Processing user 13 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 14/14 [00:00<00:00, 64.17it/s]


Processing user 12 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 11/11 [00:00<00:00, 62.29it/s]


Processing user 10 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 11/11 [00:00<00:00, 63.46it/s]


Processing user 11 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 5/5 [00:00<00:00, 66.09it/s]


Processing user 9 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 8/8 [00:00<00:00, 64.47it/s]


Processing user 8 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 8/8 [00:00<00:00, 67.45it/s]


Processing user 5 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 8/8 [00:00<00:00, 67.21it/s]


Processing user 4 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 6/6 [00:00<00:00, 66.48it/s]


Processing user 6 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 12/12 [00:00<00:00, 65.34it/s]


Processing user 7 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 6/6 [00:00<00:00, 66.23it/s]


Processing user 3 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 8/8 [00:00<00:00, 51.97it/s]


Processing user 2 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 12/12 [00:00<00:00, 62.03it/s]


Processing user 1 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 6/6 [00:00<00:00, 66.81it/s]


Processing user 20 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 13/13 [00:00<00:00, 63.78it/s]


Processing user 21 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 13/13 [00:00<00:00, 63.85it/s]


Processing user 23 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 16/16 [00:00<00:00, 61.18it/s]


Processing user 22 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 21/21 [00:00<00:00, 59.37it/s]


Processing user 19 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 11/11 [00:00<00:00, 61.23it/s]


Processing user 18 task type-based
Computing neighborhood matrix for x___y
Computing neighborhood matrix for type


100%|██████████| 12/12 [00:00<00:00, 54.40it/s]


In [15]:
# Display the per-session results table built by the evaluation loop.
# NOTE(review): the saved output below contains rank values of 0, which `rank[0] + 1`
# in the evaluation loop cannot produce; the output likely predates the current code.
# Re-run the notebook top to bottom to confirm.
stl_map_results

Unnamed: 0,participant_id,task,rank,ncp-1,ncp-5,ncp-10,ncp-20,ncp-50,ncp-100,bias-x___y,bias-type,bias-mixed
0,15,geo-based,"[32, 0, 2, 30, 7, 15, 17, 16, 15, 14, 15, 19, ...",0.037736,0.075472,0.113208,0.264151,0.584906,0.735849,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,14,geo-based,"[1736, 1839, 40, 4, 1, 1, 55, 58, 64, 29, 33, ...",0.066667,0.066667,0.133333,0.266667,0.577778,0.755556,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,28,geo-based,"[32, 0, 2, 22, 1, 7, 19, 24, 42, 45, 26, 29, 3...",0.021739,0.086957,0.130435,0.282609,0.717391,0.847826,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,16,geo-based,"[9, 47, 10, 2, 16, 13, 23, 30, 25, 35, 50, 37,...",0.000000,0.046512,0.093023,0.139535,0.627907,0.790698,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,17,geo-based,"[32, 0, 34, 2, 4, 12, 25, 49, 57, 63, 62, 17, ...",0.020408,0.081633,0.204082,0.346939,0.551020,0.795918,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
73,21,type-based,"[7, 5, 4, 8, 0, 0, 3, 4, 6, 3, 8, 6]",0.083333,0.500000,0.916667,0.916667,1.000000,1.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
74,23,type-based,"[7, 8, 3, 8, 0, 0, 1, 3, 2, 4, 7, 4, 6, 19, 20]",0.133333,0.533333,0.733333,0.866667,1.000000,1.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
75,22,type-based,"[0, 17, 12, 0, 1, 0, 6, 8, 7, 8, 5, 1, 10, 5, ...",0.250000,0.450000,0.650000,0.950000,1.000000,1.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
76,19,type-based,"[1, 0, 3, 8, 11, 5, 10, 8, 6, 3]",0.100000,0.500000,0.900000,1.000000,1.000000,1.000000,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [None]:
# Aggregate bias over time: one panel per task, showing the mean bias across sessions
# with a +/- 2 standard-error band.

# Configure font sizes BEFORE creating the figure: title and axis-label text objects
# take their size from rcParams when the axes are created, so a later update would
# only affect the next figure. The original dict also repeated 'xtick.labelsize';
# the second entry was meant for the y ticks.
plt.rcParams.update({'axes.titlesize': 15, 'axes.labelsize': 15, 'xtick.labelsize': 12, 'ytick.labelsize': 12})
fig, axs = plt.subplots(1, 3, sharey=True, figsize=(4*6.4, 4.8))
for ax in axs:
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_color('black')
    ax.spines['bottom'].set_color('black')
    ax.set(xlabel='Interactions Observed', ylabel='Avg. Bias')
    ax.set_ylim((0, 1.05))

# Which stored bias trace is plotted for each task.
bias_metric_per_task_knn = {'geo-based': 'bias-x___y', 'type-based': 'bias-type', 'mixed': 'bias-mixed'}
for ai, t in enumerate(['geo-based', 'type-based', 'mixed']):
    task_rows = stl_map_results[stl_map_results.task == t]
    # One column per session, aligned on interaction number; sessions shorter than the
    # longest one are padded with NaN, which std/count/mean below skip.
    bias_over_time_knn = pd.DataFrame({
        column: pd.Series(bias_trace)
        for column, bias_trace in enumerate(task_rows[bias_metric_per_task_knn[t]])
    })
    # Standard error of the mean at each step, using only the sessions that reached it.
    sems_knn = bias_over_time_knn.std(axis=1) / np.sqrt(bias_over_time_knn.count(axis=1))
    mean_knn = bias_over_time_knn.mean(axis=1)
    mean_knn.plot(ax=axs[ai], title=f'Aggregate Bias Detection for {t} Task', label='KNN', color='#1F77B4')
    axs[ai].fill_between(list(range(len(mean_knn))), mean_knn-2*sems_knn,mean_knn+2*sems_knn, color='#1F77B4', alpha=0.3, zorder=100)

# Only the first panel carries the legend; the trailing ';' keeps the Legend repr out of the cell output.
axs[0].legend();