In [None]:
import sys
sys.path.append('../implementation/')
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special as sp
from tqdm import tqdm
import time
from weighted_k_nearest_neighbors import WeightedKNN
from util import flatten_list
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Underlying data --------------------------------------------------------
# Rows of the political dataset, re-indexed by a numeric id.
underlying_data = pd.read_csv('../data/political/final/political.csv')
# Ids are parsed by dropping every 'p' and reading the rest as an integer, then
# subtracting 1 (presumably turning 1-based ids into 0-based ones - confirm).
# int() already accepts leading zeros ('007' -> 7); stripping them first would
# turn an all-zero id such as 'p000' into '' and raise ValueError.
underlying_data['id'] = [int(s.replace("p", "")) - 1 for s in underlying_data['id']]
underlying_data = underlying_data.set_index('id')
output_file_path = '../output/political/political_knn_mouse.pkl'

# --- Interaction logs -------------------------------------------------------
# Both session columns are stored in the CSV as Python-literal strings; parse
# them back into Python objects column-wise instead of with a row-wise apply.
interaction_data = pd.read_csv('../data/political/final/wall_political_interactions_mouse.csv')
interaction_data['interaction_session'] = interaction_data['interaction_session'].map(ast.literal_eval)
interaction_data['interaction_type'] = interaction_data['interaction_type'].map(ast.literal_eval)

# --- Model configuration ----------------------------------------------------
# Attribute groups of the underlying data (names suggest continuous vs.
# discrete - confirm against WeightedKNN's signature).
c_attrs = ['age', 'political_experience', 'policy_strength_ban_abortion_after_6_weeks', 
           'policy_strength_legalize_medical_marijuana', 'policy_strength_increase_medicare_funding',
          'policy_strength_ban_alcohol_sales_sundays']
d_attrs = ['party', 'gender', 'occupation']
# Cut-offs k for the "is the next interaction among the k most probable points" metric.
ks = [1, 5, 10, 20, 50, 100]

In [None]:
# Replay each participant's interaction session through a fresh WeightedKNN
# model and record, per participant:
#   * 'ncp-<k>'    - fraction of steps whose actual next interaction was among
#                    the k highest-probability points predicted by the model,
#   * 'bias-<col>' - one entry per column returned by get_attribute_bias(),
#   * 'rank'       - list with the 1-based rank of each actual next interaction.
# Result dicts are collected in a list and framed once after the loop:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
participant_records = []

for participant_index, row in interaction_data.iterrows():
    print(f'Processing user {row.user}')
    results = {'participant_id': row.user}
    # Pass copies so the shared attribute lists cannot be altered by the model.
    knn_model = WeightedKNN(underlying_data, list(c_attrs), list(d_attrs))
    # Read the session from the row already in hand; re-fetching it with
    # .iloc[participant_index] treats an index label as a position and
    # rebuilds the row on every step.
    session = row.interaction_session
    predicted_rows = []
    rank_predicted = []
    for i in tqdm(range(len(session))):
        knn_model.update(session[i])

        # The last interaction has no successor to predict.
        if i < len(session) - 1:
            # predict() is used below as a pandas Series of probabilities
            # indexed by data-point id (nlargest / sort_values / .index).
            probability_of_next_point = knn_model.predict()
            next_point = session[i + 1]
            predicted_rows.append({
                k: next_point in probability_of_next_point.nlargest(k).index.values
                for k in ks
            })
            sorted_prob = probability_of_next_point.sort_values(ascending=False)
            # Exactly one position is expected to match; the unpacking raises otherwise.
            rank, = np.where(sorted_prob.index.values == next_point)
            rank_predicted.append(rank[0] + 1)

    # Sessions with fewer than two interactions yield no predictions and hence
    # no 'ncp-*' entries; those cells become NaN in the final frame.
    predicted = pd.DataFrame(predicted_rows)
    ncp = predicted.sum() / len(predicted)
    for col in ncp.index:
        results[f'ncp-{col}'] = ncp[col]

    bias = knn_model.get_attribute_bias()
    for col in bias.columns:
        results[f'bias-{col}'] = bias[col].to_numpy()

    results['rank'] = rank_predicted
    participant_records.append(results)

knn_results = pd.DataFrame(participant_records)
knn_results.to_pickle(output_file_path)

In [None]:
# Bar chart of the mean next-click-prediction score for each cut-off k,
# averaged over the rows of knn_results.
ks = [1, 5, 10, 20, 50, 100]

ncp_columns = ['ncp-' + str(k) for k in ks]
df_temp = knn_results[ncp_columns]
# Error bars: column-wise standard deviation divided by sqrt(number of rows).
err = df_temp.std().div(np.sqrt(df_temp.shape[0]))
df_temp.mean().plot.bar(
    yerr=err,
    color='#d95f02',
    alpha=0.5,
    title='Aggregate Next Click Prediction for Political Data',
)