In [None]:
import sys
sys.path.append('../implementation')
import numpy as np
import pandas as pd
import ast
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.special as sp
import time
from util import flatten_list
from ottley_hidden_markov_model import HMM
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the underlying (candidate) data set and index it by a zero-based integer id.
underlying_data = pd.read_csv('../data/political/final/political.csv')
# Raw ids are expected to be strings made of a "p" plus a (possibly zero-padded)
# number -- TODO confirm against the CSV. int() already accepts leading zeros, so
# no lstrip('0') is needed; stripping turned an all-zero id into '' and crashed.
clean_id = [int(s.replace("p", "")) - 1 for s in underlying_data['id']]
underlying_data['id'] = clean_id
underlying_data = underlying_data.set_index('id')
underlying_data = underlying_data.sort_index()
output_file_path = '../output/political/political_hmm.pkl'

# Load the recorded interaction sessions. The two list-valued columns are stored
# as Python-literal strings in the CSV, so parse them back into Python objects.
interaction_data = pd.read_csv('../data/political/final/wall_political_interactions.csv')
interaction_data['interaction_session'] = interaction_data['interaction_session'].apply(ast.literal_eval)
interaction_data['interaction_type'] = interaction_data['interaction_type'].apply(ast.literal_eval)

# Attribute groups handed to the HMM (presumably continuous vs. discrete,
# judging by the c_/d_ prefixes -- verify against the HMM implementation).
c_attrs = ['age', 'political_experience', 'policy_strength_ban_abortion_after_6_weeks',
           'policy_strength_legalize_medical_marijuana', 'policy_strength_increase_medicare_funding',
           'policy_strength_ban_alcohol_sales_sundays']
d_attrs = ['party', 'gender', 'occupation']
# Cut-offs k for the top-k next-click-prediction hit rates.
ks = [1, 5, 10, 20, 50, 100]

In [None]:
# Not necessary to run if we already have a results file for the HMM.
# Replays every participant's interaction session through a fresh HMM and saves,
# per participant: top-k next-click-prediction (NCP) hit rates, the attribute
# bias reported by the model, and the rank of each actual next click.
result_records = []  # one dict per participant; converted to a DataFrame once at the end

for participant_index, row in interaction_data.iterrows():
    print(f'Processing user {row.user}')
    results = {'participant_id': row.user}
    # Fresh model per participant. Copies of the attribute lists are passed so a
    # model can never share (or mutate) the module-level lists. 1000 is forwarded
    # unchanged as the HMM's fourth constructor argument.
    hmm = HMM(underlying_data, list(c_attrs), list(d_attrs), 1000)
    # Use the row we are already iterating over instead of re-slicing the frame
    # with .iloc, which also treated the index label as a position.
    session = row.interaction_session
    top_k_hits = {k: 0 for k in ks}  # number of times the next click was in the top k
    n_predictions = 0
    rank_predicted = []
    for i in tqdm(range(len(session))):
        hmm.update(session[i])

        # The last interaction has no successor, so there is nothing to predict.
        if i < len(session) - 1:
            probability_of_next_point = hmm.predict()
            next_point = session[i + 1]
            for k in ks:
                if next_point in probability_of_next_point.nlargest(k).index.values:
                    top_k_hits[k] += 1
            n_predictions += 1
            # 1-based position of the actual next click in the descending ranking.
            sorted_prob = probability_of_next_point.sort_values(ascending=False)
            rank, = np.where(sorted_prob.index.values == next_point)
            rank_predicted.append(rank[0] + 1)

    # Fraction of predictions whose top-k contained the actual next click.
    # Sessions with fewer than two interactions yield NaN rather than a missing column.
    for k in ks:
        results[f'ncp-{k}'] = top_k_hits[k] / n_predictions if n_predictions else np.nan

    bias = hmm.get_attribute_bias()
    for col in bias.columns:
        results[f'bias-{col}'] = bias[col].to_numpy()

    results['rank'] = rank_predicted
    result_records.append(results)

# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# appending row by row is quadratic anyway.
hmm_results = pd.DataFrame(result_records)
hmm_results.to_pickle(output_file_path)

In [None]:
# Plot, for every participant, the per-attribute bias values stored by the HMM cell.
# NOTE: read_pickle can execute arbitrary code; only load result files you produced yourself.
political_results = pd.read_pickle(output_file_path)

# Style defaults must be set BEFORE the axes are created to take effect on a
# fresh kernel. The second key used to be a duplicated 'xtick.labelsize'.
plt.rcParams.update({'axes.titlesize': 15, 'axes.labelsize': 15,
                     'xtick.labelsize': 12, 'ytick.labelsize': 12})

# Size the grid from the data instead of assuming exactly 12 participants.
n_plot_cols = 2
n_participants = len(political_results)
n_plot_rows = max(1, (n_participants + n_plot_cols - 1) // n_plot_cols)
# squeeze=False keeps axs two-dimensional even when there is a single row.
fig, axs = plt.subplots(n_plot_rows, n_plot_cols, sharey=True, squeeze=False,
                        figsize=(20, 2.5 * n_plot_rows))
fig.tight_layout(pad=4)
fig.suptitle('HMM Bias Detection for Political Data', fontsize=20)
fig.text(0.5, 0.03, 'Interactions Observed', ha='center')
fig.text(0.03, 0.5, 'Bias', va='center', rotation='vertical')
for ax in axs.ravel():
    ax.set_ylim((0, 1.05))
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_color('black')
    ax.spines['bottom'].set_color('black')

columns = ['party', 'gender', 'occupation', 'age', 'political_experience', 'policy_strength_ban_abortion_after_6_weeks',
           'policy_strength_legalize_medical_marijuana', 'policy_strength_increase_medicare_funding',
           'policy_strength_ban_alcohol_sales_sundays']
# Attribute name -> name of the results column holding its bias values.
bias_metric_per_task = {attr: f'bias-bias_{attr}' for attr in columns}

last_plotted_ax = None
for ax, (_, row) in zip(axs.ravel(), political_results.iterrows()):
    # One column per attribute; wrapping in Series pads with NaN if lengths differ.
    bias_over_time = pd.DataFrame({attr: pd.Series(row[bias_metric_per_task[attr]]) for attr in columns})
    bias_over_time.plot(ax=ax, title=f'Bias Detection for {row["participant_id"]}', legend=False)
    last_plotted_ax = ax

# Hide any grid cell left over when the number of participants is odd (or zero).
for unused_ax in axs.ravel()[n_participants:]:
    unused_ax.set_visible(False)

# One shared legend for the whole figure, taken from the last populated subplot.
if last_plotted_ax is not None:
    handles, labels = last_plotted_ax.get_legend_handles_labels()
    n_cols = round(len(columns)/2)
    fig.legend(handles, labels, loc='lower center', ncol=n_cols, bbox_to_anchor=(0.5, -0.07))

In [None]:
# Aggregate next-click-prediction accuracy across participants: bar height is the
# mean top-k hit rate, error bar is the standard deviation divided by sqrt(n).
political_results = pd.read_pickle(output_file_path)
ks = [1, 5, 10, 20, 50, 100]

# Pick out the 'ncp-<k>' columns written by the HMM cell, in the order of ks.
ncp_columns = ['ncp-' + str(k) for k in ks]
df_temp = political_results[ncp_columns]
n_rows = len(df_temp)
err = df_temp.std() / np.sqrt(n_rows)
mean_ncp = df_temp.mean()
mean_ncp.plot.bar(
    yerr=err,
    color='#d95f02',
    alpha=0.5,
    title='Aggregate Next Click Prediction for Political Data',
)