In [1]:
import sys
sys.path.append('../implementation/')
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special as sp
from tqdm import tqdm
import time
from monadjemi_competing_models import CompetingModels
from util import flatten_list
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the boardrooms underlying data and the recorded user-interaction sessions.
data_path = '../data/boardrooms/boardrooms_data.csv'
ui_data_path = '../data/boardrooms/boardrooms_combined_interactions.csv'
output_file_path = '../output/boardrooms/boardrooms_cm.pkl'
# id_column = 'idcompany'

underlying_data = pd.read_csv(data_path)
# underlying_data.set_index(id_column, drop=True, inplace=True)

interaction_data = pd.read_csv(ui_data_path)
# The CSV stores each session as text; parse it back into a Python object.
# Applying the parser to the column itself (instead of a row-wise
# DataFrame.apply(axis=1) lambda) always yields a Series and avoids building
# an intermediate Series for every row.
interaction_data['interaction_session'] = interaction_data['interaction_session'].apply(ast.literal_eval)
# interaction_data['interaction_type_session'] = interaction_data.apply(lambda row: ast.literal_eval(row.interaction_type_session), axis=1)

# Cut-offs k at which the top-k next-interaction predictions are scored.
ks = [1, 5, 10, 20, 50, 100]
# Attribute groups handed to the model; presumably d_ = discrete and
# c_ = continuous -- TODO confirm against CompetingModels.
d_attributes = ['industry']
c_attributes = ['mktcap', 'unrelated', 'female', 'age', 'tenure', 'medianpay']
# Keep only the modelled columns; .copy() detaches the result from the raw frame.
underlying_data = underlying_data[d_attributes + flatten_list(c_attributes)].copy()

In [None]:
# Replay every recorded session through a fresh CompetingModels instance and,
# after each interaction, score how well the model predicts the next one.
# One result record is produced per row of `interaction_data`; the records are
# turned into a DataFrame once at the end (DataFrame.append was removed in
# pandas 2.0 and is quadratic when called inside a loop).
result_records = []
for participant_index, row in interaction_data.iterrows():
    print(f'Processing user {row.user} task {row.task}')
    results = {'participant_id': row.user, 'task': row.task}
    # Reuse the attribute lists from the configuration cell so the model and
    # the column selection of `underlying_data` cannot drift apart.
    competing_models = CompetingModels(underlying_data, c_attributes, d_attributes)

    # Read the session from the row yielded by iterrows(); indexing with
    # .iloc[participant_index] is only correct for a default RangeIndex.
    session = row.interaction_session
    last_position = len(session) - 1
    hits_at_k = {k: 0 for k in ks}  # number of times the next point was in the top k
    rank_predicted = []             # 1-based rank of the true next point, per prediction

    for i, interaction in enumerate(tqdm(session)):
        competing_models.update(interaction)

        # The final interaction has no successor to predict.
        if i < last_position:
            probability_of_next_point = competing_models.predict()
            next_point = session[i + 1]
            for k in ks:
                hits_at_k[k] += (next_point in probability_of_next_point.nlargest(k).index.values)
            sorted_prob = probability_of_next_point.sort_values(ascending=False)
            # Position of the true next point in the descending ranking;
            # rank[0] raises IndexError if the point is absent from the predictions.
            rank, = np.where(sorted_prob.index.values == next_point)
            rank_predicted.append(rank[0] + 1)

    n_predictions = len(rank_predicted)
    results['rank'] = rank_predicted
    for k in ks:
        # Fraction of predictions whose top-k set contained the true next point.
        # A session with a single interaction yields no predictions -> NaN.
        results[f'ncp-{k}'] = hits_at_k[k] / n_predictions if n_predictions else np.nan

    bias = competing_models.get_attribute_bias()
    for col in bias.columns:
        results[f'bias-{col}'] = bias[col].to_numpy()

    posterior = competing_models.get_model_posterior()
    for col in posterior.columns:
        results[f'posterior-{col}'] = posterior[col].to_numpy()

    result_records.append(results)

vast_results = pd.DataFrame(result_records)
vast_results.to_pickle(output_file_path)

Processing user 1 task 1
128 competing models enumerated.


  0%|          | 0/20 [00:00<?, ?it/s]

In [4]:
# Display the aggregated results frame (one row per processed row of interaction_data).
vast_results

Unnamed: 0,bias-age,bias-female,bias-industry,bias-medianpay,bias-mktcap,bias-tenure,bias-unrelated,ncp-1,ncp-10,ncp-100,...,posterior-competing_model__unrelated___female___tenure___medianpay,posterior-competing_model__unrelated___female___tenure___medianpay___industry,posterior-competing_model__unrelated___industry,posterior-competing_model__unrelated___medianpay,posterior-competing_model__unrelated___medianpay___industry,posterior-competing_model__unrelated___tenure,posterior-competing_model__unrelated___tenure___industry,posterior-competing_model__unrelated___tenure___medianpay,posterior-competing_model__unrelated___tenure___medianpay___industry,task
0,"[0.5, 0.4196759563342905, 0.21224218438253123,...","[0.5, 0.21148524611407685, 0.03787461025127298...","[0.5, 0.4449276873404098, 0.7371187916743889, ...","[0.5, 0.45541998322599025, 0.23427926607347868...","[0.5, 0.005995070616665738, 1.4972426520466874...","[0.5, 0.2409215631571678, 0.06690556992470086,...","[0.5, 0.36424978500714605, 0.14190039266838989...",0.0,0.0,0.210526,...,"[0.0078125, 0.004451867559693948, 0.0003361101...","[0.0078125, 0.002649921166484495, 0.0002165211...","[0.0078125, 0.017548504811751183, 0.0094954320...","[0.0078125, 0.03224947653673413, 0.01763772310...","[0.0078125, 0.01919611698615121, 0.01136216959...","[0.0078125, 0.010949974435453659, 0.0020334026...","[0.0078125, 0.006517841925865254, 0.0013099120...","[0.0078125, 0.011495674600631662, 0.0022411251...","[0.0078125, 0.006842663452756921, 0.0014437262...",1.0


In [13]:
# Inspect the 'mktcap' column as a NumPy array; the double brackets select a
# one-column DataFrame, so the result is 2-D with shape (n_rows, 1).
# NOTE(review): this echoes the entire array into the notebook output --
# consider slicing (e.g. [:5]) before sharing.
underlying_data[['mktcap']].to_numpy()

array([[ 96796.],
       [ 66771.],
       [ 97349.],
       [ 70195.],
       [ 36778.],
       [ 25340.],
       [ 44223.],
       [  5614.],
       [ 14528.],
       [  7476.],
       [ 40023.],
       [  9785.],
       [ 27204.],
       [ 12513.],
       [  7517.],
       [ 29878.],
       [  7178.],
       [ 10825.],
       [ 11699.],
       [ 39803.],
       [  6250.],
       [121575.],
       [ 18264.],
       [ 23966.],
       [498547.],
       [ 15915.],
       [118562.],
       [293398.],
       [ 10598.],
       [ 29133.],
       [ 27804.],
       [ 72105.],
       [ 78006.],
       [ 43301.],
       [ 20188.],
       [ 20865.],
       [ 13275.],
       [119319.],
       [ 16761.],
       [ 33984.],
       [ 18858.],
       [ 36327.],
       [ 26130.],
       [ 17814.],
       [  6124.],
       [666252.],
       [ 20134.],
       [ 27247.],
       [  5447.],
       [206153.],
       [ 12483.],
       [ 40474.],
       [  6990.],
       [ 23912.],
       [ 33983.],
       [ 2