In [4]:
import sys
# Extend the module search path so the project-local imports below resolve.
# The path is relative, so it depends on the kernel's working directory.
# (presumably `healey_adaboost_naive_bayes` and `util` live in ../implementation/ — verify)
sys.path.append('../implementation/')
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special as sp
from tqdm import tqdm
import time
from healey_adaboost_naive_bayes import AdaBoostNB
import warnings
from util import flatten_list
# NOTE(review): this silences every warning for the rest of the kernel session,
# including library deprecation warnings; consider narrowing the filter to the
# specific category/module that is known to be noisy.
warnings.filterwarnings('ignore')

In [5]:
# Load the boardrooms underlying data and the recorded user-interaction sessions,
# and define the evaluation configuration used by the cells below.
data_path = '../data/boardrooms/boardrooms_data.csv'
ui_data_path = '../data/boardrooms/boardrooms_combined_interactions.csv'
output_file_path = '../output/boardrooms/boardrooms_ada_nb.pkl'

underlying_data = pd.read_csv(data_path)
# Replace the textual industry label with an integer code (one code per distinct
# value, in order of first appearance) so it can be used as a discrete attribute.
underlying_data['industry_code'] = pd.factorize(underlying_data['industry'])[0]

interaction_data = pd.read_csv(ui_data_path)
# Each session is stored in the CSV as a Python-literal string; parse it back into
# a Python object. Applying column-wise avoids building a row object per record.
interaction_data['interaction_session'] = interaction_data['interaction_session'].apply(ast.literal_eval)

# Cut-offs k for the top-k "next point is among the k most probable" metric.
ks = [1, 5, 10, 20, 50, 100]
# Discrete and continuous attribute columns handed to the model.
d_attributes = ['industry_code']
c_attributes = ['mktcap', 'unrelated', 'female', 'age', 'tenure', 'medianpay']

# Keep only the modelled columns; .copy() detaches the result from the full frame.
underlying_data = underlying_data[d_attributes + flatten_list(c_attributes)].copy()

In [6]:
# Evaluate next-interaction prediction for every participant session.
#
# For each session a fresh AdaBoostNB model is updated one interaction at a time.
# After every update except the last, the model's prediction is scored against the
# interaction that actually came next:
#   * ncp-k : fraction of steps where the actual next point was among the k
#             highest-probability points (one value per k in `ks`)
#   * rank  : list of 1-based positions of the actual next point in the
#             descending-probability ordering, one entry per scored step
#
# Records are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and grows quadratically in a loop.
participant_records = []

for _, row in interaction_data.iterrows():
    print(f'Processing user {row.user} task {row.task}')
    results = {'participant_id': row.user, 'task': row.task}

    # Read the session from the row itself rather than via .iloc[index label],
    # which is only correct when the frame has a default 0..n-1 index.
    session = row.interaction_session
    last_step = len(session) - 1

    model = AdaBoostNB(underlying_data, c_attributes, d_attributes)
    hits = {k: [] for k in ks}
    rank_predicted = []

    for i in tqdm(range(len(session))):
        model.update(session[i])

        # The final interaction has no successor to score against.
        if i == last_step:
            continue

        # predict() is used as a pandas Series indexed by candidate point
        # (it is queried with .nlargest, .sort_values and .index below).
        probability_of_next_point = model.predict()
        next_point = session[i + 1]

        for k in ks:
            hits[k].append(next_point in probability_of_next_point.nlargest(k).index.values)

        sorted_prob = probability_of_next_point.sort_values(ascending=False)
        rank, = np.where(sorted_prob.index.values == next_point)
        # np.where yields 0-based positions; store a 1-based rank.
        rank_predicted.append(rank[0] + 1)

    # Sessions with fewer than two interactions have nothing to score; their
    # ncp-k entries are left out and become NaN in the final frame.
    if rank_predicted:
        for k in ks:
            results[f'ncp-{k}'] = sum(hits[k]) / len(hits[k])
    results['rank'] = rank_predicted
    participant_records.append(results)

nb_results = pd.DataFrame(participant_records)
nb_results.to_pickle(output_file_path)

Processing user 1 task 1


100%|██████████| 20/20 [00:01<00:00, 11.73it/s]


Processing user 4 task 1


100%|██████████| 14/14 [00:01<00:00, 10.68it/s]


Processing user 12 task 1


100%|██████████| 81/81 [00:08<00:00,  9.81it/s]


Processing user 16 task 1


100%|██████████| 16/16 [00:01<00:00, 11.86it/s]


Processing user 28 task 1


100%|██████████| 52/52 [00:05<00:00, 10.03it/s]


Processing user 34 task 1


100%|██████████| 7/7 [00:00<00:00, 14.70it/s]


Processing user 38 task 1


100%|██████████| 47/47 [00:04<00:00, 10.75it/s]


Processing user 39 task 1


100%|██████████| 4/4 [00:00<00:00, 14.58it/s]


Processing user 40 task 1


100%|██████████| 36/36 [00:03<00:00, 10.97it/s]


Processing user 44 task 1


100%|██████████| 16/16 [00:01<00:00, 11.21it/s]


Processing user 50 task 1


100%|██████████| 11/11 [00:00<00:00, 11.26it/s]


Processing user 57 task 1


100%|██████████| 57/57 [00:05<00:00, 10.69it/s]


Processing user 58 task 1


100%|██████████| 2/2 [00:00<00:00, 68.84it/s]


Processing user 59 task 1


100%|██████████| 26/26 [00:02<00:00, 10.95it/s]


Processing user 61 task 1


100%|██████████| 30/30 [00:02<00:00, 10.63it/s]


Processing user 63 task 1


100%|██████████| 12/12 [00:01<00:00, 11.67it/s]


Processing user 77 task 1


100%|██████████| 22/22 [00:02<00:00, 10.42it/s]


Processing user 83 task 1


100%|██████████| 35/35 [00:03<00:00, 10.97it/s]


Processing user 86 task 1


100%|██████████| 49/49 [00:04<00:00, 10.65it/s]


Processing user 91 task 1


100%|██████████| 13/13 [00:00<00:00, 13.78it/s]


Processing user 100 task 1


100%|██████████| 34/34 [00:02<00:00, 12.22it/s]


Processing user 106 task 1


100%|██████████| 21/21 [00:02<00:00, 10.01it/s]


Processing user 110 task 1


100%|██████████| 12/12 [00:00<00:00, 12.23it/s]


Processing user 117 task 1


100%|██████████| 42/42 [00:04<00:00,  8.90it/s]


Processing user 118 task 1


100%|██████████| 8/8 [00:00<00:00, 25.70it/s]


Processing user 119 task 1


100%|██████████| 25/25 [00:02<00:00, 11.97it/s]


Processing user 123 task 1


100%|██████████| 28/28 [00:02<00:00, 10.51it/s]


Processing user 130 task 1


100%|██████████| 25/25 [00:02<00:00, 11.96it/s]


Processing user 133 task 1


100%|██████████| 48/48 [00:04<00:00, 10.16it/s]


Processing user 141 task 1


100%|██████████| 79/79 [00:07<00:00, 10.23it/s]


Processing user 146 task 1


100%|██████████| 29/29 [00:02<00:00, 11.39it/s]


Processing user 157 task 1


100%|██████████| 5/5 [00:00<00:00, 18.08it/s]


Processing user 164 task 1


100%|██████████| 37/37 [00:03<00:00, 10.45it/s]


Processing user 165 task 1


100%|██████████| 49/49 [00:04<00:00, 10.29it/s]


Processing user 166 task 1


100%|██████████| 9/9 [00:00<00:00, 12.20it/s]


Processing user 177 task 1


100%|██████████| 30/30 [00:02<00:00, 11.56it/s]


Processing user 180 task 1


100%|██████████| 18/18 [00:01<00:00, 11.68it/s]


Processing user 181 task 1


100%|██████████| 2/2 [00:00<00:00, 85.76it/s]


Processing user 182 task 1


100%|██████████| 15/15 [00:01<00:00, 10.87it/s]


Processing user 183 task 1


100%|██████████| 20/20 [00:01<00:00, 11.35it/s]


Processing user 185 task 1


100%|██████████| 14/14 [00:01<00:00, 12.38it/s]
