In [1]:
import json
import sys
import pandas as pd
import ast
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
sys.path.append('../implementation/')

In [2]:
def boardrooms_data():

    # read in the data
    data_path = "../data/boardrooms/old_version/data.json"
    attributes_of_interest = ['idcompany', 'companyname', 'mktcap', 'unrelated', 'female', 'age', 'tenure', 'medianpay', 'industry']
    with open(data_path, 'r') as data_file:
        data = json.loads(data_file.read())
    data_file.close()

    data_array = np.array([tuple(row[attr] for attr in attributes_of_interest) for row in data],
                          dtype=[('idcompany', 'int'), ('companyname', 'U40'), ('mktcap', 'float'), ('unrelated', 'float'), ('female', 'float'), ('age', 'float'), ('tenure', 'float'), ('medianpay', 'float'), ('industry', 'U40')])
    df = pd.DataFrame(data_array)
    df.set_index('idcompany', drop=True, inplace=True)
    df = df.reset_index()
    return df

In [3]:
def boardrooms_ui_data():
    # read in the user interaction data
    interaction_data_path = "../data/boardrooms/old_version/feng_experiment_interaction_data/searchinvis-boardrooms-per-visit.csv"
    user_interaction_data = pd.read_csv(interaction_data_path)
    user_interaction_data = user_interaction_data.dropna(subset=['id', 'code', 'duration'])
    user_interaction_data = user_interaction_data[user_interaction_data['duration'] > 1000]
    
    underlying_data = pd.read_csv("../data/boardrooms/boardrooms_data.csv")

    group_list = [1,2,3]
    full_interactions = pd.DataFrame(columns = ['user', 'task', 'interaction_session'])
    for group in group_list:
        if group == 1:
            user_interaction_data = user_interaction_data[np.logical_and(user_interaction_data['condition'] == 'foresight', user_interaction_data['if_search_factor'] == 'search')]
        elif group == 2:
            user_interaction_data = user_interaction_data[np.logical_and(user_interaction_data['condition'] == 'foresight', user_interaction_data['if_search_factor'] == 'non-search')]
        elif group == 3:
            user_interaction_data = user_interaction_data[user_interaction_data['condition'] == 'control']
        else:
            raise Exception('group argument missing')

        interactions_by_participant = {}
        print(str(group) + ': ', len(user_interaction_data))
        for index, row in user_interaction_data.iterrows():
            id = int(row['id'])
            if id not in interactions_by_participant.keys():
                interactions_by_participant[id] = []
            comp_id = int(row['code'])
            index_id = underlying_data.index[underlying_data['idcompany'] == comp_id].tolist()[0]
            interactions_by_participant[id].append(index_id)
        for id in interactions_by_participant.keys():
            full_interactions = full_interactions.append({'user': id, 'task': group, 'interaction_session': interactions_by_participant[id]},  ignore_index = True)
        
    return full_interactions

In [5]:
df_ui = boardrooms_ui_data()
df_ui
# df_ui.to_csv("../data/boardrooms/boardrooms_combined_interactions.csv", index=False)

1:  1100
2:  0
3:  0


Unnamed: 0,user,task,interaction_session
0,1,1,"[476, 476, 474, 476, 466, 45, 27, 180, 476, 81..."
1,4,1,"[301, 180, 24, 45, 200, 460, 444, 261, 192, 18..."
2,12,1,"[45, 121, 45, 121, 45, 121, 136, 96, 45, 24, 4..."
3,16,1,"[121, 121, 27, 27, 476, 351, 180, 27, 66, 476,..."
4,28,1,"[123, 370, 381, 306, 424, 90, 476, 45, 98, 110..."
5,34,1,"[315, 428, 49, 121, 180, 45, 425]"
6,38,1,"[45, 125, 98, 208, 386, 476, 474, 434, 324, 29..."
7,39,1,"[121, 45, 121, 45]"
8,40,1,"[121, 262, 315, 428, 298, 401, 135, 273, 374, ..."
9,44,1,"[45, 211, 261, 180, 446, 175, 383, 163, 91, 53..."


In [5]:
df = boardrooms_data()
df.to_csv("../data/boardrooms/boardrooms_data.csv", index=False)