In [160]:
import pandas as pd

In [161]:
# Load the data
# Directory holding the exported server data. The trailing slash matters:
# file paths below are built by plain string concatenation (DATA_DIR + filename).
DATA_DIR = '../data/server_data/'

## Participant data

In [208]:
# Load and clean the participant log in a single chained expression so that
# every step produces a fresh frame. (Assigning a column on the result of a
# column selection in a separate statement is the pattern that triggers
# pandas' SettingWithCopyWarning.)
participant_log = (
    pd.read_csv(DATA_DIR + 'participant_log.csv')
    # rename columns to lowercase and replace spaces with underscores
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    # keep only columns pid, session_id, scenario
    .loc[:, ['pid', 'session_id', 'scenario']]
    # if pid is NaN, replace it with the value of the previous row
    # (presumably the log leaves pid blank on a participant's follow-up
    # session rows -- verify against the raw file)
    .assign(pid=lambda log: log['pid'].ffill())
)

In [209]:
# Distinct session_ids found in the participant log; load_simulation_data uses
# this array to drop rows from sessions that are not in the log.
participant_session_ids = participant_log['session_id'].unique()

In [210]:
# Inspect the cleaned log: one row per session_id, with pid forward-filled.
participant_log

Unnamed: 0,pid,session_id,scenario
0,1.0,58665f7d-5484-4769-aef6-741a40c581c8,Hotel
1,2.0,86a7af6a-cf1f-4afa-b2d8-48fb7394892f,Hotel
2,2.0,750943b6-a490-4ee2-9582-ebc75ec103df,
3,3.0,5cf80c68-af61-4ea9-b7ea-0f88ecf0c932,Hotel
4,3.0,f0a679ee-d321-417c-8504-8ccacd284aaf,
5,4.0,873af403-ed6c-4d77-b037-ebb30f1a596e,Airlines
6,5.0,e0bbf5be-a122-419d-994d-a273f4a37fde,Airlines
7,6.0,55238f70-fc6f-457b-8d3f-8a1679dc77e3,Airlines
8,6.0,4b8bb159-88fd-4a12-b72b-966981d385ef,
9,7.0,af1f7c73-82fb-402e-b5af-6657aba0f647,Hotel


## Simulation data

In [211]:
def load_simulation_data(filename):
    """Read one tab-separated export and keep only the participants' sessions.

    Parameters
    ----------
    filename : str
        File name inside DATA_DIR (joined by string concatenation).

    Returns
    -------
    pandas.DataFrame
        The rows whose 'session_id' appears in `participant_session_ids`.
    """
    table = pd.read_csv(DATA_DIR + filename, sep='\t')
    from_logged_session = table['session_id'].isin(participant_session_ids)
    return table[from_logged_session]

# Load every simulation table, each restricted to the sessions listed in the
# participant log (see load_simulation_data).
(
    chat_client_info,
    chat_history,
    chat_in_task,
    chat_post_task,
    chat_pre_task,
) = map(
    load_simulation_data,
    (
        'chat_client_info.tsv',
        'chat_history.tsv',
        'chat_in_task.tsv',
        'chat_post_task.tsv',
        'chat_pre_task.tsv',
    ),
)

In [212]:
## Get the rows of df whose session_id belongs to the given pid in participant_log
def get_data_by_pid(df, pid):
    """Return the rows of `df` recorded in any session of participant `pid`.

    The participant's session_ids are looked up in the notebook-level
    `participant_log`; `df` must have a 'session_id' column.
    """
    is_this_pid = participant_log['pid'] == pid
    pid_session_ids = participant_log.loc[is_this_pid, 'session_id']
    in_pid_session = df['session_id'].isin(pid_session_ids)
    return df[in_pid_session]

## Get data given PID and client number
def get_data_by_pid_client(df, pid, client_num):
    """Return the rows of `df` for one client of participant `pid`.

    `client_num` is a positional index into the participant's distinct
    client_ids, taken from `chat_client_info` via `.unique()`.
    """
    pid_client_ids = get_data_by_pid(chat_client_info, pid)['client_id'].unique()
    rows_for_pid = get_data_by_pid(df, pid)
    wanted_client = pid_client_ids[client_num]
    is_wanted_client = rows_for_pid['client_id'] == wanted_client
    return rows_for_pid[is_wanted_client]

## Get client turn count
def get_client_turn_count(pid):
    """Return the highest in-task turn number reached with each client of `pid`.

    Parameters
    ----------
    pid : number
        Participant id as it appears in `participant_log['pid']`.

    Returns
    -------
    dict
        Maps each of the participant's client_ids (from `chat_client_info`) to
        the maximum 'turn_number' found for that client in `chat_in_task`.
        The value is NaN when the client has no rows in `chat_in_task`.
    """
    # Derive the client list from `pid` here instead of reading a notebook-level
    # `client_list`: that global is only assigned in later cells and is
    # overwritten for every participant that gets inspected, so relying on it
    # breaks on a fresh kernel and can silently pair the wrong clients with `pid`.
    pid_client_ids = get_data_by_pid(chat_client_info, pid)['client_id'].unique()
    in_task_rows = get_data_by_pid(chat_in_task, pid)

    turn_count = {}
    for client_id in pid_client_ids:
        is_client = in_task_rows['client_id'] == client_id
        turn_count[client_id] = in_task_rows.loc[is_client, 'turn_number'].max()
    return turn_count

## Replace old_client_id with new_client_id in df's client_id column (modifies df in place)
def replace_client_id(df, old_client_id, new_client_id):
    """Relabel one client_id as another.

    Every row of `df` whose 'client_id' equals `old_client_id` is given
    `new_client_id` instead. `df` is modified in place and also returned.
    """
    rows_to_relabel = df['client_id'].eq(old_client_id)
    df.loc[rows_to_relabel, 'client_id'] = new_client_id
    return df

## Join participant_log with all data by session_id
def join_to_participant_data(participant_log, data):
    """Attach each row's participant id to `data`.

    Only the 'pid' and 'session_id' columns of `participant_log` are used.
    The result is an inner join on 'session_id', so rows of `data` whose
    session is not in the log are dropped.
    """
    pid_by_session = participant_log.loc[:, ['pid', 'session_id']]
    return pid_by_session.merge(data, how='inner', on='session_id')

### Special cases

Due to technical difficulties or other reasons, some participants have multiple session_ids or other issues. We will address these cases below.

**Anne T. (pid 2) is a special case, as she has two different session_ids**
- Pre-task should only keep the session_id: 86a7af6a-cf1f-4afa-b2d8-48fb7394892f
    - Skip:  750943b6-a490-4ee2-9582-ebc75ec103df
- Only 3 clients are in the task
- Post-task only contains one response

The data can be used for in-task comparisons, but not for pre/post-task comparisons, because the participant did not complete a post-task survey for the civil=0, emo=0 condition.

In [213]:
# Client records for participant 2 (across all of their sessions) and the
# distinct client_ids among them; both names are reused by the cells below.
client_info = get_data_by_pid(chat_client_info,2)
client_list = client_info['client_id'].unique()
print(client_list)

['e4c843d6-e047-4f71-b842-a86de1b1f4bd'
 'a5982689-193e-4045-93d8-4a5ffae21d10'
 '604b7169-251b-49d5-a848-7401ef63bce8'
 '05d35294-87cd-4434-9f59-c4df07201290'
 'a7b0770b-84cc-4b0f-a5e0-d52db0011d33']


In [214]:
# Name and condition flags (civil / emo) of each client seen by participant 2.
client_info[['client_id','client_name','civil','emo']]

Unnamed: 0,client_id,client_name,civil,emo
45,e4c843d6-e047-4f71-b842-a86de1b1f4bd,Elijah P.,1.0,0.0
46,a5982689-193e-4045-93d8-4a5ffae21d10,Jamal K.,0.0,0.0
47,604b7169-251b-49d5-a848-7401ef63bce8,Anna Z.,0.0,1.0
48,05d35294-87cd-4434-9f59-c4df07201290,Luis H.,0.0,1.0
49,a7b0770b-84cc-4b0f-a5e0-d52db0011d33,Samantha K.,0.0,1.0


In [215]:
# Pre-task survey rows for participant 2 -- one per session_id.
get_data_by_pid(chat_pre_task,2)

Unnamed: 0,_id,interaction_polite,interaction_dignity,interaction_respect,cognitive_demands,cognitive_resources,affect_valence,affect_arousal,client_param,session_id,timestamp
11,66a93d46069ba571d342548f,1,1,1,4,4,1,1,name=Elijah%20P.&domain=hotel&category=Service...,86a7af6a-cf1f-4afa-b2d8-48fb7394892f,2024-07-30 19:21:42.195
12,66a941203e2a64e72bcda6e1,2,2,1,4,4,1,1,name=Jamal%20K.&domain=hotel&category=Policy&g...,750943b6-a490-4ee2-9582-ebc75ec103df,2024-07-30 19:38:08.761


In [216]:
### Remove row where session_id is '750943b6-a490-4ee2-9582-ebc75ec103df'
### (participant 2's second session; only the pre-task response from
### session 86a7af6a-cf1f-4afa-b2d8-48fb7394892f is kept)
chat_pre_task = chat_pre_task[chat_pre_task['session_id']!='750943b6-a490-4ee2-9582-ebc75ec103df']

In [217]:
# Highest in-task turn_number reached with each of participant 2's clients.
turn_count = get_client_turn_count(2)
# Keep only the clients whose highest turn_number is greater than 2.
keep_clients = [x for x in client_list if turn_count[x]>2]

In [218]:
# Clients of participant 2 that passed the turn threshold.
keep_clients

['e4c843d6-e047-4f71-b842-a86de1b1f4bd',
 '604b7169-251b-49d5-a848-7401ef63bce8',
 '05d35294-87cd-4434-9f59-c4df07201290']

In [219]:
# Spot check: highest in-task turn_number for the client at index 1 of
# participant 2's client list (a client that is not in keep_clients).
get_data_by_pid_client(chat_in_task,2,1).turn_number.max()

1

In [220]:
# Post-task survey rows for participant 2.
get_data_by_pid(chat_post_task,2)

Unnamed: 0,_id,interaction_polite,interaction_dignity,interaction_respect,cognitive_demands,cognitive_resources,affect_valence,affect_arousal,support_effective,support_helpful,support_beneficial,support_adequate,support_sensitive,support_caring,support_understanding,support_supportive,client_id,session_id,timestamp
30,66a944b83e2a64e72bcda6f0,0,1,0,5,5,-1,1,2,2,2,2,2,2,2,2,604b7169-251b-49d5-a848-7401ef63bce8,750943b6-a490-4ee2-9582-ebc75ec103df,2024-07-30 19:53:28.353


**Joey J. (pid 3) is a special case, as he has two different session_ids**
- Pre-task should only keep the session_id: 5cf80c68-af61-4ea9-b7ea-0f88ecf0c932 
    - Skip:  f0a679ee-d321-417c-8504-8ccacd284aaf

Can use data for all comparisons

In [221]:
# Client records for participant 3 (across all of their sessions) and the
# distinct client_ids among them; both names are reused by the cells below.
client_info = get_data_by_pid(chat_client_info, 3)
client_list = client_info['client_id'].unique()
print(client_list)

['d02e2e69-cab5-4f8a-a4b5-9412506211a2'
 '2b0cfa37-6abd-41cc-8476-dd07742d613a'
 '88499c85-af2a-4bb6-80d2-f6bc6dc7d8d1'
 '543af661-e1c5-453e-8cab-61aada59798a'
 '2ed7e44f-4e28-40be-9fae-b086004ffc4d'
 '917450bb-a7a2-4d2d-b466-9df647ace933'
 '06eca6b1-25da-41f9-8e51-524e8f9ebe76'
 '594bb532-7154-4f2b-ac68-2c0bb9601f7d']


In [222]:
# Highest in-task turn_number reached with each of participant 3's clients.
turn_count = get_client_turn_count(3)
# Keep only the clients whose highest turn_number is greater than 2.
keep_clients = [x for x in client_list if turn_count[x]>2]

In [223]:
# Clients of participant 3 that passed the turn threshold.
keep_clients

['2b0cfa37-6abd-41cc-8476-dd07742d613a',
 '543af661-e1c5-453e-8cab-61aada59798a',
 '06eca6b1-25da-41f9-8e51-524e8f9ebe76',
 '594bb532-7154-4f2b-ac68-2c0bb9601f7d']

In [224]:
# Session, name and condition flags of participant 3's kept clients.
client_info[client_info['client_id'].isin(keep_clients)][['session_id','client_id','client_name','civil','emo']]

Unnamed: 0,session_id,client_id,client_name,civil,emo
51,5cf80c68-af61-4ea9-b7ea-0f88ecf0c932,2b0cfa37-6abd-41cc-8476-dd07742d613a,Anna Z.,1.0,0.0
53,5cf80c68-af61-4ea9-b7ea-0f88ecf0c932,543af661-e1c5-453e-8cab-61aada59798a,Luis H.,1.0,0.0
56,f0a679ee-d321-417c-8504-8ccacd284aaf,06eca6b1-25da-41f9-8e51-524e8f9ebe76,Luis H.,0.0,0.0
57,f0a679ee-d321-417c-8504-8ccacd284aaf,594bb532-7154-4f2b-ac68-2c0bb9601f7d,Jamal K.,0.0,1.0


In [225]:
# Pre-task survey rows for participant 3 -- one per session_id.
get_data_by_pid(chat_pre_task,3)

Unnamed: 0,_id,interaction_polite,interaction_dignity,interaction_respect,cognitive_demands,cognitive_resources,affect_valence,affect_arousal,client_param,session_id,timestamp
13,66a974363e2a64e72bcda6fd,1,1,1,4,3,1,1,name=Anna%20Z.&domain=hotel&category=Product%2...,5cf80c68-af61-4ea9-b7ea-0f88ecf0c932,2024-07-30 23:16:06.115
14,66a97c45069ba571d34254ca,2,2,1,4,3,2,2,name=Samantha%20K.&domain=airlines&category=Pr...,f0a679ee-d321-417c-8504-8ccacd284aaf,2024-07-30 23:50:29.107


In [226]:
### Remove row where session_id is 'f0a679ee-d321-417c-8504-8ccacd284aaf'
### (participant 3's second session; only the pre-task response from
### session 5cf80c68-af61-4ea9-b7ea-0f88ecf0c932 is kept)
chat_pre_task = chat_pre_task[chat_pre_task['session_id']!='f0a679ee-d321-417c-8504-8ccacd284aaf']

In [227]:
# Post-task survey rows for participant 3 (note the repeated client_id 543af661-...).
get_data_by_pid(chat_post_task,3)

Unnamed: 0,_id,interaction_polite,interaction_dignity,interaction_respect,cognitive_demands,cognitive_resources,affect_valence,affect_arousal,support_effective,support_helpful,support_beneficial,support_adequate,support_sensitive,support_caring,support_understanding,support_supportive,client_id,session_id,timestamp
31,66a97b73069ba571d34254c9,2,3,2,3,4,2,1,2,2,2,2,1,1,2,2,543af661-e1c5-453e-8cab-61aada59798a,5cf80c68-af61-4ea9-b7ea-0f88ecf0c932,2024-07-30 23:46:59.919
32,66a97b9d3e2a64e72bcda704,2,3,2,3,4,2,1,2,2,2,2,1,1,2,2,543af661-e1c5-453e-8cab-61aada59798a,5cf80c68-af61-4ea9-b7ea-0f88ecf0c932,2024-07-30 23:47:41.709
33,66a98013069ba571d34254e1,-1,-2,-2,4,2,-1,-1,2,2,2,1,1,1,2,1,06eca6b1-25da-41f9-8e51-524e8f9ebe76,f0a679ee-d321-417c-8504-8ccacd284aaf,2024-07-31 00:06:43.276
34,66a983eb3e2ed7be7c7e789a,-1,-1,-1,5,2,-1,-1,2,2,1,1,1,1,1,1,594bb532-7154-4f2b-ac68-2c0bb9601f7d,f0a679ee-d321-417c-8504-8ccacd284aaf,2024-07-31 00:23:07.348


**Brittany (pid 6) is a special case, as she has two different session_ids**
- Pre-task should only keep the session_id: 55238f70-fc6f-457b-8d3f-8a1679dc77e3
    - Skip:  4b8bb159-88fd-4a12-b72b-966981d385ef
- Post-task response for the client_id: 0571ce3a-d4b7-430d-8c70-f2ac6c6f42da
    - Is stored under client_id 31633a46-f92e-4ecb-be08-580fb09eb876 (relabelled below)

In [228]:
# Client records for participant 6 (across all of their sessions) and the
# distinct client_ids among them; both names are reused by the cells below.
client_info = get_data_by_pid(chat_client_info, 6)
client_list = client_info['client_id'].unique()
print(client_list)

['c8862574-9c18-4d38-adfc-00b04ace045c'
 '3e1f2059-4407-494c-be55-6eac76299913'
 '7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8'
 '5ff55094-5811-4d9d-99db-58e1091cced6'
 'dd4c4e5e-9afc-4461-9187-7f5a2274aff1'
 'b46d9268-9197-4779-bee8-485462440462'
 '54cd52c1-bbe3-41c8-9a30-6c58801a2b0d'
 'ddb7b6c3-f8d8-471d-b60c-60500075f592'
 '0571ce3a-d4b7-430d-8c70-f2ac6c6f42da'
 '31633a46-f92e-4ecb-be08-580fb09eb876']


In [229]:
# Highest in-task turn_number reached with each of participant 6's clients.
turn_count = get_client_turn_count(6)
# Keep only the clients whose highest turn_number is greater than 2.
keep_clients = [x for x in client_list if turn_count[x]>2]

In [230]:
# Clients of participant 6 that passed the turn threshold.
keep_clients

['c8862574-9c18-4d38-adfc-00b04ace045c',
 '3e1f2059-4407-494c-be55-6eac76299913',
 '7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8',
 '5ff55094-5811-4d9d-99db-58e1091cced6',
 '0571ce3a-d4b7-430d-8c70-f2ac6c6f42da']

In [231]:
# Session, name and condition flags of participant 6's kept clients.
client_info[client_info['client_id'].isin(keep_clients)][['session_id','client_id','client_name','civil','emo']]

Unnamed: 0,session_id,client_id,client_name,civil,emo
76,55238f70-fc6f-457b-8d3f-8a1679dc77e3,c8862574-9c18-4d38-adfc-00b04ace045c,Samantha K,1.0,0.0
77,55238f70-fc6f-457b-8d3f-8a1679dc77e3,3e1f2059-4407-494c-be55-6eac76299913,Jamal K,1.0,0.0
78,55238f70-fc6f-457b-8d3f-8a1679dc77e3,7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8,Elijah P,0.0,0.0
79,55238f70-fc6f-457b-8d3f-8a1679dc77e3,5ff55094-5811-4d9d-99db-58e1091cced6,Anna Z,0.0,1.0
84,4b8bb159-88fd-4a12-b72b-966981d385ef,0571ce3a-d4b7-430d-8c70-f2ac6c6f42da,Samantha K,0.0,1.0


In [232]:
### Remove row where client_id is '5ff55094-5811-4d9d-99db-58e1091cced6'
# NOTE(review): this only filters the exploratory `client_info` frame, which is
# not read again in the visible part of the notebook. The `chat_client_info`
# table used in the sections below still contains this client -- confirm
# whether the removal was meant to apply there as well.
client_info = client_info[client_info['client_id']!='5ff55094-5811-4d9d-99db-58e1091cced6']

In [233]:
# Pre-task survey rows for participant 6 -- one per session_id.
get_data_by_pid(chat_pre_task,6)

Unnamed: 0,_id,interaction_polite,interaction_dignity,interaction_respect,cognitive_demands,cognitive_resources,affect_valence,affect_arousal,client_param,session_id,timestamp
20,66ac18f50c9a67655550109a,1,0,1,3,3,1,0,name=Samantha%20K&domain=airlines&category=Res...,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:23:33.369
21,66ac2115f122a1c9aea2cff1,2,2,1,3,3,1,1,name=Luis%20H&domain=airlines&category=Policy&...,4b8bb159-88fd-4a12-b72b-966981d385ef,2024-08-01 23:58:13.316


In [234]:
### Remove row where session_id is '4b8bb159-88fd-4a12-b72b-966981d385ef'
### (participant 6's second session; only the pre-task response from
### session 55238f70-fc6f-457b-8d3f-8a1679dc77e3 is kept)
chat_pre_task = chat_pre_task[chat_pre_task['session_id']!='4b8bb159-88fd-4a12-b72b-966981d385ef']

In [235]:
# Post-task survey rows for participant 6, before de-duplication
# (several client_ids appear more than once).
get_data_by_pid(chat_post_task,6)

Unnamed: 0,_id,interaction_polite,interaction_dignity,interaction_respect,cognitive_demands,cognitive_resources,affect_valence,affect_arousal,support_effective,support_helpful,support_beneficial,support_adequate,support_sensitive,support_caring,support_understanding,support_supportive,client_id,session_id,timestamp
44,66ac1d670c9a6765555010ad,3,3,3,3,5,2,2,2,2,2,2,1,1,2,2,3e1f2059-4407-494c-be55-6eac76299913,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:42:31.323
45,66ac1f640c9a6765555010b3,-1,-1,-1,4,4,-1,1,1,1,1,1,1,1,1,1,7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:51:00.912
46,66ac1f640c9a6765555010b4,-1,-1,-1,4,4,-1,1,1,1,1,1,1,1,1,1,7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:51:00.977
47,66ac1f650c9a6765555010b5,-1,-1,-1,4,4,-1,1,1,1,1,1,1,1,1,1,7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:51:01.050
48,66ac1f690c9a6765555010b6,-1,-1,-1,4,4,-1,1,1,1,1,1,1,1,1,1,7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:51:05.812
49,66ac28df0c9a6765555010ca,-2,-2,-2,4,4,-2,1,0,1,1,1,0,0,1,-1,31633a46-f92e-4ecb-be08-580fb09eb876,4b8bb159-88fd-4a12-b72b-966981d385ef,2024-08-02 00:31:27.861
50,66ac300ef122a1c9aea2d005,-2,-2,-2,4,4,-2,1,0,1,1,1,0,0,1,-1,31633a46-f92e-4ecb-be08-580fb09eb876,4b8bb159-88fd-4a12-b72b-966981d385ef,2024-08-02 01:02:06.196


In [236]:
# For each client_id in chat_post_task, keep only the row with the earliest
# timestamp; any additional rows are erroneous duplicates.
chat_post_task = (
    chat_post_task
    .sort_values('timestamp')
    .drop_duplicates(subset=['client_id'], keep='first')
)

In [237]:
# Post-task survey rows for participant 6 after de-duplication: one row per client_id.
get_data_by_pid(chat_post_task,6)

Unnamed: 0,_id,interaction_polite,interaction_dignity,interaction_respect,cognitive_demands,cognitive_resources,affect_valence,affect_arousal,support_effective,support_helpful,support_beneficial,support_adequate,support_sensitive,support_caring,support_understanding,support_supportive,client_id,session_id,timestamp
44,66ac1d670c9a6765555010ad,3,3,3,3,5,2,2,2,2,2,2,1,1,2,2,3e1f2059-4407-494c-be55-6eac76299913,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:42:31.323
45,66ac1f640c9a6765555010b3,-1,-1,-1,4,4,-1,1,1,1,1,1,1,1,1,1,7d1e680e-7a9f-4f9e-a3cf-295cd4f9cfa8,55238f70-fc6f-457b-8d3f-8a1679dc77e3,2024-08-01 23:51:00.912
49,66ac28df0c9a6765555010ca,-2,-2,-2,4,4,-2,1,0,1,1,1,0,0,1,-1,31633a46-f92e-4ecb-be08-580fb09eb876,4b8bb159-88fd-4a12-b72b-966981d385ef,2024-08-02 00:31:27.861


In [238]:
# Per the special-case notes for this participant, the post-task response for
# client 0571ce3a-... is stored under client_id 31633a46-...; relabel it so it
# carries the client_id it belongs to.
chat_post_task = replace_client_id(chat_post_task, '31633a46-f92e-4ecb-be08-580fb09eb876' , '0571ce3a-d4b7-430d-8c70-f2ac6c6f42da')

## All Cases

In [239]:
# Narrow chat_client_info to the identifying columns, the condition flags
# (civil / emo) and the timestamp used further below.
chat_client_info = chat_client_info[['session_id','client_id','client_name','civil','emo','timestamp']]

In [240]:
# Attach the participant id (pid) to every table via session_id.
# NOTE(review): this cell overwrites its own inputs, so it is not safe to
# re-run on its own -- a second run would merge frames that already carry a
# 'pid' column. Re-run from the data-loading cell instead.
chat_client_info = join_to_participant_data(participant_log, chat_client_info)
chat_history = join_to_participant_data(participant_log, chat_history)
chat_in_task = join_to_participant_data(participant_log, chat_in_task)
chat_post_task = join_to_participant_data(participant_log, chat_post_task)
chat_pre_task = join_to_participant_data(participant_log, chat_pre_task)

In [241]:
# For each client_id in chat_history, keep only the turns in which two distinct
# senders appear, i.e. the turn_number has both a "representative" message and
# a "client" message.
chat_history_turns = chat_history.groupby(['client_id','turn_number']).sender.nunique().reset_index()
chat_history_turns = chat_history_turns[chat_history_turns['sender']==2]

# Merge on the key columns only. chat_history_turns also carries a 'sender'
# column (the distinct-sender count); merging it as well would make pandas
# rename chat_history's own 'sender' column to 'sender_x' / 'sender_y', which
# breaks later uses of `sender` and makes this cell fail when re-run.
turn_keys = ['client_id', 'turn_number']
chat_history = pd.merge(chat_history, chat_history_turns[turn_keys], on=turn_keys, how='inner')

In [242]:
# Only keep data for clients whose highest turn_number in chat_history is at
# least 2.
# NOTE(review): the earlier wording of this note said "greater than 2", and the
# per-participant checks above use `> 2` on chat_in_task, but the filter
# applied here is `>= 2`. Confirm which threshold is intended.
max_turns = chat_history.groupby(['client_id']).turn_number.max()
client_ids_w_max_turns = max_turns.loc[max_turns >= 2].index

chat_history, chat_in_task, chat_post_task, chat_client_info = (
    table[table['client_id'].isin(client_ids_w_max_turns)]
    for table in (chat_history, chat_in_task, chat_post_task, chat_client_info)
)

In [248]:
# Group chat_client_info by pid and, within each group, remove the row with the
# earliest timestamp. Participants listed in pid_no_practice are exempt: all of
# their rows are kept (presumably they had no practice client, going by the
# variable name -- confirm).
pid_no_practice = [2]
clients_simulation = (
    chat_client_info
    .groupby(['pid'])
    .apply(
        lambda x: x if x.name in pid_no_practice else x.sort_values('timestamp').iloc[1:],
        include_groups=False,
    )
    .client_id
    .unique()
    .tolist()
)

In [250]:
# Restrict every client-level table to the clients in clients_simulation.
chat_history, chat_in_task, chat_post_task, chat_client_info = (
    table[table['client_id'].isin(clients_simulation)]
    for table in (chat_history, chat_in_task, chat_post_task, chat_client_info)
)