In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
#Read the datafile
seqlearn_table = pd.read_csv("10_subject_data.csv")
seqlearn_table.head(5)

Unnamed: 0,Event Index,UTC Timestamp,UTC Date,Local Timestamp,Local Timezone,Local Date,Experiment ID,Experiment Version,Tree Node Key,Repeat Key,...,video_fname,ANSWER,tour_string,trial_number,phase_id,stim_type,mascot_name,action_bare,video_context,action_number
0,1,1614121000000.0,23/02/2021 22:48:30,1614121000000.0,-5.0,23/02/2021 17:48:29,43256.0,6.0,task-z7dv,,...,,,,,,,,,,
1,2,1614121000000.0,23/02/2021 22:51:37,1614121000000.0,-5.0,23/02/2021 17:51:36,43256.0,6.0,task-z7dv,,...,none,,,0.0,overall_instructions,none,none,none,none,0.0
2,3,1614121000000.0,23/02/2021 22:51:38,1614121000000.0,-5.0,23/02/2021 17:51:37,43256.0,6.0,task-z7dv,,...,none,,,0.0,overall_instructions,none,none,none,none,0.0
3,4,1614121000000.0,23/02/2021 22:51:39,1614121000000.0,-5.0,23/02/2021 17:51:38,43256.0,6.0,task-z7dv,,...,none,,,0.0,overall_instructions,none,none,none,none,0.0
4,5,1614121000000.0,23/02/2021 22:51:41,1614121000000.0,-5.0,23/02/2021 17:51:40,43256.0,6.0,task-z7dv,,...,none,,,0.0,overall_instructions,none,none,none,none,0.0


In [6]:
# Remove white spaces in the column names and make them into strings
seqlearn_table.columns = seqlearn_table.columns.str.replace(' ', '_')
seqlearn_table.columns

Index(['Event_Index', 'UTC_Timestamp', 'UTC_Date', 'Local_Timestamp',
       'Local_Timezone', 'Local_Date', 'Experiment_ID', 'Experiment_Version',
       'Tree_Node_Key', 'Repeat_Key', 'Schedule_ID', 'Participant_Public_ID',
       'Participant_Private_ID', 'Participant_Starting_Group',
       'Participant_Status', 'Participant_Completion_Code',
       'Participant_External_Session_ID', 'Participant_Device_Type',
       'Participant_Device', 'Participant_OS', 'Participant_Browser',
       'Participant_Monitor_Size', 'Participant_Viewport_Size', 'Checkpoint',
       'Task_Name', 'Task_Version', 'checkpoint-afi1', 'checkpoint-z3dm',
       'checkpoint-na1f', 'counterbalance-cx13', 'checkpoint-ox9t',
       'Spreadsheet', 'Spreadsheet_Name', 'Spreadsheet_Row', 'Trial_Number',
       'Screen_Number', 'Screen_Name', 'Zone_Name', 'Zone_Type',
       'Reaction_Time', 'Reaction_Onset', 'Response_Type', 'Response',
       'Attempt', 'Correct', 'Incorrect', 'Dishonest', 'X_Coordinate',
       '

In [7]:
#selecting relevant columns
df = seqlearn_table.loc[:, ["phase_id", "mascot_name", "video_context", "Response", 'stim_type']]

#dropping all columns except recog_random and recog_ordered
data_recog = df[(df.phase_id == "recog_random") | (df.phase_id == "recog_ordered")]

data_recog

Unnamed: 0,phase_id,mascot_name,video_context,Response,stim_type
185,recog_random,Kounudoun,performance,,studied
186,recog_random,Kounudoun,performance,sure old,studied
187,recog_random,Kounudoun,performance,,studied
188,recog_random,Jarbo,alley,sure old,studied
189,recog_random,Jarbo,alley,,studied
...,...,...,...,...,...
9383,recog_random,Jarbo,roof,maybe old,studied
9384,recog_random,Jarbo,roof,,studied
9385,recog_random,Jarbo,ammusement-ride-2,,studied
9386,recog_random,Jarbo,ammusement-ride-2,maybe old,studied


In [8]:
# Dropping NaN values
clean_data_recog = data_recog.dropna()
clean_data_recog
print(clean_data_recog.dtypes)

phase_id         object
mascot_name      object
video_context    object
Response         object
stim_type        object
dtype: object


In [9]:
# look at the number of different kinds of responses across participants
test = clean_data_recog.groupby(['video_context', 'stim_type'])['Response'].value_counts()
test.head(60)

video_context      stim_type  Response 
3/1/01             lure       guess new     1
                              sure old      1
                   studied    maybe old     2
                              guess new     1
                              guess old     1
                              maybe new     1
                              sure new      1
3/2/01             lure       sure new      3
                              guess new     2
                              guess old     2
                   studied    maybe old     1
CPR                lure       sure new      7
                              maybe old     2
                   studied    sure old      3
                              maybe old     2
                              guess old     1
alley              lure       maybe new     2
                              sure new      1
                   studied    maybe old     2
                              sure old      2
                              guess new 

* Determine whether response is a Hit, False Alarm, Correct rejection, or Incorrect rejection
* create an empty column
* when studied and sure/maybe/guess old, = HIT in a new column
* when studied and sure/maybe/guess new, = False alarm in a new column
* when lure and sure/maybe/guess old, = False alarm in a new column
* alternatively, could this become a boolean mask where TRUE = hit?
* calculate total responses per video

In [11]:
# When stim_type = studied and response is sure/maybe/guess old, add value 'hit' in the same row in a new column

recog_test_data = pd.DataFrame(clean_data_recog, columns = ['Response', 'stim_type', 'video_context', 'response_type'])
recog_test_data

Unnamed: 0,Response,stim_type,video_context,response_type
186,sure old,studied,performance,
188,sure old,studied,alley,
192,sure new,lure,hallway,
194,sure new,lure,street-fair-1,
197,sure old,studied,outside-building,
...,...,...,...,...
9374,sure old,studied,outside-store,
9377,sure old,studied,standing-behind-cake,
9380,sure old,studied,park-with-audience,
9383,maybe old,studied,roof,


In [12]:
recog_test_data['response_type'] = np.where((recog_test_data['stim_type'] == 'studied') & ((recog_test_data['Response'] == 'sure old') | (recog_test_data['Response'] == 'maybe old') | (recog_test_data['Response'] == 'guess old')), True, False)
recog_test_data

Unnamed: 0,Response,stim_type,video_context,response_type
186,sure old,studied,performance,True
188,sure old,studied,alley,True
192,sure new,lure,hallway,False
194,sure new,lure,street-fair-1,False
197,sure old,studied,outside-building,True
...,...,...,...,...
9374,sure old,studied,outside-store,True
9377,sure old,studied,standing-behind-cake,True
9380,sure old,studied,park-with-audience,True
9383,maybe old,studied,roof,True


In [13]:
# Add hits to the new column
recog_test_data['response_type'] = recog_test_data['response_type'].replace(True, 'Hit')
recog_test_data['response_type'] = recog_test_data['response_type'].replace(False, 'Miss/FA/CR')
recog_test_data

Unnamed: 0,Response,stim_type,video_context,response_type
186,sure old,studied,performance,Hit
188,sure old,studied,alley,Hit
192,sure new,lure,hallway,Miss/FA/CR
194,sure new,lure,street-fair-1,Miss/FA/CR
197,sure old,studied,outside-building,Hit
...,...,...,...,...
9374,sure old,studied,outside-store,Hit
9377,sure old,studied,standing-behind-cake,Hit
9380,sure old,studied,park-with-audience,Hit
9383,maybe old,studied,roof,Hit


In [14]:
hit_vals = recog_test_data.groupby(['video_context'])['response_type'].value_counts()
hit_vals

video_context  response_type
3/1/01         Miss/FA/CR       5
               Hit              3
3/2/01         Miss/FA/CR       7
               Hit              1
CPR            Miss/FA/CR       9
                               ..
workout        Hit              1
zoo-1          Miss/FA/CR       4
               Hit              2
zoo-2          Miss/FA/CR       7
               Hit              2
Name: response_type, Length: 376, dtype: int64

In [15]:
# total number of responses per video 
print(clean_data_recog['video_context'].value_counts())

exercise-class            80
announcers                35
beach                     35
park                      34
dance-with-large-group    31
                          ..
town                       3
pig-pen                    3
farm                       3
walking-in-street          3
spinning-soccer-ball       2
Name: video_context, Length: 194, dtype: int64


In [16]:
# can we find a way to put both of these values into a dataframe so that we can calculate hit rate?

hit = pd.DataFrame(recog_test_data.groupby(['video_context'])['response_type'].value_counts(), columns = ['response_type', 'resp_type_total'])
hit
# maybe I could convert the value counts of video context into their own series and then incorperate them as the next column. 

Unnamed: 0_level_0,Unnamed: 1_level_0,response_type,resp_type_total
video_context,response_type,Unnamed: 2_level_1,Unnamed: 3_level_1
3/1/01,Miss/FA/CR,5,
3/1/01,Hit,3,
3/2/01,Miss/FA/CR,7,
3/2/01,Hit,1,
CPR,Miss/FA/CR,9,
...,...,...,...
workout,Hit,1,
zoo-1,Miss/FA/CR,4,
zoo-1,Hit,2,
zoo-2,Miss/FA/CR,7,


In [17]:
# for every value corresponding to a unique video context, add to the resp_type_total column the sum

In [18]:
# https://stackoverflow.com/questions/30244952/how-do-i-create-a-new-column-from-the-output-of-pandas-groupby-sum
hit['resp_type_total'] = hit['response_type'].transform('sum')

ValueError: Function did not transform

In [19]:
data = pd.DataFrame(clean_data_recog, columns = ['Response', 'stim_type', 'video_context', 'response_type'])
recog_test_data

Unnamed: 0,Response,stim_type,video_context,response_type
186,sure old,studied,performance,Hit
188,sure old,studied,alley,Hit
192,sure new,lure,hallway,Miss/FA/CR
194,sure new,lure,street-fair-1,Miss/FA/CR
197,sure old,studied,outside-building,Hit
...,...,...,...,...
9374,sure old,studied,outside-store,Hit
9377,sure old,studied,standing-behind-cake,Hit
9380,sure old,studied,park-with-audience,Hit
9383,maybe old,studied,roof,Hit
