In [2]:
# Added cell to set Working Directory to your location
import os
import re
import glob
import numpy as np
import pandas as pd
import json
import time

In [2]:
os.chdir("/Volumes/SSD/00_Data_Processing/Exploration_short")
os.getcwd()

'/Volumes/SSD/00_Data_Processing/Exploration_short'

In [3]:
# Columns to keep from the raw data.
cols_to_keep = ['timeStampDataPointStart', 'timeStampDataPointEnd', 'combinedGazeValidityBitmask', 'rayCastHitsCombinedEyes', 'hmdPosition.x', 'hmdPosition.y',
                'hmdPosition.z', 'hmdDirectionForward.x', 'hmdDirectionForward.y', 'hmdDirectionForward.z', 'hmdDirectionRight.x', 'hmdDirectionRight.y',
                'bodyTrackerRotation.x', 'bodyTrackerRotation.y','bodyTrackerRotation.z','playerBodyPosition.x', 'playerBodyPosition.y', 'playerBodyPosition.z']

In [4]:
data_path = '/Volumes/SSD/00_Data_Processing/Exploration_short'
processed_data = '/Volumes/SSD/00_Data_Processing/Pre_processed'

# Getting the Folder without hidden files in ascending order
DATA_FOLDER = sorted([f for f in os.listdir(data_path) if not f.startswith('.')], key=str.lower)
PROCESSED_DATA_FOLDER = sorted([f for f in os.listdir(processed_data) if not f.startswith('.')], key=str.lower)
subIDs = []
for sub in DATA_FOLDER:
    if sub[0:4].isdigit():
        subIDs.append(int(sub[0:4]))
    else:
        pass
subIDs = np.unique(subIDs)
#Sincesome participant IDs start with 0, we format them to show it in the string type
IDstrings = ['{:04d}'.format(id) for id in subIDs]
print(IDstrings)

['8673', '8695', '9472']


In [5]:
#Create a generalized path for all json files per participant
paths = [ID +  "/*.json" for ID in IDstrings]
print(paths)
#Create a sorted list of the paths to open de jsons
Sorted_individual_jsons = sorted([filename for path in paths for filename in glob.glob(path)], key=str.lower)

['8673/*.json', '8695/*.json', '9472/*.json']


In [6]:
data_raw = pd.DataFrame()

# read every file name in folder
for path in paths:
    for filename in glob.glob(path):
        with open(filename, 'r') as file:
            try:
                # make json files parsable
                data = "[" + file.read()
                data = data[:len(data)] + "]"
                raw = json.loads(data)
            except:
                print("reading did not work")
                

            # Uneast the higher level of each file
            currentDF_raw = pd.json_normalize(raw[0]['trials'][0]['dataPoints'])
            print( "Subject " + str(filename[5:9]) + " Session " + str(filename[17:19]) +" Section " + str(filename[23:24]) + " has been normalized")
            #Reduce columns to just necessary information
            reduced_data = currentDF_raw.loc[:, cols_to_keep]
            print('time is: ', time.ctime())

            # insert participant id and session information from the file name
            reduced_data.insert(0, "SubjectID", [int(filename[5:9])] * reduced_data.shape[0], True)
            reduced_data.insert(1, "Session", [int(filename[17:19])] * reduced_data.shape[0], True)
            reduced_data.insert(2, "SessionSubsection", [int(filename[23:24])] * reduced_data.shape[0], True)
        data_raw = data_raw.append(reduced_data, ignore_index=True)
        print('Appended')
    data_raw.to_csv('/Volumes/SSD/00_Data_Processing/Pre_processed/00_Individuals_Flat/' + str(filename[5:9]) + ".csv")
    print('Saved')
    data_raw = pd.DataFrame()

Subject 8673 Session 05 Section 3 has been normalized
time is:  Wed Nov 30 13:12:48 2022
Appended
Subject 8673 Session 01 Section 1 has been normalized
time is:  Wed Nov 30 13:13:31 2022
Appended
Subject 8673 Session 01 Section 2 has been normalized
time is:  Wed Nov 30 13:14:19 2022
Appended
Subject 8673 Session 01 Section 3 has been normalized
time is:  Wed Nov 30 13:15:03 2022
Appended
Subject 8673 Session 02 Section 1 has been normalized
time is:  Wed Nov 30 13:15:49 2022
Appended
Subject 8673 Session 02 Section 2 has been normalized
time is:  Wed Nov 30 13:16:37 2022
Appended
Subject 8673 Session 02 Section 3 has been normalized
time is:  Wed Nov 30 13:17:25 2022
Appended
Subject 8673 Session 03 Section 1 has been normalized
time is:  Wed Nov 30 13:18:12 2022
Appended
Subject 8673 Session 03 Section 2 has been normalized
time is:  Wed Nov 30 13:19:04 2022
Appended
Subject 8673 Session 03 Section 3 has been normalized
time is:  Wed Nov 30 13:19:57 2022
Appended
Subject 8673 Session

In [9]:
exploded = reduced_data['rayCastHitsCombinedEyes'].explode().apply(pd.Series)

In [3]:
filenames = glob.glob("/Volumes/SSD/00_Data_Processing/Pre_processed/00_Individuals_Flat/*.csv")                    
combined_csv = pd.concat( [ pd.read_csv(f) for f in filenames ] )

In [6]:
combined_csv.to_csv("/Volumes/SSD/00_Data_Processing/Pre_processed/combined_csv.csv")

In [4]:
combined_csv.head()

Unnamed: 0.1,Unnamed: 0,SubjectID,Session,SessionSubsection,timeStampDataPointStart,timeStampDataPointEnd,combinedGazeValidityBitmask,rayCastHitsCombinedEyes,hmdPosition.x,hmdPosition.y,...,hmdDirectionForward.y,hmdDirectionForward.z,hmdDirectionRight.x,hmdDirectionRight.y,bodyTrackerRotation.x,bodyTrackerRotation.y,bodyTrackerRotation.z,playerBodyPosition.x,playerBodyPosition.y,playerBodyPosition.z
0,0,479,5,3,1653464000.0,1653464000.0,3,[{'hitPointOnObject': {'x': -73.16242218017578...,-94.648758,-0.40934,...,0.027602,0.958554,0.958661,0.016425,0.0,0.0,0.0,-94.788467,-1.957432,-138.283386
1,1,479,5,3,1653464000.0,1653464000.0,3,[{'hitPointOnObject': {'x': -73.16242218017578...,-94.648758,-0.40934,...,0.027602,0.958554,0.958661,0.016425,0.0,0.0,0.0,-94.788467,-1.957432,-138.283386
2,2,479,5,3,1653464000.0,1653464000.0,3,[{'hitPointOnObject': {'x': -72.61051940917969...,-94.648758,-0.409452,...,0.026794,0.958753,0.958849,0.016187,0.0,0.0,0.0,-94.788467,-1.957432,-138.283386
3,3,479,5,3,1653464000.0,1653464000.0,3,[{'hitPointOnObject': {'x': -72.61051940917969...,-94.648758,-0.409452,...,0.026794,0.958753,0.958849,0.016187,0.0,0.0,0.0,-94.788467,-1.957432,-138.283386
4,4,479,5,3,1653464000.0,1653464000.0,3,[{'hitPointOnObject': {'x': -72.95903778076172...,-94.648819,-0.409578,...,0.024862,0.959344,0.95941,0.015804,0.0,0.0,0.0,-94.788467,-1.957432,-138.283386


In [5]:
combined_csv.shape

(26914800, 22)

In [10]:
emptyDF1 = pd.DataFrame(np.nan,index=[0], columns= columns1)
emptyDF2 = pd.DataFrame(np.nan,index=[0], columns= columns2)

NameError: name 'columns1' is not defined

In [None]:
# data loop through all subjects and sessions
subcount = 0
for subject in subIDs:
    subcount +=1
    print('Subject '
          + str(subject)
          + ' started - '
          + str(subcount)
          + '/'
          + str(len(subIDs))
          + ' subjects')
    #     # Create empty dataframe for later concatenation
    # complete_exploration_df = pd.DataFrame(columns = col_names)
    #     complete_exploration_df.head()


    # change dir into the subject folder
    CURRENT_SUBJECT_FOLDER = sorted([f for f in os.listdir(DATA_PATH+str(subject))], key=str.lower)
    # get the data files according to the subject, ignoring OnQuit files
    subject_files = sorted([f for f in CURRENT_SUBJECT_FOLDER
                            if f.startswith(str(subject)+'_Expl_S_') and f.endswith("OnQuit.json") == False],
                           key=str.lower)

    # the following works as long as the data name format is as follows:
    # 'subjectID'_Expl_S_'SessionNumber'_ET_'EyeTrackingSessionNumber'_'UnixTimestamp'.json
    folder_files = list()

    # loop through the subject folder and save all numbers
    for file in subject_files:
        folder_files.append(re.findall(r'\d+', file))

    # Extract all SubIDs (only one), SessionNumbers, ET_SessionNumbers (and Timestamps)
    try:
        SubID, SessionNumbers, ET_SessionNumbers, UnixTimestamp1, UnixTimeStamp2 = map(list, zip(*folder_files))
    except:
        print('\tSubject '
              + str(subject)
              + ' Filename is not valid!')

    #     print(SubID)
    #     print(SessionNumbers)
    #     print(ET_SessionNumbers)
    #     print(UnixTimestamp1)
    #     print(UnixTimeStamp2)

    session_number = int(max(SessionNumbers)) # the maximum session number of the particular subject
    ET_session_number = int(max(ET_SessionNumbers)) # the maximum ET session number of the particular subject


    # print info of how many files were found

    print(len(SubID), ' files were found for participant ', SubID[0])
    print('A maximum of ', session_number, 'sessions were found and will be processed')

    # --------- second layer - exploration session loop ---------

    # loop over exploration sessions
    for EXP_session in range(session_number):
        # to avoid start at 0
        EXP_session +=1

        # extract the exploration data files for each session - but exclude OnQuit files
        subject_data = sorted([f for f in CURRENT_SUBJECT_FOLDER if f.startswith(str(subject) + '_Expl_S_')
                               and f.endswith("OnQuit.json") == False], key=str.lower)


        print("\tTotal Sessionfiles: "
              + str(len(subject_data))
              + " - Exploration Session "
              + str(EXP_session))

        ET_session_count = 0 # session count
        # --------- third layer - eye tracking session loop ---------

        # loop over separate eye tracking sessions
        for fileName in subject_data:
            ET_session_count+=1

            print('load data of file ', fileName)

            print('Path: ', DATA_PATH + str(subject) + '/' + fileName)
            # open the JSON file as dictionary
            with open(DATA_PATH + str(subject) + '/' + fileName) as datafile:
                try:
                    print("read file")
                    dataR = '['+ datafile.read()
                    dataR = dataR[:len(dataR)] + "]"
                except:
                    print("reading did not work")

                subject_session = json.loads(dataR)
                print("data loaded")
                print('time is: ', time.ctime())



            ##################################################################################################################

            # Data flattening part:
            # first save the overall trial information


            infoDF = pd.json_normalize(subject_session[0]['trials'][0])
            infoDF = infoDF.drop(columns=['dataPoints'])
            infoDF.insert(0,'FileInfo',fileName[0:18])
            infoDF.to_csv(PROCESSED_DATA_PATH + fileName[0:18] + '_infoSummary.csv', index = False)
            print('trial info saved')


            # flatten the majority of the variables into currentDF data frame
            currentDF_raw = pd.json_normalize(subject_session[0]['trials'][0]['dataPoints'])

            # remove the 'rayCastHitsCombinedEyes' column as it still contains a nested data structure
            dataDF = currentDF_raw.drop(columns=['rayCastHitsCombinedEyes'])

            # create an empty data frame of the required size
            rayCastData_df = pd.DataFrame(np.nan,index=range(len(subject_session[0]['trials'][0]['dataPoints'])), columns= columnsRCall)

            # now loop through the individual trials and flatten the data
            for index in range(len(subject_session[0]['trials'][0]['dataPoints'])):
                # depending on the size of the ray cast data - flatten data and appand it to currentDF data frame
                # the variables are renamed to make the differentiation of first and second order collider hits more intuitive
                #lengthRCData = len(subject_session[0]['trials'][0]['dataPoints'][index]['rayCastHitsCombinedEyes'][0])
                lengthRCData = len(currentDF_raw.at[index,'rayCastHitsCombinedEyes'])


                if lengthRCData ==0: #case: no ray cast data is available = no collider was hit

                    combineDF = pd.concat([emptyDF1, emptyDF2], axis=1)
                    combineDF.insert(len(combineDF.columns), 'DataRow',index)


                elif lengthRCData == 1: # case: only one collider was hit, there is no secondary hit

                    pdRC1= pd.json_normalize(currentDF_raw.at[index,'rayCastHitsCombinedEyes'][0]).rename(
                        columns = {'hitObjectColliderName':'hitObjectColliderName_1',
                                   'hitColliderType':'hitColliderType_1',
                                   'ordinalOfHit':'ordinalOfHit_1',
                                   'hitPointOnObject.x':'hitPointOnObject.x_1',
                                   'hitPointOnObject.y':'hitPointOnObject.y_1',
                                   'hitPointOnObject.z':'hitPointOnObject.z_1',
                                   'hitObjectColliderBoundsCenter.x':'hitObjectColliderBoundsCenter.x_1',
                                   'hitObjectColliderBoundsCenter.y':'hitObjectColliderBoundsCenter.y_1',
                                   'hitObjectColliderBoundsCenter.z':'hitObjectColliderBoundsCenter.z_1'})
                    combineDF = pd.concat([pdRC1, emptyDF2], axis=1)
                    combineDF.insert(len(combineDF.columns), 'DataRow',index)

                elif lengthRCData == 2: # case: two collider were hit

                    pdRC1= pd.json_normalize(currentDF_raw.at[index,'rayCastHitsCombinedEyes'][0]).rename(
                        columns = {'hitObjectColliderName':'hitObjectColliderName_1',
                                   'hitColliderType':'hitColliderType_1',
                                   'ordinalOfHit':'ordinalOfHit_1',
                                   'hitPointOnObject.x':'hitPointOnObject.x_1',
                                   'hitPointOnObject.y':'hitPointOnObject.y_1',
                                   'hitPointOnObject.z':'hitPointOnObject.z_1',
                                   'hitObjectColliderBoundsCenter.x':'hitObjectColliderBoundsCenter.x_1',
                                   'hitObjectColliderBoundsCenter.y':'hitObjectColliderBoundsCenter.y_1',
                                   'hitObjectColliderBoundsCenter.z':'hitObjectColliderBoundsCenter.z_1'})
                    pdRC2 = pd.json_normalize(currentDF_raw.at[index,'rayCastHitsCombinedEyes'][1]).rename(
                        columns = {'hitObjectColliderName':'hitObjectColliderName_2',
                                   'hitColliderType':'hitColliderType_2',
                                   'ordinalOfHit':'ordinalOfHit_2',
                                   'hitPointOnObject.x':'hitPointOnObject.x_2',
                                   'hitPointOnObject.y':'hitPointOnObject.y_2',
                                   'hitPointOnObject.z':'hitPointOnObject.z_2',
                                   'hitObjectColliderBoundsCenter.x':'hitObjectColliderBoundsCenter.x_2',
                                   'hitObjectColliderBoundsCenter.y':'hitObjectColliderBoundsCenter.y_2',
                                   'hitObjectColliderBoundsCenter.z':'hitObjectColliderBoundsCenter.z_2'})
                    combineDF = pd.concat([pdRC1, pdRC2], axis=1)
                    combineDF.insert(len(combineDF.columns), 'DataRow',index)


                else:
                    print('!!!an exception occured in the ray cast data flattening in trial ', index)

                # now add the new data row to the data overview
                # rayCastData_df = [rayCastData_df]


                rayCastData_df.loc[index] = combineDF.loc[0]

            flatData_df = pd.concat([dataDF,rayCastData_df],axis=1)

            print('saving data')
            flatData_df.to_csv(PROCESSED_DATA_PATH + fileName[0:18] + '_flattened.csv', index = False)
            print('data saved')
            print('time is: ', time.ctime())

In [None]:
currentDF_raw.rayCastHitsCombinedEyes[0]

In [None]:
type(CURRENT_SUBJECT_FOLDER)

In [None]:
len(CURRENT_SUBJECT_FOLDER)

In [None]:
for file in CURRENT_SUBJECT_FOLDER:
    CURRENT_SUBJECT_FOLDER.append(re.findall(r'\d+', file))

    # Extract all SubIDs (only one), SessionNumbers, ET_SessionNumbers (and Timestamps)
try:
    SubID, SessionNumbers, ET_SessionNumbers, UnixTimestamp1, UnixTimeStamp2 = map(list, zip(*folder_files))
except:
    print('\tSubject '
          + str(subject)
          + ' Filename is not valid!')

In [None]:
 subject_files = sorted([file for file in CURRENT_SUBJECT_FOLDER if file.startswith(str(subject)+'_Expl_S_') and file.endswith("OnQuit.json") == False],
                        key=str.lower)

# the following works as long as the data name format is as follows:
# 'subjectID'_Expl_S_'SessionNumber'_ET_'EyeTrackingSessionNumber'_'UnixTimestamp'.json
folder_files = list()

# loop through the subject folder and save all numbers
for file in subject_files:
    folder_files.append(re.findall(r'\d+', file))

# Extract all SubIDs (only one), SessionNumbers, ET_SessionNumbers (and Timestamps)
try:
    SubID, SessionNumbers, ET_SessionNumbers, UnixTimestamp1, UnixTimeStamp2 = map(list, zip(*folder_files))
except:
    print('\tSubject '
          + str(subject)
          + ' Filename is not valid!')

In [None]:
session_number = int(max(SessionNumbers)) # the maximum session number of the particular subject
ET_session_number = int(max(ET_SessionNumbers)) # the maximum ET session number of the particular subject


# print info of how many files were found

print(len(SubID), ' files were found for participant ', SubID[0])
print('A maximum of ', session_number, 'sessions were found and will be processed')

In [None]:
OneFile = "/Volumes/SSD/Test_Exploration/9502/9502_Expl_S_01_ET_1_1641807659.90466.json"
with open(OneFile) as datafile:
    try:
        print("read file")
        dataR = '['+ datafile.read()
        dataR = dataR[:len(dataR)] + "]"
    except:
        print("reading did not work")

    subject_session = json.loads(dataR)

currentDF_raw = pd.json_normalize(subject_session[0]['trials'][0]['dataPoints'])

In [None]:
currentDF_raw.rayCastHitsCombinedEyes

In [None]:
currentDF_raw.columns

In [None]:
rayCastData_df = pd.DataFrame(np.nan,index=range(len(subject_session[0]['trials'][0]['dataPoints'])), columns= columnsRCall)

In [None]:
# flatten the majority of the variables into currentDF data frame
currentDF_raw = pd.json_normalize(subject_session[0]['trials'][0]['dataPoints'])

# remove the 'rayCastHitsCombinedEyes' column as it still contains a nested data structure
dataDF = currentDF_raw.drop(columns=['rayCastHitsCombinedEyes'])

# create an empty data frame of the required size
rayCastData_df = pd.DataFrame(np.nan,index=range(len(subject_session[0]['trials'][0]['dataPoints'])), columns= columnsRCall)

# now loop through the individual trials and flatten the data
for index in range(len(subject_session[0]['trials'][0]['dataPoints'])):
    # depending on the size of the ray cast data - flatten data and appand it to currentDF data frame
    # the variables are renamed to make the differentiation of first and second order collider hits more intuitive
    #lengthRCData = len(subject_session[0]['trials'][0]['dataPoints'][index]['rayCastHitsCombinedEyes'][0])
    lengthRCData = len(currentDF_raw.at[index,'rayCastHitsCombinedEyes'])


    if lengthRCData ==0: #case: no ray cast data is available = no collider was hit

        combineDF = pd.concat([emptyDF1, emptyDF2], axis=1)
        combineDF.insert(len(combineDF.columns), 'DataRow',index)


    elif lengthRCData == 1: # case: only one collider was hit, there is no secondary hit

        pdRC1= pd.json_normalize(currentDF_raw.at[index,'rayCastHitsCombinedEyes'][0]).rename(
            columns = {'hitObjectColliderName':'hitObjectColliderName_1',
                       'ordinalOfHit':'ordinalOfHit_1',
                       'hitPointOnObject.x':'hitPointOnObject.x_1',
                       'hitPointOnObject.y':'hitPointOnObject.y_1',
                       'hitPointOnObject.z':'hitPointOnObject.z_1',
                       'hitObjectColliderBoundsCenter.x':'hitObjectColliderBoundsCenter.x_1',
                       'hitObjectColliderBoundsCenter.y':'hitObjectColliderBoundsCenter.y_1',
                       'hitObjectColliderBoundsCenter.z':'hitObjectColliderBoundsCenter.z_1'})
        combineDF = pd.concat([pdRC1, emptyDF2], axis=1)
        combineDF.insert(len(combineDF.columns), 'DataRow',index)

    elif lengthRCData == 2: # case: two collider were hit

        pdRC1= pd.json_normalize(currentDF_raw.at[index,'rayCastHitsCombinedEyes'][0]).rename(
            columns = {'hitObjectColliderName':'hitObjectColliderName_1',
                       'ordinalOfHit':'ordinalOfHit_1',
                       'hitPointOnObject.x':'hitPointOnObject.x_1',
                       'hitPointOnObject.y':'hitPointOnObject.y_1',
                       'hitPointOnObject.z':'hitPointOnObject.z_1',
                       'hitObjectColliderBoundsCenter.x':'hitObjectColliderBoundsCenter.x_1',
                       'hitObjectColliderBoundsCenter.y':'hitObjectColliderBoundsCenter.y_1',
                       'hitObjectColliderBoundsCenter.z':'hitObjectColliderBoundsCenter.z_1'})
        pdRC2 = pd.json_normalize(currentDF_raw.at[index,'rayCastHitsCombinedEyes'][1]).rename(
            columns = {'hitObjectColliderName':'hitObjectColliderName_2',
                       'ordinalOfHit':'ordinalOfHit_2',
                       'hitPointOnObject.x':'hitPointOnObject.x_2',
                       'hitPointOnObject.y':'hitPointOnObject.y_2',
                       'hitPointOnObject.z':'hitPointOnObject.z_2',
                       'hitObjectColliderBoundsCenter.x':'hitObjectColliderBoundsCenter.x_2',
                       'hitObjectColliderBoundsCenter.y':'hitObjectColliderBoundsCenter.y_2',
                       'hitObjectColliderBoundsCenter.z':'hitObjectColliderBoundsCenter.z_2'})
        combineDF = pd.concat([pdRC1, pdRC2], axis=1)
    else:
        print('!!!an exception occured in the ray cast data flattening in trial ', index)

        # now add the new data row to the data overview
        # rayCastData_df = [rayCastData_df]


    rayCastData_df.loc[index] = combineDF.loc[0]

    flatData_df = pd.concat([dataDF,rayCastData_df],axis=1)

In [None]:
combineDF

In [None]:
rayCastData_df

In [None]:
reduced_data = currentDF_raw.loc[:, cols_to_keep]

In [None]:
reduced_data

In [None]:
reduced_data.explode('rayCastHitsCombinedEyes', ignore_index=True)

In [None]:
reduced_data.rayCastHitsCombinedEyes[0]

In [None]:
exploded = reduced_data['rayCastHitsCombinedEyes'].explode().apply(pd.Series)

In [None]:
exploded.groupby(['hitObjectColliderName', 'hitColliderType']).count()['ordinalOfHit']

In [None]:
exploded