In [None]:
import glob
import math
import os
import re
import warnings
from io import StringIO

import numpy as np
import pandas as pd
warnings.simplefilter(action='ignore', category=FutureWarning)

# Extracting information from the randomization file

In [None]:
# Set the working directory that holds the data folders.
# The location can be changed without editing the notebook by setting the
# POINTING_TASK_DIR environment variable; the original folder stays the default.
os.chdir(os.environ.get('POINTING_TASK_DIR', '/Users/tracysanchezpacheco/Documents/'))
os.getcwd()

## Importing the individual randomization files per participant

In [None]:
## Extracting all subject IDs from the data folder
# NOTE(review): `path` is not assigned anywhere above this cell in the visible
# notebook (its first assignment is in the "Loading performance data" section),
# so on a fresh kernel this cell would raise NameError. The fixed offsets below
# imply a glob pattern with a 15-character prefix -- TODO confirm and define it
# before this cell.
subIDs = []
for sub in glob.glob(path):
    # Characters 15..18 of the matched path are read as the subject ID; paths
    # whose character at offset 15 is not a digit are skipped.
    if sub[15].isdigit():
        subIDs.append(int(sub[15:19]))
    else:
        pass
# np.unique de-duplicates and sorts the collected IDs.
subIDs = np.unique(subIDs)
print(subIDs)

In [None]:
# Load every randomization file into one DataFrame.
# NOTE(review): `path` must already hold the glob pattern of the randomization
# files; it is not assigned above this cell in the visible notebook -- confirm.
subject_frames = []

# read every file name in folder
for filename in glob.glob(path):
    with open(filename, 'r') as file:
        # make json files parsable: wrap the content in brackets and drop the
        # final character (presumably a trailing comma -- TODO confirm)
        data = "[" + file.read()
        data = data[:len(data)-1] + "]"

    # read data per file; StringIO because passing a literal JSON string to
    # read_json is deprecated in recent pandas versions
    subjectdf = pd.read_json(StringIO(data))

    # insert participant id in every line (characters 15..18 of the file path);
    # pandas broadcasts the scalar to all rows
    subjectdf.insert(0, "SubjectID", int(filename[15:19]), True)
    subject_frames.append(subjectdf)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; concatenate once instead. No files still gives an empty frame.
df = pd.concat(subject_frames, ignore_index=True) if subject_frames else pd.DataFrame()
df.head(8)

In [None]:
# Preview the last 10 rows of the loaded randomization data.
df.tail(10)

## Detangling starting positions

`.explode('VariableName')` is a method that transforms each element of a list-like column
(**in this case the list of start locations contained in `PointingTaskStartingLocations`**) into its own row, replicating the index values.

In [None]:
# Preview: one row per starting location, renumbered with a fresh 0..n-1 index.
# explode already returns a DataFrame, so no extra wrapping is needed.
StartingPoints = df.explode('PointingTaskStartingLocations', ignore_index=True)
StartingPoints.head(29)

In [None]:
# (rows, columns) after exploding the starting locations.
StartingPoints.shape

In [None]:
# Working copy: same explode as the preview, but the original row labels are
# kept (each label is repeated once per starting location).
StartingPoints = df.explode('PointingTaskStartingLocations')

### Creating StartingPoint ID

We have 28 starting locations. `np.arange(1,29,1)` creates the sequence 1, 2, …, 28 (increments of 1), and `np.tile` repeats that sequence once per participant in the data set, i.e. `len(StartingPoints.ParticipantID.unique())` times.

In [None]:
# Number the starting locations 1..28 and repeat that sequence once per
# participant. The assignment is positional: it relies on every participant
# contributing exactly 28 consecutive rows.
StartingPoints['StartPointID'] = np.tile(
    np.arange(1, 29),
    StartingPoints['ParticipantID'].unique().size,
)
StartingPoints.head()

### Exploding the target places accordingly 

Since it's necessary to explode 28 different variables over specific row ranges, this process starts by creating a list of the future row-range conditionals `ExploteID` and a list of the variables to explode `ColumnList`.

These two lists are then merged into a dictionary `ExploteDic`.

In [None]:
# One query condition per starting location:
# 'StartPointID == 1', ..., 'StartPointID == 28'
ExploteID = ['StartPointID == ' + str(point_id) for point_id in range(1, 29)]

In [None]:
# Names of the columns to explode, selected by position (columns 7..34).
# NOTE(review): positional selection depends on the exact column layout of the
# randomization files -- confirm these are the 28 PointingTaskTargets_* columns.
ColumnList = StartingPoints.columns[7:35].tolist()

In [None]:
# Pair every query condition with the column to explode for the rows it selects.
ExploteDic = {condition: column for condition, column in zip(ExploteID, ColumnList)}
ExploteDic

In [None]:
def explote_multiple(dataframe, dic):

    """ Explode a different list-like column for each subset of rows and stack the results.

    For every ``condition -> column`` pair in ``dic`` the rows selected by
    ``dataframe.query(condition)`` are exploded on ``column``. The exploded
    subsets are concatenated row-wise in the order of the dictionary. Rows that
    match none of the conditions do not appear in the result.

    Parameters
    -------------

     dataframe: your data frame,
     dic: Dictionary whose keys are query strings selecting rows of the df
     and whose values are the names of the columns to explode for those rows.

    returns
    ----------

    A single DataFrame (not a list of frames): the exploded subsets stacked on
    top of each other, keeping the index labels of ``dataframe``. An empty
    ``dic`` gives an empty DataFrame with the columns of ``dataframe``.
    """

    if not dic:
        # pd.concat raises on an empty list, so return an empty frame instead.
        return dataframe.iloc[0:0].copy()

    exploded_parts = [
        dataframe.query(conditional).explode(variable_to_explote)
        for conditional, variable_to_explote in dic.items()
    ]
    return pd.concat(exploded_parts, axis=0)

In [None]:
# Explode each target column for the rows of its matching starting location.
ExploteAplication = explote_multiple(StartingPoints, ExploteDic)
ExploteAplication.shape

In [None]:
# Index the exploded rows by participant and starting location.
ExploteAplication = ExploteAplication.set_index(['ParticipantID', 'StartPointID'])

In [None]:
# Proof of concept: keep only the entries of one target column whose type is a
# plain int, i.e. the values produced by the explode step (the remaining rows
# presumably still hold the original lists).
b = pd.DataFrame( ExploteAplication['PointingTaskTargets_1'][ExploteAplication['PointingTaskTargets_1'].map(type)==int])
b.tail()

In [None]:
def integrate_explote(dataframe, list_variables):
    """Collapse the exploded target columns into a single ``Trials`` column.

    For each column in ``list_variables`` the entries whose type is exactly
    ``int`` are collected (other entries, e.g. lists, are ignored). The
    collected values are concatenated in the order of ``list_variables`` and
    stored in a new ``Trials`` column.

    Parameters
    -------------
        dataframe: data frame holding the exploded columns; it is modified in
            place (the ``Trials`` column is added to it)
        list_variables: names of the columns to collect the int values from

    returns
    ----------
        The same ``dataframe`` object, now with the ``Trials`` column.

    Notes
    -----
    The assignment aligns on the index, so the concatenated values must carry
    the same index labels as ``dataframe``.
    """
    collected = []
    for column in list_variables:
        # type(...) == int deliberately matches plain Python ints only.
        is_int = dataframe[column].map(type) == int
        collected.append(dataframe[column][is_int])

    # Name the combined Series explicitly. Building a DataFrame from a *named*
    # Series with columns=['Target'] gave an all-NaN column, which happened
    # whenever the concatenation kept a name (e.g. a single input column).
    targets = pd.concat(collected, axis=0).rename('Target')
    dataframe['Trials'] = targets
    return dataframe

In [None]:
# Add the combined `Trials` column. integrate_explote modifies its argument in
# place and returns it, so `Integrated` and `ExploteAplication` are the same object.
Integrated = integrate_explote(ExploteAplication, ColumnList)
Integrated.info()
print(Integrated.tail())

In [None]:
# Sort the rows by the index so that each participant's rows are contiguous.
# NOTE(review): the TrialID numbering assigned later is purely positional, so it
# depends on the row order this sort produces within each participant -- confirm
# that it matches the order in which the trials were presented.
Integrated.sort_index(level= ['ParticipantID'], inplace=True)
Integrated.head()

In [None]:
# Keep only the columns needed to identify each trial. The explicit copy makes
# `Targets` independent of `Integrated`, so adding columns to it later does not
# trigger SettingWithCopyWarning.
Targets = Integrated.loc[:, ['SubjectID', 'PointingTaskStartingLocations', 'Trials']].copy()
# info() prints its report itself and returns None, so it is not wrapped in print().
Targets.info()
# Last expression of the cell, so the preview is actually displayed.
Targets.head()

In [None]:
# `Final` is an alias of `Targets` (no copy is made), so the TrialID column added
# here also appears on `Targets`, which the next cell relies on.
# Trials are numbered 1..336 per subject by position. This assumes every subject
# has exactly 336 consecutive rows (presumably 28 starting locations x 12
# targets -- TODO confirm); the assignment fails if the total row count differs.
Final = Targets
Final['TrialID']= np.tile(np.arange(1,337,1), len(Final.SubjectID.unique()))
Final.head()

In [None]:
# Turn the index levels back into ordinary columns so the frame can be merged
# on plain column names.
FinalIds = Targets.reset_index()
FinalIds.head()

# Reading the participants data from the pointingTask files

## Loading performance data

In [None]:
# Glob pattern for the performance data files, relative to the working directory.
# The loader below slices filename[6:10] for the subject ID, which lines up with
# the 6-character "Final/" prefix -- keep both in sync if the folder is renamed.
path = "Final/*.json"

In [None]:
# Load every performance file matched by `path` into one DataFrame.
subject_frames = []

# read every file name in folder
for filename in glob.glob(path):
    with open(filename, 'r') as file:
        # make json files parsable: wrap the content in brackets and drop the
        # last two characters (presumably a trailing comma and newline -- TODO confirm)
        data = "[" + file.read()
        data = data[:len(data)-2] + "]"

    # read data per file; StringIO because passing a literal JSON string to
    # read_json is deprecated in recent pandas versions
    subjectdf = pd.read_json(StringIO(data))

    # insert participant id in every line (characters 6..9 of the file path);
    # pandas broadcasts the scalar to all rows
    subjectdf.insert(0, "SubjectID", int(filename[6:10]), True)
    subject_frames.append(subjectdf)

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration; concatenate once instead. No files still gives an empty frame.
df = pd.concat(subject_frames, ignore_index=True) if subject_frames else pd.DataFrame()
# Show a preview rather than dumping the whole frame into the notebook output.
df.head()

In [None]:
## Data Wrangling

### Avatar Categorical
In this section the variable `'ImageName'` will be disaggregated into categorical variables that provide information about the presence and nature of avatars in the trial.

  - `VariableExtraction` function that creates a new variable that contains matching data from a preexisting variable
  - `'AvatarPresence'` variable that shows if the trial image contained an Avatar
  - `'AvatarCategory'` variable that shows if the trial asked to point to a location that originally had an Action or Standing Avatar
  - `'avatarID'` variable that shows the ID of the Avatar placed on the location to which the participant has to point (matched against the list `avatarIDsf`)

In [None]:
def VariableExtraction (df,yourList, variable,newVariable):

    """ Fill a new column with the element of ``yourList`` found inside an existing column.

    Each value of ``df[variable]`` is searched with ``element in value``. When
    several elements match, the one that comes last in ``yourList`` is stored.
    Rows without any match keep their previous value in ``newVariable`` (NaN if
    the column is new). If no row matches at all, the column is not created.

    Parameters
    -------------

        df: your data frame; it is modified in place
        yourList: List of values you want to target for the partial match search
        variable: Variable that may contain the elements on yourlist
        newVariable: Name for  New Variable

    returns
    ----------
        DataFrame: the same ``df`` object, for convenience
    """

    # Scanning the candidates back to front and stopping at the first hit gives
    # the same answer as trying all of them and keeping the last hit.
    candidates = list(yourList)[::-1]

    def _last_match(text):
        for candidate in candidates:
            if candidate in text:
                return candidate
        return None

    matches = [_last_match(text) for text in df[variable]]
    if all(match is None for match in matches):
        return df

    if newVariable in df.columns:
        previous = df[newVariable].tolist()
    else:
        previous = [np.nan] * len(matches)

    # Assign the whole column once, by position, instead of writing cell by cell
    # through .loc (slow, and wrong when index labels repeat).
    df[newVariable] = [
        old if match is None else match
        for match, old in zip(matches, previous)
    ]
    return df

## Extracting the Avatar number from the variable 'ImageName'

In [None]:
# For every image name, collect all runs of digits it contains (one list per row).
AvaIDs = [re.findall(r'\d+', image_name) for image_name in df['ImageName']]
# Flatten the per-row lists into one unified list of number strings.
avatarIDsf = [number for row_numbers in AvaIDs for number in row_numbers]

In [None]:
# Search each "ImageName" for the numbers in the list "avatarIDsf" and store the
# match in a new variable "avatarID".
# NOTE(review): matching is by substring and the last matching list entry wins,
# so a short ID such as '1' also matches inside '12' -- confirm the resulting
# IDs are the intended ones.
VariableExtraction(df,avatarIDsf,'ImageName','avatarID')
df.head()

In [None]:
# NOTE: despite its name, 'AvatarPresence' is True when the image name contains
# 'No'; the category column spells the meaning out.
df['AvatarPresence'] = df['ImageName'].str.contains('No')
presence_labels = {True: 'NoAvatar', False: 'Avatar'}
df['AvatarPresenceCategory'] = df['AvatarPresence'].map(presence_labels)
df.head()

In [None]:
# Sanity check: value counts for Avatar presence [should be 50/50]
df.AvatarPresenceCategory.value_counts()

In [None]:
# Flag trials whose image name contains 'CmA' and add a readable label for the flag.
df['meaningful'] = df['ImageName'].str.contains('CmA')
meaningful_labels = {True: 'Meaningful', False: 'Not meaningful'}
df['meaningfulBuilding'] = df['meaningful'].map(meaningful_labels)
df.head()

In [None]:
# Sanity check: value counts for Meaningful vs. Not meaningful buildings [should be 50/50]
df.meaningfulBuilding.value_counts()

### Unnesting variables

In [None]:
def unnest(dframe):

    """Unnest the nested coordinate records of the pointing-task data.

    Each listed source column holds one record per row with the keys ``x``,
    ``y`` and ``z``. For every source column three new columns
    ``<prefix>_x``, ``<prefix>_y``, ``<prefix>_z`` are inserted at a fixed
    column position. The source columns themselves are kept.

    Parameters
    -------------
        dframe: your data frame; it is modified in place and must have enough
            columns for the fixed insertion positions (at least 25)

    returns
    ----------
        DataFrame: the same ``dframe`` object with 24 additional columns

    Notes
    -----
    Columns are inserted with ``allow_duplicates=True``, so calling the
    function twice on the same frame adds a second copy of every column.
    """

    # (source column, prefix of the new columns, insertion position).
    # Processed from the highest position down, so every position refers to
    # the column layout of the frame as it was passed in.
    # The columns unpacked from 'TargetBuildingForward' are named
    # 'TargetBuildingDirection_*'.
    plan = (
        ('TargetBuildingForward', 'TargetBuildingDirection', 25),
        ('TargetBuildingRotation', 'TargetBuildingRotation', 24),
        ('TargetBuildingPosition', 'TargetBuildingPosition', 23),
        ('PointerDirection', 'PointerDirection', 21),
        ('PointerRotation', 'PointerRotation', 19),
        ('PointerPosition', 'PointerPosition', 18),
        ('ParticipantRotation', 'ParticipantRotation', 17),
        ('ParticipantPosition', 'ParticipantPosition', 16),
    )

    for source, prefix, position in plan:
        coordinates = pd.DataFrame.from_records(dframe[source].tolist())
        # Inserting z, then y, then x at the same position leaves them in
        # x, y, z order. Plain arrays are inserted so the values are matched by
        # row position and stay correct when the index is not 0..n-1.
        for axis in ('z', 'y', 'x'):
            dframe.insert(position, prefix + '_' + axis, coordinates[axis].to_numpy(), True)

    return dframe

In [None]:
# Add the x/y/z coordinate columns. unnest modifies `df` in place and inserts
# with allow_duplicates=True, so running this cell twice duplicates those columns.
df = unnest(df)
df.head()

In [None]:
# Save the unnested performance data in the working directory
# (the row index is written as the first column).
df.to_csv('PointingTask.csv')

In [None]:
# Read back the file written by the previous cell. The relative path matches the
# to_csv call and no longer depends on one user's home folder.
PointingTask = pd.read_csv('PointingTask.csv')

In [None]:
### Merging the two data frames (Data + Ids)

In [None]:
# Attach the randomization IDs to the performance data. how='right' keeps every
# row of FinalIds, including trials without a matching performance record.
AnglesMerged = df.merge(FinalIds, how='right',
                        left_on=['SubjectID', 'TrialNumber'],
                        right_on=['SubjectID', 'TrialID'])
# Show the merge result (the cell previously displayed FinalIds again instead).
AnglesMerged.head()

In [None]:
# List the columns available after the merge.
AnglesMerged.columns

In [None]:
# Remove the columns that are not needed further on
AnglesMerged = AnglesMerged.drop(
    columns=[
        'TriggerPressedFreq',
        'CancelPressedFreq',
        'PhotoToCenterFreq',
        'TriggerPressedTimeStamps',
        'CancelPressedTimeStamps',
        'PhotoToCenterTimeStamps',
        'PhotoReleasedTimeStamps',
        'ParticipantPosition',
        'PointerPosition',
    ]
)

In [None]:
# Creating trial time duration: parse both timestamps as datetimes, then
# subtract them to get the duration of each trial as a timedelta.
AnglesMerged['TimeStampEnd'] = pd.to_datetime(AnglesMerged['TimeStampEnd'])
AnglesMerged['TimeStampBegin'] = pd.to_datetime(AnglesMerged['TimeStampBegin'])
AnglesMerged['TimeDelta'] = AnglesMerged['TimeStampEnd'] - AnglesMerged['TimeStampBegin']

Checking whether the columns `Trials` and `ImageIndexNumber` coincide; in theory both of them hold the building's reference number.

In [None]:
# `c` is an alias of AnglesMerged (no copy), so 'Equals' is added to it as well.
# 1 where `Trials` equals `ImageIndexNumber`, 0 otherwise.
c = AnglesMerged
c['Equals'] = (c['Trials'] == c['ImageIndexNumber']).astype(int)

In [None]:
# Rows where the two columns disagree (a missing value on either side also counts as 0).
d = c[c['Equals'] == 0 ]
d.head()

In [None]:
# Column overview before selecting the final set of variables.
AnglesMerged.columns

In [None]:
# Keep only the variables needed for the analysis, in the order listed.
final = AnglesMerged[
    [
        'SubjectID',
        'ParticipantPosition_x', 'ParticipantPosition_z',
        'PointerPosition_x', 'PointerPosition_z',
        'PointerDirection_x', 'PointerDirection_z', 'PointerDirection_y',
        'TargetBuildingPosition_x', 'TargetBuildingPosition_z',
        'DistanceToParticipant',
        'avatarID', 'AvatarPresence', 'AvatarPresenceCategory',
        'meaningful', 'meaningfulBuilding',
        'StartPointID', 'PointingTaskStartingLocations',
        'Trials', 'TrialID', 'ImageIndexNumber', 'ImageName',
        'TimeDelta', 'RT', 'TimeOut', 'TimeStampBegin', 'TimeStampEnd',
        'Angle',
    ]
]
final.head()