In [1]:
# Import the required packages

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ast, os
import bambi as bmb
import pymc as pm
import arviz as az
import scipy.stats as stat
from collections import Counter
import itertools


In [2]:
# Set up plotting themes.
# Both calls configure seaborn globally, so they affect every figure drawn later in the notebook:
# the 'poster' context and the 'darkgrid' style are seaborn presets selected by name.
sns.set_context('poster')
sns.set_style('darkgrid')

# Function to clean the data files

#### For each data file this function does the following:
1. Drops irrelevant columns
2. Computes accuracies (accurate if all correct keys pressed on the first try)
3. Label Node types: Boundary vs non boundary
4. Label transition types: Cross cluster or within cluster
5. Label other relevant conditions like walk length, number of keypresses, etc.

In [3]:
def clean_data(filename, data_dir='data/', min_trials=1400):
    """Load one raw data file and return a tidy, per-trial dataframe.

    Processing steps:
      1. Read ``filename`` from ``data_dir``.
      2. Drop rows with no value in 'blocks.thisRepN' (instruction rows).
      3. Reject the file if fewer than ``min_trials`` rows have a 'node idx'.
      4. Add 'rt': the mean of the key-press RTs listed in 'key_resp.rt' when
         the trial's 'accuracy' is truthy, otherwise NaN.
      5. Add 'transition_type': 'cross cluster' when both the current row and
         the previous row have 'node type' == 'boundary', else 'within cluster'.
         The first row has no previous row and is therefore 'within cluster'.
      6. Add 'condition' from the participant number modulo 3
         (0 -> 'random', 1 -> 'music random', anything else -> 'structured').
      7. Add 'trial' (0-based row counter) and 'num_keypress' (number of keys
         listed in 'stim').

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``data_dir``.
    data_dir : str, optional
        Folder containing the raw files. Defaults to 'data/'.
    min_trials : int, optional
        Minimum number of rows with a non-missing 'node idx' for the file to
        count as complete. Defaults to 1400.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned trials with the columns 'participant', 'trial',
        'blocks.thisRepN', 'accuracy', 'condition', 'node type',
        'transition_type', 'rt', 'stim' and 'num_keypress'; or ``None`` when
        the file cannot be read, is incomplete, or cannot be labelled. A short
        message naming the file and the reason is printed in that case.

    Raises
    ------
    Exception
        Errors raised while parsing 'stim' for 'num_keypress' are not caught
        and propagate to the caller.
    """
    try:
        data = pd.read_csv(os.path.join(data_dir, filename))

        # Drop instruction rows by dropping rows with missing data in column: 'blocks.thisRepN'
        data = data.dropna(subset=['blocks.thisRepN']).reset_index(drop=True)

        # If the data file is incomplete, bail out (handled by the except below).
        if data['node idx'].notna().sum() < min_trials:
            raise ValueError('Incomplete Data')

        # RT is the average RT of all keys pressed; only defined for accurate trials.
        # np.nan (not the np.NaN alias, which NumPy 2.0 removed) marks the rest.
        data['rt'] = [
            np.mean(ast.literal_eval(key_rts)) if is_accurate else np.nan
            for key_rts, is_accurate in zip(data['key_resp.rt'], data['accuracy'])
        ]

        # Transition type is cross cluster if it goes from boundary to boundary.
        # Computed once for the whole column instead of once per row.
        on_boundary = data['node type'] == 'boundary'
        came_from_boundary = data['node type'].shift() == 'boundary'
        data['transition_type'] = np.where(
            on_boundary & came_from_boundary, 'cross cluster', 'within cluster'
        )

        # Label conditions based on participant number as was designed in the experiment
        remainder = data['participant'].iloc[0] % 3
        if remainder == 0:
            data['condition'] = 'random'
        elif remainder == 1:
            data['condition'] = 'music random'
        else:
            data['condition'] = 'structured'

        data['trial'] = np.arange(len(data))

    except Exception as err:
        # Best-effort loading: an unusable file is skipped, but say so instead of
        # dropping it silently. Exception (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit through.
        print(f'Skipping {filename}: {err!r}')
        return None

    # Count the number of keys to be pressed for each stimulus
    data['num_keypress'] = [len(ast.literal_eval(stim)) for stim in data['stim']]

    # Return the dataframe with relevant columns
    return data[['participant', 'trial', 'blocks.thisRepN', 'accuracy', 'condition', 'node type', 'transition_type', 'rt', 'stim', 'num_keypress']]


# Read the data files

In [4]:
# Reads in all the NAMES (not the contents) of the data files from the 'data' folder.
# Only entries whose name starts with '240' and ends with 'csv' are kept.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep the
# row order of the combined dataframe reproducible across machines and re-runs.
data_files = sorted(
    name for name in os.listdir('data/')
    if name.startswith('240') and name.endswith('csv')
)

# Sanity Check 1 
Print out the data_files variable and test if correct files are being loaded

# Cleaning the data files into a single large dataframe

1. Label Trials, conditions, etc. 
2. Combine relevant columns of extracted data files 
3. Specify outliers and remove them


In [18]:
# Clean every data file. clean_data returns None for files it could not process, so
# those are filtered out explicitly instead of relying on pd.concat to skip them.
cleaned_frames = [frame for frame in map(clean_data, data_files) if frame is not None]
if not cleaned_frames:
    raise ValueError("No usable data files were found in 'data/'; nothing to combine.")
df_clean = pd.concat(cleaned_frames).reset_index(drop=True)

# Remove RT outliers: keep trials whose RT is within RT_ZSCORE_CUTOFF standard deviations
# of the mean RT, computed over all trials of all participants pooled together.
# NOTE: trials with a missing RT (clean_data sets rt to NaN when 'accuracy' is falsy) get a
# NaN z-score, and NaN < cutoff is False, so those trials are removed here as well.
# Use df_clean, not df_clean_rt_outlier, for anything that needs inaccurate trials.
RT_ZSCORE_CUTOFF = 3
rt_zscores = np.abs(stat.zscore(df_clean['rt'], nan_policy='omit'))
df_clean_rt_outlier = df_clean[rt_zscores < RT_ZSCORE_CUTOFF]

# Sanity Check 2
Print out the dataframe df_clean_rt_outlier to see if it looks as expected. 

Take a note of the relevant columns

# Plot RTs
The dataframe that is created above (df_clean_rt_outlier) removes outliers where people may have responded too quickly or too slowly. This is the dataframe we want to work with. Let's start by plotting first. 

Plot a graph using python's seaborn package such that: 
1. The Y axis is the average response time
2. The X axis is the block number
3. The graph has three 'column' panels corresponding to the three conditions.


(Hint: Use the seaborn catplot function)

# Plot Accuracies
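That dataframe also has a column called 'accuracy'. Note, however, that 'rt' is missing (NaN) for inaccurate trials, so the outlier filter above drops those trials and the average accuracy in df_clean_rt_outlier is trivially 1; use df_clean (the dataframe before outlier removal) for this plot. Use the 'accuracy' column to plot a graph using python's seaborn package such that: 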
1. The Y axis is the average accuracy
2. The X axis is the block number
3. The graph has three 'column' panels corresponding to the three conditions. 