In [1]:
import pandas as pd
import os
import time

# Directory that is scanned for the input CSV files processed by this script.
# NOTE(review): hard-coded, machine-specific absolute path — adjust before running elsewhere.
file_path = "C:/Users/ToddNguyen2/Documents/Research/collapsed"

In [None]:
def progress_bar(current, total, elapsed_time, char_to_use="#"):
    """
    Return a one-line text progress bar with the percentage done and the elapsed time.

    # ARGUMENTS
    current -> Number of items completed so far
    total -> Total number of items. A value of 0 or less is reported as 0% progress
    elapsed_time -> Seconds elapsed so far, printed with two decimals
    char_to_use -> String used for each filled slot of the bar. Defaults to "#"

    # RETURN
    A string of the form "[<bar>] <percent>, <seconds> seconds elapsed", where the
    bar always has 30 slots (filled slots first, then spaces)
    """
    max_hashtag = 30

    # Guard against an empty workload so the caller never hits ZeroDivisionError.
    fraction = current / total if total > 0 else 0.0

    # Clamp so the bar keeps a fixed width even if current is negative or exceeds total.
    num_hashtags = max(0, min(max_hashtag, int(fraction * max_hashtag)))
    bar = char_to_use * num_hashtags + " " * (max_hashtag - num_hashtags)

    return "[{}] {:.2%}, {:.2f} seconds elapsed".format(bar, fraction, elapsed_time)


def get_stats(pd_col, round_to_digits=4):
    """
    Summarise a pandas column as its total, mean, and standard deviation.

    # ARGUMENTS
    pd_col -> The pandas column (Series) to summarise
    round_to_digits -> Number of digits after the decimal point kept for the
                       mean and standard deviation. Defaults to 4

    # RETURN
    A list [sum, average, standard_deviation]. The sum is returned as computed
    (not rounded); the average and standard deviation are rounded.
    """
    total = pd_col.sum()
    mean_value = round(pd_col.mean(), round_to_digits)
    std_value = round(pd_col.std(), round_to_digits)
    return [total, mean_value, std_value]


def get_df_no_pretask(pandas_df, col_to_obtain):
    """
    Return the rows of pandas_df that are NOT pretask, restricted to the requested columns.

    A row counts as pretask when its 'Current_task' value equals the string 'pretask'.

    # ARGUMENTS
    pandas_df -> The pandas dataframe; must contain a 'Current_task' column
    col_to_obtain -> Columns to keep in the result, in the order given

    # RETURN
    A new pandas dataframe that does not include pretask rows, only includes the
    columns in col_to_obtain, and is re-indexed from 0. The input dataframe is not
    modified.
    """
    # One vectorised selection instead of appending row by row: this avoids the
    # repeated-enlargement cost and keeps each column's original dtype.
    not_pretask = pandas_df['Current_task'] != 'pretask'
    new_df = pandas_df.loc[not_pretask, list(col_to_obtain)]

    # Callers get a fresh 0..n-1 index, the same as the row-by-row construction produced.
    return new_df.reset_index(drop=True)
    

# Columns of each input CSV that are summarised (sum, average, standard deviation).
columns_to_obtain = ['FixationDuration', 'Saccade_length', 'Saccade_absolute_angle', 'Saccade_relative_angle']

# Header of the output CSV: the participant id followed by three statistics per
# summarised column, in the same order as columns_to_obtain.
# NOTE(review): the "(sec)" suffix is also applied to the saccade length/angle
# columns — confirm the intended units before relying on these names.
output_columns = ['ParticipantID',
                  'FixationDurationSum', 'FixationDurationAverage(sec)', 'FixationDurationStdDev(sec)',
                  'Saccade_length_sum', 'Saccade_length_average_(sec)', 'Saccade_length_stddev_(sec)',
                  'Saccade_absolute_angle_sum', 'Saccade_absolute_angle_average_(sec)', 'Saccade_absolute_angle_stddev_(sec)',
                  'Saccade_relative_angle_sum', 'Saccade_relative_angle_average_(sec)', 'Saccade_relative_angle_stddev_(sec)']

# List the directory once and keep only the CSV files, so the progress total
# matches the number of files that are actually processed.
csv_files = [name for name in os.listdir(file_path) if name.endswith(".csv")]

stats_rows = []
count = 0
total_count = len(csv_files)
starttime = time.time()


for file in csv_files:
    temp_file_path = os.path.join(file_path, file)
    temp_df = pd.read_csv(temp_file_path, index_col=False)

    # The participant id is the part of the file name before the first underscore.
    participant_id = file.split("_")[0]
    output_list = [participant_id]

    # Get the dataframe with no pretask
    no_pretask_df = get_df_no_pretask(pandas_df=temp_df, col_to_obtain=columns_to_obtain)

    # Append [sum, average, stddev] for every summarised column, in order.
    for col in columns_to_obtain:
        output_list.extend(get_stats(no_pretask_df[col]))

    stats_rows.append(output_list)

    count += 1
    elapsedtime = time.time() - starttime
    progress = progress_bar(count, total_count, elapsedtime, char_to_use="=")
    # "\r" returns to the start of the line so the bar is redrawn in place.
    print(progress, end="\r")

# Move past the progress-bar line.
print("")

# Build the result table once from the collected rows, then write it out.
stats_df = pd.DataFrame(stats_rows, columns=output_columns)
stats_df.to_csv("stats_output.csv", index=False)

