This script cleans the original event files by keeping only the columns that are needed for fine-tuning.

In [11]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from moviepy.editor import VideoFileClip

## 1. Creating a master dataset by combining all event files

In [20]:
# Root directory that holds one sub-directory per session (ses-003, ses-004, ...)
base_path = '../data/raw_data/sub-01'

# Session numbers to scan. `end_session` is EXCLUSIVE because it is passed
# straight to range(): 3..13 are visited, i.e. ses-003 through ses-013.
start_session = 3  # ses-003
end_session = 14  # one past ses-013

# Collect, per session, the directory path and the names of its .tsv files
event_files_all = []
for session in range(start_session, end_session):
    session_str = f'ses-{session:03d}'
    session_path = os.path.join(base_path, session_str)

    # os.listdir() returns names in arbitrary order; sort them so the combined
    # dataset has the same row order on every machine / file system.
    event_files = sorted(
        f for f in os.listdir(session_path) if f.endswith('.tsv')
    )

    # Keep the directory together with its files so they can be re-joined later
    event_files_all.append((session_path, event_files))

# Read every event file, tag it with its run, and combine everything into one
# master dataframe (`df_all`).
event_frames = []
for session_path, event_files in event_files_all:
    for event_file in event_files:
        df = pd.read_csv(os.path.join(session_path, event_file), sep='\t')

        # The run identifier is the 4th underscore-separated token of the
        # file name (index 3).
        run_number = event_file.split('_')[3]

        # Add the 'run' column immediately before the 'level' column
        level_col_index = df.columns.get_loc('level')
        df.insert(level_col_index, 'run', run_number)

        event_frames.append(df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# whole accumulated frame for every file. pd.concat raises on an empty list,
# so fall back to an empty frame when no event file was found.
if event_frames:
    df_all = pd.concat(event_frames, ignore_index=True)
else:
    df_all = pd.DataFrame()

# Save the new dataframe to a CSV file   # <-- WARNING: big file
#output_csv_path = '../data/datasets/combined_events.csv'
#df_all.to_csv(output_csv_path, index=False)


## 2. Cleaning the master dataset 
(e.g., select level 1 only, remove unnecessary columns, etc.)

In [21]:
# Keep only the rows whose 'level' value is exactly 'level-1'
df_level = df_all.loc[df_all['level'] == 'level-1']

# Columns carried over into the cleaned dataset (original order preserved)
kept_columns = [
    'trial_type',
    'frame_idx',       # Frame number (renamed to 'frame' below)
    'onset',           # Onset time in seconds
    'subject',         # Subject ID
    'session',         # Session number
    'run',             # Run number
    'level',           # Level number
    'repetition',      # Repetition number
    'X_screen_total',
    'X_screen',        # X coordinate of the screen
    'instantScore',    # Instant score (TODO: confirm meaning)
    'health',          # Health points
    'status',          # TODO: confirm meaning
    'lives',           # Number of lives left
    'score',           # Current score
    'shurikens',
]  # TODO: decide whether more columns are needed

# Select the columns, rename 'frame_idx' to 'frame', then discard every row
# whose 'trial_type' is 'gym-retro_game'
df_level_clean = (
    df_level[kept_columns]
    .rename(columns={'frame_idx': 'frame'})
    .loc[lambda frame: frame['trial_type'] != 'gym-retro_game']
)

# List of trial types to process
trial_types = ['RIGHT',
               'LEFT',
               'UP',
               'DOWN',
               'JUMP',
               'HIT',
               'Kill',
               'HealthGain',
               'HealthLoss']
               # <-- Maybe more events in higher levels

# For each trial type, in list order: copy the label onto the row that directly
# follows every row of that type, then delete the original rows of that type.
#
# This is done by POSITION, not by index label. The frame still carries the
# index labels of `df_all` with gaps from the earlier filters, so "label + 1"
# is not "the next row": comparing a label with len() drops rows without
# propagating their label, and assigning to a missing label appends a new,
# mostly-NaN row.
labels = df_level_clean['trial_type'].reset_index(drop=True)  # index == position
for trial_type in trial_types:
    is_event = labels.eq(trial_type)

    # True on the row right after an event row; shift() is positional, and the
    # first row can never follow anything (fill_value=False). An event on the
    # very last row has no successor and is simply dropped.
    follows_event = is_event.shift(1, fill_value=False)

    # Relabel the successors, then drop only the rows that originally carried
    # this trial type. Later trial types then see the remaining rows as
    # neighbours.
    labels = labels.mask(follows_event, trial_type)[~is_event]

# `labels.index` holds the original positions of the surviving rows
df_level_clean = df_level_clean.iloc[labels.index.to_numpy()].copy()
df_level_clean['trial_type'] = labels.to_numpy()

# Order rows by subject > session > run > level > repetition
df_level_clean = df_level_clean.sort_values(
    by=['subject', 'session', 'run', 'level', 'repetition']
)

# Create the 'onset_vid' column right after the 'onset' column. The position is
# derived from 'onset' instead of being hard-coded, and the placeholder is a
# float (0.0): the column is later filled with fractional seconds, and writing
# floats into an integer column is a deprecated silent upcast in recent pandas.
onset_vid_position = df_level_clean.columns.get_loc('onset') + 1
df_level_clean.insert(onset_vid_position, 'onset_vid', 0.0)

# save the new dataframe to a CSV file
#output_csv_path = '../data/datasets/combined_events_level-1_clean.csv'
#df_level_clean.to_csv(output_csv_path, index=False)

## 3. Adding the time onsets from videos

In [24]:
video_base_path = '../output/videos/videos_full'
output_csv_path = '../data/datasets/combined_events_level-1_clean_outset_vid.csv'

# 'onset_vid' receives fractional seconds below; make sure it is a float column
# so the assignment does not rely on a deprecated silent int -> float upcast.
df_level_clean['onset_vid'] = df_level_clean['onset_vid'].astype(float)

# For every .mp4 in the video directory, compute 'onset_vid' = frame / fps for
# the dataframe rows that belong to that video.
for video_file in os.listdir(video_base_path):
    if not video_file.endswith('.mp4'):
        continue

    # The file name must consist of exactly six underscore-separated tokens:
    # subject, session, <ignored>, run, level, repetition(+extension).
    # Any other shape raises ValueError here, as before.
    subject, session, _, run, level, repetition = video_file.split('_')
    repetition = repetition.split('.')[0]

    # Rows corresponding to the current video.
    # NOTE(review): the file-name tokens are strings; this only matches if the
    # dataframe columns hold the same string values -- confirm for 'repetition'.
    in_video = (
        (df_level_clean['subject'] == subject) &
        (df_level_clean['session'] == session) &
        (df_level_clean['run'] == run) &
        (df_level_clean['level'] == level) &
        (df_level_clean['repetition'] == repetition)
    )
    if not in_video.any():
        # Nothing to update: do not bother opening the video
        continue

    # Only the frame rate is needed; always close the clip so its reader is
    # released instead of accumulating one open reader per video.
    video = VideoFileClip(os.path.join(video_base_path, video_file))
    try:
        fps = video.fps
    finally:
        video.close()

    # Onset within the video, in seconds, rounded to milliseconds
    df_level_clean.loc[in_video, 'onset_vid'] = (
        df_level_clean.loc[in_video, 'frame'] / fps
    ).round(3)

# Save the updated dataframe to a CSV file
df_level_clean.to_csv(output_csv_path, index=False)