In [1]:
from torch.utils.data import Dataset
import os
import logging
import traceback
import shutil
import zipfile
import random
import re
import numpy as np
import contextlib
import mne

In [2]:
import logging

# Configure the root logger at DEBUG so every logger.debug/info/warning call made by the
# helper functions in this notebook shows up in the cell output.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger shared by the functions defined in the cells below.
logger = logging.getLogger(__name__)

### Downloading the dataset

In [None]:
# Directory that the download / unzip / arrange / split helpers in this notebook operate on.
dataset_path = os.path.join('data', 'openfmri')
# NOTE: every URL below is commented out, so this list is currently empty.
# Uncomment the archives you need before passing the list to download_dataset.
default_download_urls = [  # Links to normalized data of all participants
    # "https://s3.amazonaws.com/openneuro/ds000117/ds000117_R1.0.0/compressed/ds000117_R1.0.0_derivatives_sub01-04.zip",
    # "https://s3.amazonaws.com/openneuro/ds000117/ds000117_R1.0.0/compressed/ds000117_R1.0.0_derivatives_sub05-08.zip",
    # "https://s3.amazonaws.com/openneuro/ds000117/ds000117_R1.0.0/compressed/ds000117_R1.0.0_derivatives_sub09-12.zip",
    # "https://s3.amazonaws.com/openneuro/ds000117/ds000117_R1.0.0/compressed/ds000117_R1.0.0_derivatives_sub13-16.zip"
]

def download_dataset(dataset_path, download_urls):
    """
    Downloads the dataset archives from the specified URLs into the given dataset path.

    Each URL is saved under its last path component. A file that is already present is not
    downloaded again. Downloads are written to a temporary ``<name>.part`` file and only
    renamed to the final name when ``wget`` exits successfully, so an interrupted or failed
    download never leaves a truncated archive that a later run would mistake for a complete one.
    Once every expected archive exists, a ``.downloaded`` marker file is written and later
    calls return immediately.

    Parameters:
        dataset_path (str): The directory path where the dataset will be downloaded.
        download_urls (list): A list of URLs from which to download the dataset.

    Returns:
        None
    """
    # Local import: keeps this cell self-contained without touching the notebook's import cell.
    import subprocess

    os.makedirs(dataset_path, exist_ok=True)

    downloaded_marker = os.path.join(dataset_path, '.downloaded')
    if os.path.exists(downloaded_marker):
        logger.info("Dataset already downloaded. Skipping download.")
        return

    logger.info(f"Downloading {len(download_urls)} files...")
    expected_files = []
    for url in download_urls:
        file_name = os.path.join(dataset_path, url.split('/')[-1])
        expected_files.append(file_name)

        if os.path.exists(file_name):
            logger.info(f"{file_name} already exists, skipping download.")
            continue

        logger.info(f"Downloading {file_name}...")
        partial_name = file_name + '.part'
        try:
            # Argument list instead of a shell string: no quoting problems with odd URLs or
            # paths, and check=True turns a non-zero wget exit status into an exception.
            subprocess.run(['wget', '-O', partial_name, url], check=True)
            os.replace(partial_name, file_name)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError also covers the case where the wget binary is not installed.
            logger.error(f"Error downloading file {file_name}, {e}")
            traceback.print_exc()
            if os.path.exists(partial_name):
                os.remove(partial_name)

    # Only the files we were asked for count towards completion; unrelated archives that
    # happen to live in the same directory must neither block nor fake the marker.
    missing_files = [f for f in expected_files if not os.path.exists(f)]
    if missing_files:
        logger.warning(f"{len(missing_files)} file(s) are still missing: {missing_files}")
        return

    with open(downloaded_marker, 'w') as f:
        f.write('Download completed successfully.')
    logger.info(f"Successfully downloaded {len(expected_files)} files.")

In [None]:
# Uncomment to fetch the archives (first enable the URLs in default_download_urls above):
# download_dataset(dataset_path, default_download_urls)

### Unzipping & Renaming the dataset

In [21]:
from tqdm import tqdm

def unzip_and_rename_in_folder(folder, remove=False):
    """
    Unzips all zip files in the specified folder and renames the extracted entries.

    Every archive is extracted directly into ``folder``, so archives that share a top-level
    directory name are merged into one tree. Afterwards each extracted top-level entry is
    renamed to ``folder_<i>``, numbered in sorted name order so the result is reproducible.
    A ``.unzipped`` marker file is written only when every archive was extracted without
    error; if any extraction fails, the error is logged, the remaining archives are still
    processed, and the marker is left out so the failure is not silently recorded as success.

    Parameters:
        folder (str): The path to the folder containing zip files to be unzipped.
        remove (bool): If True, the zip files will be deleted after extraction.

    Returns:
        None

    Raises:
        AssertionError: If the folder contains anything other than zip files and dot-files.
    """
    unzipped_marker = os.path.join(folder, '.unzipped')
    if os.path.exists(unzipped_marker):
        logger.info(f"Folder {folder} is already unzipped. Exiting early.")
        return

    entries = sorted(os.listdir(folder))
    assert all(f.endswith('.zip') or f.startswith('.') for f in entries), (
        f"Not all files in {folder} are zip files or ignored files. Please delete non-zip files and re-run."
    )
    zip_files = [f for f in entries if f.endswith('.zip')]
    logger.info(f"Unzipping and renaming {len(zip_files)} files in folder: {folder}")

    failed_archives = []
    with tqdm(total=len(zip_files), desc="Unzipping files", unit="file") as pbar:
        for zip_file in zip_files:
            zip_file_path = os.path.join(folder, zip_file)
            logger.info(f"Unzipping {zip_file_path}...")
            try:
                with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                    zip_ref.extractall(folder)
                if remove:
                    os.remove(zip_file_path)  # Remove the zip file after extraction
                    logger.info(f"Unzipped and removed {zip_file_path}")
                else:
                    logger.info(f"Unzipped {zip_file_path}")
            except Exception as e:
                # Best effort: keep going with the other archives, but remember the failure.
                failed_archives.append(zip_file_path)
                logger.error(f"Error unzipping {zip_file_path}, {e}")
                traceback.print_exc()
            pbar.update(1)  # Update the progress bar

    # Sorted so that the folder_<i> numbering does not depend on directory listing order.
    extracted_entries = sorted(
        f for f in os.listdir(folder) if not f.endswith('.zip') and not f.startswith('.')
    )
    for i, unzipped_folder in enumerate(extracted_entries):
        from_name = os.path.join(folder, unzipped_folder)
        to_name = os.path.join(folder, f'folder_{i}')
        os.rename(from_name, to_name)

    if failed_archives:
        logger.error(
            f"{len(failed_archives)} archive(s) failed to extract; not marking {folder} as unzipped."
        )
        return

    # Create the .unzipped marker file
    with open(unzipped_marker, 'w') as marker_file:
        marker_file.write('')

In [23]:
# Extract every archive found in dataset_path; the zip files are kept because remove defaults to False.
unzip_and_rename_in_folder(dataset_path)

INFO:__main__:Unzipping and renaming 1 files in folder: data/openfmri
Unzipping files:   0%|          | 0/1 [00:00<?, ?file/s]INFO:__main__:Unzipping data/openfmri/ds000117_R1.0.0_derivatives_sub05-08.zip...
INFO:__main__:Unzipped data/openfmri/ds000117_R1.0.0_derivatives_sub05-08.zip
Unzipping files: 100%|██████████| 1/1 [02:00<00:00, 120.68s/file]


### Arranging folders

In [32]:
def arrange_folders(dataset_folder):
    """
    Flattens the extracted dataset tree so participant data sits directly in the dataset folder.

    The work is done in three stages: participant directories are lifted out of
    'folder_0/derivatives/meg_derivatives', the contents of each participant's
    'ses-meg/meg' directory are pulled up into the participant directory, and finally
    'folder_0' is removed. A '.arranged' marker file records completion so that repeated
    calls return immediately.

    Parameters:
        dataset_folder (str): The path to the main dataset folder.

    Returns:
        None
    """
    marker_path = os.path.join(dataset_folder, '.arranged')
    if os.path.exists(marker_path):
        logger.info(f"Dataset folder {dataset_folder} is already arranged. Exiting early.")
        return

    extracted_root = os.path.join(dataset_folder, 'folder_0')
    source_folder = os.path.join(extracted_root, 'derivatives', 'meg_derivatives')
    if not os.path.exists(source_folder):
        logger.error(f"Source folder {source_folder} does not exist.")
        return

    def _participant_dirs(parent):
        # Names of the sub-directories of `parent` that look like participant folders ('sub-*').
        names = []
        for entry in os.listdir(parent):
            if os.path.isdir(os.path.join(parent, entry)) and entry.startswith('sub-'):
                names.append(entry)
        return names

    # Stage 1: lift every participant directory out of the extracted tree.
    for name in _participant_dirs(source_folder):
        source = os.path.join(source_folder, name)
        destination = os.path.join(dataset_folder, name)
        if not os.path.exists(destination):
            shutil.move(source, dataset_folder)
            logger.info(f"Moved {source} to {dataset_folder}.")
            continue
        logger.warning(f"Destination folder {destination} already exists. Skipping.")

    # Stage 2: pull the recordings up from '<participant>/ses-meg/meg' into '<participant>'.
    for name in _participant_dirs(dataset_folder):
        participant_path = os.path.join(dataset_folder, name)
        meg_path = os.path.join(participant_path, 'ses-meg', 'meg')
        if os.path.exists(meg_path):
            move_contents_to_parent_and_delete_sub(
                meg_path,
                participant_path,
                delete_empty_parent_dirs=True,  # also drop the then-empty 'ses-meg' directory
            )
        else:
            logger.warning(f"Expected directory {meg_path} does not exist. Skipping.")

    # Stage 3: the extracted tree is no longer needed.
    if os.path.exists(extracted_root):
        shutil.rmtree(extracted_root)
        logger.info(f"Removed unnecessary folder {extracted_root}.")

    # Record completion so the next call is a no-op.
    with open(marker_path, 'w') as marker_file:
        marker_file.write('')

    logger.info("Arrangement of folders completed successfully.")


def move_contents_to_parent_and_delete_sub(source_folder, destination_folder, delete_empty_parent_dirs=False):
    """
    Moves all contents from the source folder to the destination folder and deletes the source folder.
    Optionally deletes any empty parent directories of the source folder.

    Items whose name already exists in the destination are not moved; because the source
    folder is removed afterwards, such skipped items are deleted together with it.

    When ``delete_empty_parent_dirs`` is set, the walk up the directory tree stops at the
    destination folder, at the first non-empty directory, or at the filesystem root. Paths
    are normalized to absolute form before they are compared, so a trailing separator or a
    relative/absolute mismatch cannot make the walk run past the destination folder.

    Parameters:
        source_folder (str): The path of the source folder whose contents are to be moved.
        destination_folder (str): The path to the destination folder where contents will be moved.
        delete_empty_parent_dirs (bool): If True, delete empty parent directories after moving contents.

    Returns:
        None
    """
    if not os.path.exists(source_folder):
        logger.warning(f"Source folder {source_folder} does not exist. Skipping.")
        return

    # Move contents of source_folder to destination_folder
    for item in os.listdir(source_folder):
        source_item = os.path.join(source_folder, item)
        destination_item = os.path.join(destination_folder, item)
        if os.path.exists(destination_item):
            logger.warning(f"Destination item {destination_item} already exists. Skipping.")
            continue
        shutil.move(source_item, destination_folder)
        logger.info(f"Moved {source_item} to {destination_folder}.")

    # Delete the source folder (including any items skipped above)
    shutil.rmtree(source_folder)
    logger.info(f"Removed folder {source_folder}.")

    # Optionally delete empty parent directories
    if delete_empty_parent_dirs:
        # Compare normalized absolute paths: raw strings such as 'a/b/' and 'a/b' name the
        # same directory but never compare equal, which would let the loop delete the
        # destination folder itself.
        stop_path = os.path.abspath(destination_folder)
        current_path = os.path.dirname(os.path.abspath(source_folder))
        while current_path != stop_path:
            parent_path = os.path.dirname(current_path)
            # parent == current means the filesystem root has been reached.
            if parent_path == current_path or os.listdir(current_path):
                break
            os.rmdir(current_path)
            logger.info(f"Removed empty parent directory {current_path}.")
            current_path = parent_path

In [None]:
# Flatten the extracted tree: sub-* folders end up directly under dataset_path and folder_0 is removed.
arrange_folders(dataset_path)

In [None]:
def rename_dataset_files(dataset_path):
    """
    Renames the recording and log files of every participant to a short, uniform name.

    For each participant folder (sub-XX), files matching
    'sub-XX_ses-meg_task-facerecognition_run-YY_proc-tsss_(meg|log).(fif|txt)'
    are renamed to 'run_YY.<ext>', where <ext> is the file's own extension. The extension is
    taken from the file name itself rather than inferred from the 'meg'/'log' tag, so a text
    file is never given a '.fif' extension (or vice versa). Files that do not match the
    pattern are left untouched, and a file is not renamed when the target name already exists.

    Parameters:
        dataset_path (str): The path to the dataset folder containing participant folders.

    Returns:
        None
    """
    # Compiled once instead of once per file.
    file_pattern = re.compile(
        r'^sub-\d+_ses-meg_task-facerecognition_run-(\d{2})_proc-tsss_(meg|log)\.(fif|txt)$'
    )

    # Get all participant folders (directories starting with 'sub-')
    participant_folders = sorted(
        f for f in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, f)) and f.startswith('sub-')
    )

    for participant_folder in participant_folders:
        participant_path = os.path.join(dataset_path, participant_folder)

        # Sorted so that, if two files map to the same target, the winner is deterministic.
        for file_name in sorted(os.listdir(participant_path)):
            match = file_pattern.match(file_name)
            if not match:
                logger.debug(f"File {file_name} does not match the expected pattern. Skipping.")
                continue

            run_number = match.group(1)
            extension = match.group(3)
            new_file_name = f'run_{run_number}.{extension}'

            source = os.path.join(participant_path, file_name)
            destination = os.path.join(participant_path, new_file_name)

            # Never overwrite an existing file
            if os.path.exists(destination):
                logger.warning(f"Destination file {destination} already exists. Skipping renaming of {source}.")
                continue

            try:
                os.rename(source, destination)
                logger.info(f"Renamed {source} to {destination}.")
            except OSError as e:
                logger.error(f"Failed to rename {source} to {destination}: {e}")

In [None]:
# Shorten the long recording/log file names inside every sub-* folder to run_YY.fif / run_YY.txt.
rename_dataset_files(dataset_path)

### Randomize data

In [38]:
def randomize_subject_data(dataset_folder, train_percentage=70, val_percentage=20, seed=42):
    """
    Randomizes and splits participant data into training, validation, and test sets.

    Only directories whose name starts with 'sub-' are treated as participants, matching the
    convention used by the other helpers in this notebook. The 'train', 'val' and 'test'
    directories themselves (and any other non-participant directory) are therefore never
    moved, which makes the function safe to call again after a previous split. Participant
    names are sorted before shuffling so that a given seed yields the same split regardless
    of the order in which the filesystem lists directories.

    Split sizes are truncated to whole participants; whatever is not assigned to the
    training or validation set goes to the test set. A warning is logged when a split
    ends up empty.

    Parameters:
        dataset_folder (str): The path to the main dataset folder.
        train_percentage (int): The percentage of data allocated to the training set.
        val_percentage (int): The percentage of data allocated to the validation set.
        seed (int): Seed for the shuffle. Note that this reseeds the global ``random`` module.

    Returns:
        None
    """
    participant_folders = sorted(
        f for f in os.listdir(dataset_folder)
        if f.startswith('sub-') and os.path.isdir(os.path.join(dataset_folder, f))
    )
    random.seed(seed)
    random.shuffle(participant_folders)

    total_participants = len(participant_folders)
    train_count = int(total_participants * train_percentage / 100)
    val_count = int(total_participants * val_percentage / 100)

    train_folder = os.path.join(dataset_folder, 'train')
    val_folder = os.path.join(dataset_folder, 'val')
    test_folder = os.path.join(dataset_folder, 'test')

    for split_folder in (train_folder, val_folder, test_folder):
        os.makedirs(split_folder, exist_ok=True)

    for i, participant_folder in enumerate(participant_folders):
        participant_folder_path = os.path.join(dataset_folder, participant_folder)
        if i < train_count:
            shutil.move(participant_folder_path, train_folder)
        elif i < train_count + val_count:
            shutil.move(participant_folder_path, val_folder)
        else:
            shutil.move(participant_folder_path, test_folder)

    # With few participants the truncated counts can leave a split empty; surface that
    # instead of letting it go unnoticed until training time.
    for split_folder in (train_folder, val_folder, test_folder):
        if not os.listdir(split_folder):
            logging.getLogger(__name__).warning(
                f"Split folder {split_folder} is empty after splitting {total_participants} participant(s)."
            )

In [39]:
randomize_subject_data(dataset_path, train_percentage=70, val_percentage=20)
# NOTE: split sizes are truncated to whole subjects, so with only a few subjects train/, val/ or test/
# can end up empty. If that happens, move subject folders between the split directories by hand.