In [None]:
# import datalad.api as dl

# hcp_path = "/Users/Shared/hcp/human-connectome-project-openaccess/"
# ds = dl.Dataset(hcp_path)

# results = ds.status(annex='all')

# ds.get("HCP1200/116726")
# ds.get("HCP1200/118528")
# ds.get("HCP1200/118528/unprocessed")



# Make sure you have created HCP AWS credentials first!

In [None]:
import os
from pathlib import Path
import datalad.api as dl
from datalad.api import Dataset
from datetime import datetime
from contextlib import contextmanager
import logging

In [None]:
# Every log file written by this notebook goes under ./logs
os.makedirs("logs", exist_ok=True)

# Session-wide log: one timestamped file plus console output
# (this will catch setup_dataset logs)
base_log_filename = "logs/hcp_setup_{}.log".format(
    datetime.now().strftime('%Y%m%d_%H%M%S')
)
logging.basicConfig(
    handlers=[
        logging.FileHandler(base_log_filename),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger used for all messages emitted from this notebook
logger = logging.getLogger(__name__)

@contextmanager
def log_get_operation(subject, data_type, log_dir="logs"):
    """
    Context manager to create a separate log file for each ds.get() operation.

    While the ``with`` body runs, an extra INFO-level file handler is attached to
    the root logger, so records handled by the root logger are also written to a
    per-operation file. The handler is detached and closed on exit, whether or not
    the body raised.

    Args:
        subject: Subject ID (e.g., "116726")
        data_type: Type of data being downloaded (e.g., "T1w_MPR1", "tfMRI_MOTOR_LR").
            Every '/' is replaced by '_' before it is used in the file name and in
            the log messages, so a dataset-relative path can be passed as well.
        log_dir: Directory to save logs; created if it does not exist yet.

    Yields:
        str: Path of the log file created for this operation.

    Raises:
        Exception: Anything raised inside the ``with`` body is logged and re-raised.
    """
    data_type = data_type.replace('/', '_')

    # FileHandler opens the file immediately and does not create parent
    # directories, so a custom log_dir has to be created up front.
    os.makedirs(log_dir, exist_ok=True)

    # Create unique log filename (one-second resolution)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_filename = f"{log_dir}/get_{subject}_{data_type}_{timestamp}.log"

    # Create file handler for this specific operation
    file_handler = logging.FileHandler(log_filename)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )

    # Attach to the root logger so propagated records from any logger are captured
    root_logger = logging.getLogger()
    root_logger.addHandler(file_handler)

    try:
        # Log start of operation
        logger.info(f"Starting download: Subject {subject}, Data type {data_type}")
        logger.info(f"Operation log file: {log_filename}")
        yield log_filename
    except Exception as e:
        logger.error(f"Error during download: {e}")
        raise
    finally:
        # Logged on both success and failure; errors are reported separately above
        logger.info(f"Completed operation for Subject {subject}, Data type {data_type}")
        # Clean up - remove the file handler so later operations do not write here
        root_logger.removeHandler(file_handler)
        file_handler.close()

In [None]:
# Configuration
REPO_URL = "https://github.com/datalad-datasets/human-connectome-project-openaccess"
# LOCAL_DATASET_PATH = "/Users/zenkavi/hcp/"
# Where the dataset is cloned. Set the HCP_DATASET_PATH environment variable to
# relocate it without editing the notebook; otherwise the original location is used.
LOCAL_DATASET_PATH = os.environ.get(
    "HCP_DATASET_PATH", "/Users/zenkavi/Documents/EnkaviLab/data/hcp/"
)



In [None]:
def setup_dataset(repo_url, local_path):
    """
    Clone the HCP dataset, or update it when it is already installed.

    The decision is based on whether a dataset is actually installed at
    ``local_path``, not merely on whether the path exists, so a leftover empty
    directory triggers a clone instead of a doomed update.

    Args:
        repo_url: URL the dataset is cloned from.
        local_path: Local directory of the dataset.

    Returns:
        The dataset object: the existing one after an update attempt, or the
        result of ``dl.clone`` for a fresh clone.

    Raises:
        Exception: Any clone failure, and update failures whose message contains
            "dubious ownership" (logged with a suggested fix first). Other
            update failures are only logged as warnings.
    """
    logger.info(f"Setting up dataset at {local_path}")

    ds = Dataset(local_path)

    if ds.is_installed():
        logger.info("Dataset already exists, updating...")
        try:
            ds.update(merge=True)
        except Exception as e:
            if "dubious ownership" in str(e):
                logger.error("Git ownership issue detected!")
                logger.error("To fix this, run:")
                logger.error(f"git config --global --add safe.directory {os.path.abspath(local_path)}")
                logger.error("Or for all HCP subjects:")
                logger.error(f"find {os.path.abspath(local_path)}/HCP1200 -maxdepth 1 -type d -exec git config --global --add safe.directory {{}} \\;")
                raise
            # Deliberate best effort: a stale but installed dataset is still usable
            logger.warning(f"Update failed: {e}. Continuing with existing dataset.")
        return ds

    if os.path.exists(local_path):
        # Path is present but holds no dataset; updating it could never succeed
        logger.warning(f"{local_path} exists but is not an installed dataset; cloning into it.")

    logger.info("Cloning dataset...")
    try:
        return dl.clone(repo_url, local_path)
    except Exception as e:
        if "dubious ownership" in str(e):
            logger.error("Git ownership issue during clone!")
            logger.error("To fix this, run:")
            logger.error(f"git config --global --add safe.directory {os.path.abspath(local_path)}")
            logger.error("Then re-run this script.")
        raise

In [None]:
import shutil

def remove_unwanted_directories(ds, subject, keep_patterns=None, rm_paths=None):
    """
    Remove unwanted directories using file system operations.

    For every entry of ``rm_paths`` this looks at
    ``<ds.path>/HCP1200/<subject>/unprocessed/<rm_path>`` and deletes, with
    ``shutil.rmtree``, each immediate sub-directory whose name is not in
    ``keep_patterns``. Plain files are left untouched. A failed removal is logged
    instead of raised so the remaining directories are still processed.

    Args:
        ds: Object exposing the dataset root directory as ``ds.path``.
        subject: Subject ID (e.g., "116726").
        keep_patterns: Directory names to keep (exact names, not globs).
            Defaults to the T1w/T2w and MOTOR/GAMBLING task directories.
        rm_paths: Folders under ``unprocessed`` to clean.
            Defaults to ``("3T", "7T", "MEG")``.
    """
    if keep_patterns is None:
        keep_patterns = (
            "T1w_MPR1",
            "T1w_MPR2",
            "T2w_SPC1",
            "tfMRI_MOTOR_LR",
            "tfMRI_MOTOR_RL",
            "tfMRI_GAMBLING_LR",
            "tfMRI_GAMBLING_RL",
        )
    if rm_paths is None:
        rm_paths = ("3T", "7T", "MEG")

    keep_names = set(keep_patterns)
    unprocessed_root = Path(ds.path) / "HCP1200" / subject / "unprocessed"

    # Logged once: the message does not depend on which folder is being cleaned
    logger.info(f"Removing unwanted data for subject {subject}")

    for rm_path in rm_paths:
        subject_rm_path = unprocessed_root / rm_path

        if not subject_rm_path.exists():
            logger.warning(f"Subject rm directory not found: {subject_rm_path}")
            continue

        # Snapshot the listing first so deletions do not disturb the iteration
        all_dirs = [d for d in subject_rm_path.iterdir() if d.is_dir()]

        for dir_path in all_dirs:
            dir_name = dir_path.name

            if dir_name in keep_names:
                logger.info(f"Keeping directory: {dir_name} for {subject}")
                continue

            logger.info(f"Removing directory: {dir_path}")
            try:
                shutil.rmtree(dir_path)
                logger.info(f"Successfully removed {dir_name} for {subject}")
            except Exception as e:
                logger.error(f"Failed to remove {dir_name} for {subject}: {e}")


In [None]:
# Your data specifications
subject_list = ["116726", "118528"]
# subject_list = ["116726", "118528", "131823", "135124", "136126", "150524", "152225", "167743", "176542"]

# Setup dataset once
# setup_dataset clones into LOCAL_DATASET_PATH when it does not exist yet and
# otherwise tries to update what is already there
ds = setup_dataset(REPO_URL, LOCAL_DATASET_PATH)

# Patterns that could not be downloaded, reported once at the end
failed_downloads = []

# Download each subject's data with separate logs
for subject in subject_list:
    logger.info(f"Processing subject {subject}")

    patterns = [f"HCP1200/{subject}/unprocessed/3T/T1w_MPR1/",
                f"HCP1200/{subject}/unprocessed/3T/T1w_MPR2/",
                f"HCP1200/{subject}/unprocessed/3T/T2w_SPC1/",
                f"HCP1200/{subject}/unprocessed/3T/tfMRI_MOTOR_LR/",
                f"HCP1200/{subject}/unprocessed/3T/tfMRI_MOTOR_RL/",
                f"HCP1200/{subject}/unprocessed/3T/tfMRI_GAMBLING_LR/",
                f"HCP1200/{subject}/unprocessed/3T/tfMRI_GAMBLING_RL/"]

    # Download structural (T1w/T2w) and task fMRI data, one log file per pattern
    for pattern in patterns:
        with log_get_operation(subject, pattern):
            try:
                ds.get(pattern)
                logger.info(f"Successfully downloaded {pattern}")
            except Exception as e:
                # Keep going with the remaining patterns, but record why this one failed
                logger.error(f"Failed to download {pattern}: {e}")
                failed_downloads.append(pattern)

    # Clean up once per subject, after every wanted directory has been requested
    remove_unwanted_directories(ds, subject)

if failed_downloads:
    logger.warning(f"Finished with {len(failed_downloads)} failed download(s): {failed_downloads}")
else:
    logger.info("All downloads completed")

In [None]:
# SIMPLEST CASE

# import datalad.api as dl

# REPO_URL = "https://github.com/datalad-datasets/human-connectome-project-openaccess"
# LOCAL_DATASET_PATH = "/Users/zenkavi/hcp/"

# ds = dl.clone(REPO_URL, LOCAL_DATASET_PATH)

# ds.get("HCP1200/116726/unprocessed/3T/T1w_MPR1/")
# ds.get("HCP1200/116726/unprocessed/3T/T1w_MPR2/")
# ds.get("HCP1200/116726/unprocessed/3T/T2w_SPC1/")
# ds.get("HCP1200/116726/unprocessed/3T/tfMRI_MOTOR_LR/")
# ds.get("HCP1200/116726/unprocessed/3T/tfMRI_MOTOR_RL/")
# ds.get("HCP1200/116726/unprocessed/3T/tfMRI_GAMBLING_LR/")
# ds.get("HCP1200/116726/unprocessed/3T/tfMRI_GAMBLING_RL/")

# remove_unwanted_directories(ds, "116726")