In [2]:
import os
import shutil
import tarfile

def extract_selected_files(folder_path):
    """
    Unpack the .csv and .h5 files stored under a 'spatial' directory of every
    .tar.gz archive found directly inside `folder_path`.

    For each archive `<name>.tar.gz`:
      - A sibling folder `<name>` is created (reused if it already exists).
      - Every member that lives below a directory named exactly 'spatial' and
        whose name ends in .csv or .h5 is written into that folder under its
        base filename only (the archive's directory structure is flattened).
      - The archive is deleted once at least one file has been written.

    Notes:
      - Flattening means two members with the same base filename end up at the
        same output path; the one that appears later in the archive wins.
      - An archive in which nothing matched is left in place, so a wrong filter
        cannot destroy the only copy of the data.
      - Any error raised while reading or writing propagates to the caller, and
        the archive being processed is not deleted in that case.

    Args:
        folder_path: Directory to scan for .tar.gz archives (not recursive).

    Returns:
        None. The function works purely through file-system side effects.
    """
    archive_suffix = ".tar.gz"
    wanted_extensions = (".csv", ".h5")

    # os.listdir returns a snapshot, so the folders created below are not revisited.
    for filename in os.listdir(folder_path):
        if not filename.endswith(archive_suffix):
            continue

        tar_file_path = os.path.join(folder_path, filename)

        # Strip only the trailing suffix; str.replace would also remove any
        # ".tar.gz" that happens to occur in the middle of the name.
        folder_name = filename[: -len(archive_suffix)]
        new_folder_path = os.path.join(folder_path, folder_name)
        os.makedirs(new_folder_path, exist_ok=True)

        extracted_count = 0
        with tarfile.open(tar_file_path, "r:gz") as tar:
            # Iterating the archive directly reads it front to back once,
            # instead of scanning the whole gzip stream before extracting.
            for member in tar:
                # Tar member names always use "/" as the separator.
                path_parts = member.name.split("/")

                # Require a directory component that is exactly "spatial";
                # a substring test would also accept e.g. "nonspatial/".
                if "spatial" not in path_parts[:-1]:
                    continue
                if not path_parts[-1].endswith(wanted_extensions):
                    continue

                # extractfile returns None for members that carry no data
                # (e.g. directories).
                extracted_file = tar.extractfile(member)
                if extracted_file is None:
                    continue

                # basename drops every directory part, so the file always lands
                # directly inside new_folder_path.
                base_filename = os.path.basename(member.name)
                output_file_path = os.path.join(new_folder_path, base_filename)

                # Stream in chunks rather than loading the whole member into memory.
                with extracted_file, open(output_file_path, "wb") as f_out:
                    shutil.copyfileobj(extracted_file, f_out)
                extracted_count += 1

        # Reached only if the archive was read without errors; keep the archive
        # when the filter matched nothing.
        if extracted_count:
            os.remove(tar_file_path)


# Driver for the extraction step; the guard is true when this cell is executed,
# so running the cell performs the extraction immediately.
# NOTE: this step is destructive - extract_selected_files removes the archives it
# processes, so any later step that looks for .tar.gz files in this folder will
# find none.
if __name__ == "__main__":
    # Update this path to point to your A5 folder
    # NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
    a5_folder_path = r"C:\Users\viraj\BuildSpace\Courses\STAT-530 Bioinformatics\A5"

    extract_selected_files(a5_folder_path)
    # Printed unconditionally once the call returns without raising.
    print("Extraction completed and original .tar.gz files have been removed.")


Extraction completed and original .tar.gz files have been removed.


In [5]:
import os

def extract_sample_folders(folder_path):
    """
    Build R source text that defines `sample_folders` from archive names.

    Every entry of `folder_path` whose name ends in '.tar.gz' contributes one
    sample code: the '.tar.gz' text is removed from the name and whatever
    follows the final underscore is kept (the whole remaining name when there
    is no underscore). Entries are processed in sorted order.

    Returns:
        A string of the form

            sample_folders <- c(
              "CODE1",
              "CODE2"
            )

        With no matching entries the result is just the opening line followed
        directly by the closing parenthesis.
    """
    # Sorted so the generated vector is stable from run to run.
    archive_names = sorted(
        entry for entry in os.listdir(folder_path) if entry.endswith(".tar.gz")
    )

    # Sample code = the text after the last underscore of the extension-less name.
    codes = [
        entry.replace(".tar.gz", "").rsplit("_", 1)[-1] for entry in archive_names
    ]

    # One indented, quoted element per line; commas go between elements only.
    quoted = [f'  "{code}"' for code in codes]
    body = ",\n".join(quoted)
    if quoted:
        body += "\n"

    return "sample_folders <- c(\n" + body + ")"

# Driver for the R-vector step; prints R code to paste into the R side of the analysis.
# NOTE: extract_sample_folders only looks at .tar.gz files. If the extraction step
# has already run on this folder (it removes the archives it processes), nothing
# matches and the printed vector is empty, as in the recorded output below.
if __name__ == "__main__":
    # Update this to the path where your .tar.gz files are located.
    # NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
    folder_path = r"C:\Users\viraj\BuildSpace\Courses\STAT-530 Bioinformatics\A5"
    
    # Extract the sample names and print the R vector code
    r_code = extract_sample_folders(folder_path)
    print(r_code)


sample_folders <- c(
)
