Description: This script works on archived AlphaFold outputs. It extracts the parent tar archive and every nested tar archive, then decompresses the gzipped structure files so that the raw .pdb files from the predictions can be accessed.

Usage: define paths with:

source_file = r'D:\Research\Scerevisiae\Scerevisiae.tar'  # Specifies the correct path to the TAR file
extraction_path = r'D:\Research\TempExtraction'  # Specifies the path where files will be extracted
pdb_output_path = r'D:\Research\PDB Input'  # Specifies the path where .pdb files will be moved

and then run

Author: Tharun
Date: sometime in late 2024

In [None]:
import os  # Imports the os module for interacting with the operating system's file system
import shutil  # Imports the shutil module for high-level file operations such as copying and moving files
import tarfile  # Imports the tarfile module for working with tar archives
import gzip  # Imports the gzip module for handling gzip compressed files

def list_directory_contents(dir_path):
    """Print the names of the entries directly inside ``dir_path``.

    A header line naming the directory is printed first, followed by one
    line per entry in the order returned by ``os.listdir``. Nothing is
    raised to the caller: any failure is reported on stdout instead.

    Args:
        dir_path: Directory whose immediate contents should be printed.
    """
    try:
        print(f"\nListing contents of the directory: {dir_path}")
        for entry_name in os.listdir(dir_path):
            print(entry_name)
    except Exception as err:
        # Permission problems get their own hint; everything else is
        # reported with the exception text.
        if isinstance(err, PermissionError):
            print(f"PermissionError: Cannot access {dir_path}. Please check your permissions.")
        else:
            print(f"An error occurred while listing directory contents: {err}")

def _extract_and_report(archive_path, destination, success_label, delete_archive=False):
    """Extract one tar archive into ``destination`` and print the outcome.

    Args:
        archive_path: Path of the tar archive to open.
        destination: Directory that receives the archive members.
        success_label: Text printed before the archive path on success.
        delete_archive: When true, remove the archive once it has been
            extracted.

    Returns:
        True when extraction (and the optional removal) succeeded, False
        when an error was reported instead.
    """
    try:
        with tarfile.open(archive_path) as archive:
            if hasattr(tarfile, 'data_filter'):
                # Extraction filters exist on this interpreter: use the
                # 'data' filter so members cannot escape ``destination``
                # (absolute paths, '..' components, links pointing outside).
                archive.extractall(destination, filter='data')
            else:
                archive.extractall(destination)
            print(f'{success_label}: {archive_path}')
        if delete_archive:
            os.remove(archive_path)
        return True
    except tarfile.ReadError:
        print(f"Error: {archive_path} is not a valid tar file.")
    except tarfile.TarError as e:
        # Any other tar failure, e.g. a member rejected by the filter.
        print(f"Error: could not extract {archive_path}: {e}")
    except PermissionError:
        print(f"PermissionError: Cannot access {archive_path}. Please check your permissions.")
    except Exception as e:
        print(f"An error occurred while extracting {archive_path}: {e}")
    return False


def extract_all_tar_files(source_path, extract_path):
    """Extract a tar archive and every tar archive nested inside it.

    ``source_path`` is extracted into ``extract_path``. The extraction
    tree is then scanned for ``.tar`` files; each one is extracted into
    the directory it sits in and deleted. Scanning repeats until a pass
    finds no new archive, so archives that only appear after another
    nested archive was unpacked are processed as well.

    Errors are printed rather than raised. An archive that fails to
    extract is left on disk and is not retried.

    Args:
        source_path: Path of the top-level tar archive.
        extract_path: Directory that receives the extracted files.
    """
    _extract_and_report(source_path, extract_path, 'Extracted')

    # Archives that already failed; skipping them keeps the rescan loop
    # from retrying the same broken file forever.
    failed = set()
    while True:
        pending = [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(extract_path)
            for name in files
            if name.endswith('.tar') and os.path.join(root, name) not in failed
        ]
        if not pending:
            break
        for nested_tar_path in pending:
            print(f'Found nested tar file: {nested_tar_path}')
            extracted = _extract_and_report(
                nested_tar_path,
                os.path.dirname(nested_tar_path),
                'Extracted nested tar',
                delete_archive=True,
            )
            if not extracted:
                failed.add(nested_tar_path)

def decompress_gz_files(source_path):
    """Decompress every ``.pdb.gz`` file found under ``source_path``.

    Each archive is decompressed next to itself (same name without the
    ``.gz`` suffix) and the compressed original is then deleted. The data
    is first written to a temporary ``.part`` file and only renamed to
    the final ``.pdb`` name once decompression has finished, so a corrupt
    or truncated archive never leaves a partial ``.pdb`` file behind.

    Failures are printed and the remaining files are still processed.

    Args:
        source_path: Root directory that is searched recursively.
    """
    for root, _dirs, files in os.walk(source_path):
        for file in files:
            if not file.endswith('.pdb.gz'):
                continue
            file_path = os.path.join(root, file)
            decompressed_file_path = os.path.join(root, file[:-3])  # strip '.gz'
            partial_path = decompressed_file_path + '.part'
            try:
                with gzip.open(file_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                # Publish the result only after the whole stream decoded.
                os.replace(partial_path, decompressed_file_path)
                os.remove(file_path)
                print(f'Decompressed: {file_path} to {decompressed_file_path}')
            except Exception as e:
                print(f"An error occurred while decompressing {file_path}: {e}")
                # Best-effort cleanup of the incomplete output; the error
                # has already been reported above.
                if os.path.exists(partial_path):
                    try:
                        os.remove(partial_path)
                    except OSError:
                        pass

def move_pdb_files(source_path, dest_path):
    """Move every ``.pdb`` file found under ``source_path`` into ``dest_path``.

    The source tree is searched recursively and the files are placed
    directly inside ``dest_path`` under their own file names (the
    sub-directory layout is not reproduced). ``dest_path`` is created
    when it does not exist. Failures to move a file are printed and the
    remaining files are still processed; a notice is printed when no
    ``.pdb`` file was encountered at all.

    Args:
        source_path: Root directory that is searched recursively.
        dest_path: Directory that receives the ``.pdb`` files.
    """
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)

    # Lazily yields (directory, file name) for each .pdb file in walk order.
    pdb_hits = (
        (folder, name)
        for folder, _subdirs, names in os.walk(source_path)
        for name in names
        if name.endswith('.pdb')
    )

    hit_count = 0
    for folder, name in pdb_hits:
        hit_count += 1  # counted even when the move itself fails
        source_file = os.path.join(folder, name)
        dest_file = os.path.join(dest_path, name)
        try:
            shutil.move(source_file, dest_file)
            print(f'Moved: {source_file} to {dest_file}')
        except PermissionError:
            print(f"PermissionError: Cannot move {source_file}. Please check your permissions.")
        except Exception as e:
            print(f"An error occurred while moving {source_file}: {e}")

    if hit_count == 0:
        print(f"No .pdb files found in {source_path}")

def clear_directory(path):
    """Delete everything inside ``path`` while keeping ``path`` itself.

    Only the immediate children are enumerated: real sub-directories are
    removed recursively with ``shutil.rmtree``, while files and symbolic
    links (including links to directories) are unlinked, so the target of
    a link is never touched. A ``path`` that is not an existing directory
    is silently ignored.

    Args:
        path: Directory whose contents should be removed.
    """
    if not os.path.isdir(path):
        return

    # Snapshot the listing first so entries are not deleted while the
    # directory iterator is still open.
    with os.scandir(path) as listing:
        children = list(listing)

    for child in children:
        if child.is_dir(follow_symlinks=False):
            shutil.rmtree(child.path)
        else:
            # Regular files and symlinks; rmtree refuses symlinks.
            os.remove(child.path)

if __name__ == "__main__":
    # Input archive, scratch directory, and final destination for the .pdb files.
    source_file = r'D:\Research\Scerevisiae\Scerevisiae.tar'
    extraction_path = r'D:\Research\TempExtraction'
    pdb_output_path = r'D:\Research\PDB Input'

    # Start from an empty scratch directory: create it when it is absent,
    # otherwise wipe whatever an earlier run left behind.
    if not os.path.exists(extraction_path):
        os.makedirs(extraction_path)
    else:
        clear_directory(extraction_path)

    # Show what sits next to the archive before touching anything.
    list_directory_contents(os.path.dirname(source_file))

    # Unpack the archive (and nested archives), then expand the .pdb.gz files.
    extract_all_tar_files(source_file, extraction_path)
    decompress_gz_files(extraction_path)

    print("\nContents of the extraction path after extraction and decompression:")
    list_directory_contents(extraction_path)

    # Collect the structures in the output folder and show the result.
    move_pdb_files(extraction_path, pdb_output_path)

    print("\nContents of the PDB output path:")
    list_directory_contents(pdb_output_path)

    # Remove the scratch directory together with anything still inside it.
    if os.path.exists(extraction_path):
        clear_directory(extraction_path)
        os.rmdir(extraction_path)
        print(f'\nRemoved temporary extraction folder: {extraction_path}')



Listing contents of the directory: D:\Research\Scerevisiae
Scerevisiae.tar


  tf.extractall(extract_path)  # Extracts all contents to the specified extraction path


KeyboardInterrupt: 

Description: This script works on archived AlphaFold outputs. It extracts the parent tar archive and every nested tar archive, then decompresses the gzipped structure files so that the raw .pdb files from the predictions can be accessed.

Usage: define paths with:

source_file = r'D:\Research\Scerevisiae\Scerevisiae.tar'  # Specifies the correct path to the TAR file
extraction_path = r'D:\Research\TempExtraction'  # Specifies the path where files will be extracted
pdb_output_path = r'D:\Research\PDB Input'  # Specifies the path where .pdb files will be moved

and then run

Author: Tharun
Date: sometime in late 2024