In [None]:
import os  # Imports the os module for interacting with the operating system's file system
import random  # Imports the random module for making random selections
import shutil  # Imports the shutil module for high-level file operations such as copying and moving files
import json  # Imports the json module for handling JSON data
from Bio import PDB  # Imports the PDB module from Biopython for parsing PDB files
from Bio.SeqUtils import seq1  # Imports the seq1 function from Biopython to convert three-letter amino acid codes to one-letter codes

def extract_chain_info(chain):
    """Return the one-letter amino acid sequence of a PDB chain.

    Residues are visited in the order the chain yields them. Only residues
    accepted by ``PDB.is_aa(residue, standard=True)`` contribute a letter;
    every other residue is skipped.
    """
    one_letter_codes = [
        seq1(res.resname)  # Three-letter residue name -> one-letter code
        for res in chain
        if PDB.is_aa(res, standard=True)  # Keep standard amino acids only
    ]
    return "".join(one_letter_codes)  # Concatenate the letters into the sequence

def process_pdb_file(pdb_file):
    """Parse a PDB file and build one job entry per chain.

    Args:
        pdb_file: Path to the PDB file to parse.

    Returns:
        A list of dicts, one for every chain of every model in the parsed
        structure. Each dict has the keys ``"name"`` (the file name without
        its extension), ``"modelSeeds"`` (an empty list) and ``"sequences"``
        (a single ``"proteinChain"`` entry holding the chain's sequence and
        a ``"count"`` of 1).
    """
    print(f"Parsing PDB file: {pdb_file}")  # Report which file is being processed

    file_name = os.path.basename(pdb_file)
    # splitext removes only the trailing extension; str.replace(".pdb", "")
    # would also delete ".pdb" occurring in the middle of a file name and
    # would miss an upper-case ".PDB" extension.
    job_name = os.path.splitext(file_name)[0]

    parser = PDB.PDBParser(QUIET=True)  # QUIET=True suppresses parser warnings
    structure = parser.get_structure(file_name, pdb_file)

    jobs = []
    for model in structure:
        for chain in model:
            jobs.append(
                {
                    "name": job_name,
                    "modelSeeds": [],  # Left empty; can be populated later
                    "sequences": [
                        {
                            "proteinChain": {
                                "sequence": extract_chain_info(chain),
                                "count": 1,
                            }
                        }
                    ],
                }
            )

    print(f"Processed {len(jobs)} chains in {pdb_file}")  # Report the chain count
    return jobs

def save_json_batches(jobs, output_folder, batch_size=20):
    """Write the jobs to numbered JSON files, ``batch_size`` jobs per file.

    Files are named ``pdb_output_batch_<n>.json`` with ``n`` starting at 1.
    The output folder is created if it does not exist yet. When ``jobs`` is
    empty, no file is written.

    Args:
        jobs: List of JSON-serializable job entries.
        output_folder: Folder that receives the batch files.
        batch_size: Maximum number of jobs per file (default 20).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would silently produce an empty range and write nothing.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Make the function usable on its own, without relying on the caller
    # having created the folder beforehand.
    os.makedirs(output_folder, exist_ok=True)

    batch_starts = range(0, len(jobs), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = jobs[start:start + batch_size]  # Last batch may be shorter
        output_file = os.path.join(output_folder, f'pdb_output_batch_{batch_number}.json')
        print(f"Saving JSON batch {batch_number} with {len(batch)} entries to {output_file}...")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(batch, f, indent=2)
    print("All batches saved.")  # Printed even when there was nothing to save

def main(input_folder, output_folder, sample_size=20):
    """Select random PDB files, move them, and save their chains as JSON.

    Up to ``sample_size`` files ending in ``.pdb`` are picked at random from
    ``input_folder``. Each one is moved to ``output_folder``, parsed with
    ``process_pdb_file``, and the resulting jobs are written with
    ``save_json_batches``.

    A failure while moving or parsing one file is reported and that file is
    skipped; the jobs collected from the other files are still saved. Files
    are moved before parsing, so aborting the whole run on the first error
    would leave already-moved files without any JSON output.

    Args:
        input_folder: Folder containing the source ``.pdb`` files.
        output_folder: Folder receiving the moved files and the JSON batches;
            created if missing.
        sample_size: Maximum number of files to pick (default 20).

    Returns:
        None.
    """
    print("Starting the main function...")  # Prints a start message

    if not os.path.isdir(input_folder):  # Nothing to do without an input folder
        print(f"Input folder does not exist: {input_folder}")
        return

    if not os.path.exists(output_folder):
        print(f"Output folder does not exist. Creating: {output_folder}")
        os.makedirs(output_folder, exist_ok=True)

    pdb_files = [f for f in os.listdir(input_folder) if f.endswith(".pdb")]
    print(f"Total PDB files found: {len(pdb_files)}")

    # Clamp the sample size to the valid range so random.sample never raises.
    pick_count = min(max(sample_size, 0), len(pdb_files))
    selected_files = random.sample(pdb_files, pick_count)
    print(f"Randomly selected {len(selected_files)} files for processing.")

    all_jobs = []  # Jobs gathered from every successfully processed file
    failed_files = []  # Names of files that could not be moved or parsed

    for pdb_file in selected_files:
        source_path = os.path.join(input_folder, pdb_file)
        moved_path = os.path.join(output_folder, pdb_file)
        try:
            shutil.move(source_path, moved_path)  # Move first, then parse the moved copy
            print(f"Moved {pdb_file} to {output_folder}")
            all_jobs.extend(process_pdb_file(moved_path))
        except Exception as e:
            # Broad on purpose: this is the per-file boundary, and both file
            # system and parsing errors should only skip the current file.
            print(f"An error occurred while handling {pdb_file}: {e}")
            failed_files.append(pdb_file)

    if failed_files:
        print(f"{len(failed_files)} file(s) could not be processed: {', '.join(failed_files)}")

    save_json_batches(all_jobs, output_folder)  # Saves the processed jobs into JSON batches

if __name__ == "__main__":
    # Folder holding the source PDB files
    input_folder = r"D:\Research\Part 1 - Comparison\Comparison\E.Coli\ecoli full"
    # Folder that receives the moved PDB files and the JSON batches
    output_folder = r"D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2"

    # Run the workflow on the folders configured above
    main(input_folder=input_folder, output_folder=output_folder)
    print("Script execution completed.")  # Signals that the script has finished running


Starting the main function...
Total PDB files found: 4243
Randomly selected 20 files for processing.
Moved AF-P32056-F1-model_v4.pdb to D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2
Parsing PDB file: D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2\AF-P32056-F1-model_v4.pdb
Processed 1 chains in D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2\AF-P32056-F1-model_v4.pdb
Moved AF-P38135-F1-model_v4.pdb to D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2
Parsing PDB file: D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2\AF-P38135-F1-model_v4.pdb
Processed 1 chains in D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2\AF-P38135-F1-model_v4.pdb
Moved AF-P0AFG0-F1-model_v4.pdb to D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2
Parsing PDB file: D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2\AF-P0AFG0-F1-model_v4.pdb
Processed 1 ch

This script processes Protein Data Bank (PDB) files to extract the amino acid sequence of each chain in each structure. It randomly selects up to 20 `.pdb` files from the input folder, moves them to the output folder for organization, and writes the extracted sequences to that folder as JSON files in batches of up to 20 entries.

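Note on zipped AlphaFold outputs: the code in this cell only reads raw `.pdb` files located directly in the input folder and does not unzip anything itself, so zipped AlphaFold outputs must be unzipped (parent and subdirectories, if needed) beforehand to expose the raw PDB files for processing.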

input_folder = r"D:\Research\Part 1 - Comparison\Comparison\E.Coli\ecoli full"  # Input folder containing PDB files
output_folder = r"D:\Research\Part 1 - Comparison\Comparison\E.Coli\AF 2 Prediction\b2"  # Output folder for processed files

Author: Tharun
Time: Sometime in late 2024