In [1]:
# List the conda environments on this machine; conda marks the active one with `*`.
!conda env list

# conda environments:
#
base                     /home6/tvnguye4/miniconda3
tvnEnv0003_colabfold     /home6/tvnguye4/miniconda3/envs/tvnEnv0003_colabfold
tvnEnv0005_cobra         /home6/tvnguye4/miniconda3/envs/tvnEnv0005_cobra
tvnEnv0006_genAna     *  /home6/tvnguye4/miniconda3/envs/tvnEnv0006_genAna



In [None]:
import os
import shutil
import tempfile

from Bio.PDB import PDBParser, PDBList, is_aa
from Bio.SeqUtils import seq1

# PDB entries whose chain sequences are exported to FASTA
pdb_codes = ["4CNR", "4CNV", "4CNW", "4CNX"]

# Biopython structure parser and PDB downloader
parser = PDBParser()
pdb_list = PDBList()

# Directory to store the final FASTA files (one file per chain)
fasta_directory = '/home6/tvnguye4/google_cloud/ca_grand_garden/halo_mutants/pdb_sequence'
os.makedirs(fasta_directory, exist_ok=True)


def _chain_sequence(chain):
    """Return the one-letter sequence of the standard amino acids in ``chain``.

    Only residues accepted by ``is_aa(residue, standard=True)`` are kept, so
    anything Biopython does not classify as one of the standard amino acids
    (waters, ligands, nucleotides, ...) is left out. The sequence is built
    from the residues present in the parsed structure, so residues that have
    no coordinates in the file do not appear in it.

    Returns an empty string when the chain has no standard amino acids.
    """
    return ''.join(
        seq1(residue.resname)
        for residue in chain
        if is_aa(residue, standard=True)
    )


# Entries for which nothing could be exported (download or parsing produced no model)
failed_codes = []

# Downloads go into a private temporary directory that is removed when the
# `with` block exits, including on error. This avoids deleting a pre-existing
# 'pdb_files' directory in the working directory and avoids leaving files
# behind when an entry fails half-way through.
with tempfile.TemporaryDirectory(prefix='pdb_files_') as pdb_folder:
    for pdb_code in pdb_codes:
        # Retrieve the entry in legacy PDB format
        file_path = pdb_list.retrieve_pdb_file(pdb_code, pdir=pdb_folder, file_format='pdb')

        # NOTE(review): depending on the Biopython version, a failed download
        # may be reported without raising (a path that was never written, or
        # no path at all), so verify the file exists before parsing.
        if not file_path or not os.path.isfile(file_path):
            failed_codes.append(pdb_code)
            continue

        structure = parser.get_structure(pdb_code, file_path)

        # The output file name carries no model identifier, so every model
        # would write to the same per-chain file. Export the first model only
        # instead of rewriting each file once per model.
        model = next(iter(structure), None)
        if model is None:
            failed_codes.append(pdb_code)
            continue

        for chain in model:
            sequence = _chain_sequence(chain)
            if not sequence:
                # No standard amino acids in this chain: skip it rather than
                # writing a FASTA record with an empty sequence.
                continue

            chain_id = chain.id
            fasta_filename = os.path.join(fasta_directory, f"{pdb_code}_chain_{chain_id}.fasta")
            with open(fasta_filename, 'w') as fasta_file:
                fasta_file.write(f">PDB:{pdb_code} Chain:{chain_id}\n{sequence}\n")

if failed_codes:
    print(f"No sequences written for: {', '.join(failed_codes)}")
