In [10]:
from rdkit import Chem
import configparser
import glob
import os
import pandas as pd
from Bio import PDB

In [11]:
# Load configuration from the INI file.
# The path is relative, so 'config.ini' is resolved against the kernel's
# current working directory. ConfigParser.read() returns the list of files it
# managed to parse and silently skips missing ones, so an empty list here
# means later config["Paths"][...] lookups will fail.
config = configparser.ConfigParser()
config.read('config.ini')

['config.ini']

In [12]:
# Lookup table from three-letter residue names to one-letter codes.
# Residue names missing from this table are rendered as "X" by get_sequence.
aa3_to_aa1 = {
    'ALA': 'A',
    'ARG': 'R',
    'ASN': 'N',
    'ASP': 'D',
    'CYS': 'C',
    'GLN': 'Q',
    'GLU': 'E',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LEU': 'L',
    'LYS': 'K',
    'MET': 'M',
    'PHE': 'F',
    'PRO': 'P',
    'SER': 'S',
    'THR': 'T',
    'TRP': 'W',
    'TYR': 'Y',
    'VAL': 'V',
    # PCA is not one of the 20 standard residue names but does occur in the
    # PDBbind protein files; it is mapped to Q, following the PDBbind sources
    # the original author consulted.
    # NOTE(review): PCA is presumably pyroglutamic acid (a cyclised Gln/Glu)
    # - confirm against the PDB chemical component dictionary.
    'PCA': 'Q',
}

In [26]:
def get_smiles(pdbid):
    """Return the SMILES string of the ligand stored for a PDBbind entry.

    The entry folder is looked up under ``raw_data_path_refined`` first and,
    if it does not exist there, under ``raw_data_path_general`` (both read
    from the ``Paths`` section of the module-level ``config``). The first
    file in that folder matching ``*ligand.sdf`` is parsed with RDKit.

    Args:
        pdbid: Name of the entry folder, e.g. ``"1xkk"``.

    Returns:
        SMILES of the first molecule RDKit could parse, or ``""`` when no
        record in the SDF file could be parsed.

    Raises:
        FileNotFoundError: If the entry folder is missing from both datasets
            or contains no ``*ligand.sdf`` file.
    """
    paths = config["Paths"]

    # Prefer the refined set; fall back to the general set.
    sdf_folder_path = os.path.join(paths["raw_data_path_refined"], pdbid)
    if not os.path.exists(sdf_folder_path):
        sdf_folder_path = os.path.join(paths["raw_data_path_general"], pdbid)
    if not os.path.exists(sdf_folder_path):
        # A single formatted message is used on purpose: several positional
        # arguments to FileNotFoundError are interpreted as
        # (errno, strerror, filename) and render as a garbled message.
        raise FileNotFoundError(
            f"could not find the data for pdb id {pdbid!r} "
            f"(last location checked: {sdf_folder_path!r})"
        )

    # Locate the ligand structure file inside the entry folder.
    sdf_matches = glob.glob(os.path.join(sdf_folder_path, "*ligand.sdf"))
    if not sdf_matches:
        raise FileNotFoundError(
            f"unable to find a ligand sdf file (*ligand.sdf) in {sdf_folder_path!r}"
        )
    sdf_file_path = sdf_matches[0]

    # The supplier yields None for records RDKit cannot parse; skip those and
    # return the first valid molecule.
    supplier = Chem.SDMolSupplier(sdf_file_path)
    for mol in supplier:
        if mol is not None:
            return Chem.MolToSmiles(mol)
    return ""

get_smiles("1xkk")

'CS(=O)(=O)CC[NH2+]Cc1ccc(-c2ccc3ncnc(Nc4ccc(OCc5cccc(F)c5)c(Cl)c4)c3c2)o1'

In [25]:
# Module-level parser instance; get_sequence calls it for every PDB id it is given.
pdb_parser = PDB.PDBParser(QUIET=True)  # QUIET=True suppresses warnings

def get_sequence(pdbid):
    """Return the one-letter residue sequence of a PDBbind entry's protein.

    The entry folder is looked up under ``raw_data_path_refined`` first and,
    if it does not exist there, under ``raw_data_path_general`` (both read
    from the ``Paths`` section of the module-level ``config``). The first
    file in that folder matching ``*protein.pdb`` is parsed with the
    module-level ``pdb_parser``.

    Only the first chain of the first model that contains a chain is
    converted; any further chains are ignored. Residue names that are not
    keys of ``aa3_to_aa1`` are written as ``"X"``.

    Args:
        pdbid: Name of the entry folder, e.g. ``"1xkk"``.

    Returns:
        The sequence of the first chain as a string, or ``""`` when the
        structure contains no chain at all.

    Raises:
        FileNotFoundError: If the entry folder is missing from both datasets
            or contains no ``*protein.pdb`` file.
    """
    paths = config["Paths"]

    # Prefer the refined set; fall back to the general set.
    pdb_folder_path = os.path.join(paths["raw_data_path_refined"], pdbid)
    if not os.path.exists(pdb_folder_path):
        pdb_folder_path = os.path.join(paths["raw_data_path_general"], pdbid)
    if not os.path.exists(pdb_folder_path):
        # A single formatted message is used on purpose: several positional
        # arguments to FileNotFoundError are interpreted as
        # (errno, strerror, filename) and render as a garbled message.
        raise FileNotFoundError(
            f"could not find the data for pdb id {pdbid!r} "
            f"(last location checked: {pdb_folder_path!r})"
        )

    # Locate the protein structure file inside the entry folder.
    pdb_matches = glob.glob(os.path.join(pdb_folder_path, "*protein.pdb"))
    if not pdb_matches:
        raise FileNotFoundError(
            f"unable to find a protein pdb file (*protein.pdb) in {pdb_folder_path!r}"
        )
    pdb_file_path = pdb_matches[0]

    protein_structure = pdb_parser.get_structure("protein", pdb_file_path)

    # Returning from inside the loops means only the first chain encountered
    # is ever translated, which is what this function has always returned.
    for model in protein_structure:
        for chain in model:
            return "".join(
                aa3_to_aa1.get(residue.get_resname(), "X") for residue in chain
            )
    return ""

# Longer 1xkk sequence pasted for side-by-side comparison with the parsed chain.
# NOTE(review): its provenance is not recorded in this notebook - presumably
# the full expression construct; confirm.
reference_sequence_1xkk = "MKKGHHHHHHDYDIPTTENLYFQGSGEAPNQALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQG"

# Keep the call as the cell's last expression so that its result - not the
# pasted literal - is what the cell displays.
get_sequence(pdbid="1xkk")

'ALLRILKETEFKKIKVLGSGAFGTVYKGLWIPVKIPVAIKELREKANKEILDEAYVMASVDNPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAARNVLVKTPQHVKITDFGLAKLLGAEEKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTFGSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLVIQGDERMSNFYRALMDEVVDADEYLI'

In [14]:
# Names given, in order, to the three index-file columns (positions 0, 3, 4)
# that read_index_file keeps.
column_names = ["PDB id", "-logKd/Ki", "Kd/Ki"]

#read the index file from one of the datasets. The index file is basically where the real data is. It has the binding affinities.
def read_index_file(dataset_version):
    """Load the binding-affinity index file of one dataset version.

    The file is searched for in the ``index`` sub-folder of the dataset root,
    which is read from the ``Paths`` section of the module-level ``config``
    (``raw_data_path_refined`` or ``raw_data_path_general``). The first file
    whose name starts with ``INDEX_refined_data`` (refined) or
    ``INDEX_general_PL_data`` (general) is used.

    Args:
        dataset_version: Either ``"refined"`` or ``"general"``.

    Returns:
        A pandas DataFrame holding columns 0, 3 and 4 of the
        whitespace-separated index file, renamed to the module-level
        ``column_names``. Lines starting with ``#`` are skipped.

    Raises:
        ValueError: If ``dataset_version`` is not one of the two accepted values.
        FileNotFoundError: If no matching index file exists.
    """
    if dataset_version not in ["general", "refined"]:
        raise ValueError('dataset_version must be either "general" or "refined"')

    # Pick the config key and the file-name pattern for the requested version.
    if dataset_version == "refined":
        config_path_value = "raw_data_path_refined"
        index_file_name = "INDEX_refined_data*"
    else:
        config_path_value = "raw_data_path_general"
        index_file_name = "INDEX_general_PL_data*"

    index_pattern = os.path.join(config["Paths"][config_path_value], "index", index_file_name)
    matches = glob.glob(index_pattern)

    # Report the pattern that was actually searched, built from the same
    # config key as the lookup above.
    if not matches:
        raise FileNotFoundError(f"unable to find index file matching {index_pattern!r}")
    file_path = matches[0]

    # Only columns 0, 3 and 4 are loaded; the raw string keeps the regex
    # separator free of invalid-escape warnings.
    data = pd.read_csv(file_path, sep=r'\s+', header=None, comment='#', usecols=[0, 3, 4])
    data.columns = column_names
    return data


In [20]:
# Build the refined-set table: the binding data from the index file plus a
# "Sequence" column obtained by calling get_sequence once per PDB id.
refined_data = read_index_file(dataset_version="refined").assign(
    Sequence=lambda index_table: index_table["PDB id"].apply(get_sequence)
)
refined_data

Unnamed: 0,PDB id,-logKd/Ki,Kd/Ki
0,2r58,2.00,Kd=10mM
1,3c2f,2.00,Kd=10.1mM
2,3g2y,2.00,Ki=10mM
3,3pce,2.00,Ki=10mM
4,4qsu,2.00,Kd=10mM
...,...,...,...
5311,4f3c,11.82,Ki=1.5pM
5312,5bry,11.82,Ki=0.0015nM
5313,1sl3,11.85,Ki=1.4pM
5314,1ctu,11.92,Ki=1.2pM


In [19]:
# Write the table to the CSV location configured under
# [Paths] output_processed_data_path, leaving out the DataFrame index.
processed_csv_path = config["Paths"]["output_processed_data_path"]
refined_data.to_csv(processed_csv_path, index=False)

In [None]:

# Scratch example: print the SMILES of every molecule in a single SDF file.
# The path below is a placeholder, so the loop is guarded by an existence
# check to keep "Restart & Run All" from failing on it.
example_sdf_path = 'path_to_your_file.sdf'

if os.path.exists(example_sdf_path):
    # Load the SDF file
    supplier = Chem.SDMolSupplier(example_sdf_path)

    # Loop through all molecules in the SDF file
    for mol in supplier:
        if mol is not None:  # Check if the molecule is valid
            # Get the SMILES string for each molecule
            smiles = Chem.MolToSmiles(mol)
            print(smiles)
else:
    print(f"skipping SMILES demo: {example_sdf_path!r} does not exist")
