# Plip dataset

We inspect the PLIP dataset, which will later be used to find protein-ligand interactions.

In [1]:
import openpharmacophore.pharmacophore.pl_interactions as pli
import mdtraj as mdt
import nglview as nv
import numpy as np
from rdkit import Chem

from collections import defaultdict
import os
from pprint import pprint



## Load the files

In [2]:
# Root folder of the PLIP test cases, relative to this notebook's location.
data_dir = "../../datasets/ligand-receptor/test_cases/plip"
# Sanity check: should display True when the notebook is run from its own folder.
os.path.isdir(data_dir)

True

In [3]:
# Collect the dataset files found under data_dir:
#   pdbs        -> names of the .pdb files without "_" in their name
#   pdb_ligands -> folder name -> names of the .pdb files that contain "_"
#   smi_files   -> paths of the .smi files
# NOTE: os.walk visits entries in an arbitrary, filesystem-dependent order,
# so positions inside these lists are not stable across machines.
pdbs = []
smi_files = []
pdb_ligands = defaultdict(list)

for root, _, filenames in os.walk(data_dir):
    # Name of the folder being visited. os.walk builds `root` with the
    # platform separator, so use basename rather than splitting on "/".
    pdb_name = os.path.basename(root)
    for file in filenames:
        if file.endswith(".pdb"):
            if "_" not in file:
                pdbs.append(file)
            else:
                pdb_ligands[pdb_name].append(file)
        elif file.endswith(".smi"):
            smi_files.append(os.path.join(root, file))

In [4]:
# PDB files whose name has no "_" (one was found per dataset folder).
pdbs

['2REG.pdb', '3BBH.pdb', '4MWW.pdb', '1M7W.pdb', '1XDN.pdb']

In [5]:
# Paths of the .smi files found under data_dir.
smi_files

['../../datasets/ligand-receptor/test_cases/plip/2REG/ligands.smi',
 '../../datasets/ligand-receptor/test_cases/plip/3BBH/ligands.smi',
 '../../datasets/ligand-receptor/test_cases/plip/4MWW/ligands.smi',
 '../../datasets/ligand-receptor/test_cases/plip/1M7W/ligands.smi',
 '../../datasets/ligand-receptor/test_cases/plip/1XDN/ligands.smi']

In [6]:
# PDB files whose name contains "_", grouped by the folder they were found in.
pdb_ligands

defaultdict(list,
            {'2REG': ['CHT_D_chain.pdb', 'CHT_C_chain.pdb', 'CHT_A_chain.pdb'],
             '3BBH': ['SFG_B_chain.pdb',
              'GOL_C_chain.pdb',
              'GOL_F_chain.pdb',
              'GOL_D_chain.pdb',
              'SFG_D_chain.pdb',
              'GOL_A_chain.pdb',
              'GOL_B_chain.pdb',
              'SFG_C_chain.pdb',
              'SFG_A_chain.pdb'],
             '4MWW': ['NAG_A_chain.pdb',
              'BMA_A_chain.pdb',
              'BMA_B_chain.pdb',
              'NAG_B_chain.pdb',
              'G39_A_chain.pdb',
              'MAN_A_chain.pdb',
              'NAG_C_chain.pdb',
              'MAN_B_chain.pdb',
              'G39_C_chain.pdb'],
             '1M7W': ['DAO_F_chain.pdb',
              'DAO_B_chain.pdb',
              'DAO_D_chain.pdb',
              'DAO_H_chain.pdb',
              'DAO_A_chain.pdb',
              'DAO_C_chain.pdb',
              'DAO_E_chain.pdb',
              'DAO_G_chain.pdb'],
             '1XD

## HNF4 alpha ligand binding domain. Hydrophobic interactions

HNF4 alpha ligand-binding domain in complex with lauric acid (PDB ID: 1M7W).

### Extract a chain

In [7]:
def print_traf_info(traj):
    """Print the number of atoms, chains and residues of a trajectory.

    Parameters
    ----------
    traj : object
        Any object exposing ``n_atoms``, ``n_chains`` and ``n_residues``
        (in this notebook, the object returned by ``mdt.load``).
    """
    report = [
        f"Num atoms: {traj.n_atoms}",
        f"Num chains: {traj.n_chains}",
        f"Num residues: {traj.n_residues}",
    ]
    # One line per count, in the order atoms -> chains -> residues.
    print("\n".join(report))
    

def full_file_path(file_name, base_dir=None):
    """Return the path of a dataset file stored in the folder named after it.

    A file ``"<stem>.<ext>"`` is located at ``<base_dir>/<stem>/<stem>.<ext>``,
    where ``<stem>`` is the text before the first dot of ``file_name``.

    Parameters
    ----------
    file_name : str
        Name of the file, e.g. ``"1M7W.pdb"``.
    base_dir : str, optional
        Root folder of the dataset. Defaults to the notebook-level ``data_dir``.

    Returns
    -------
    str
        ``base_dir``, the folder name and ``file_name`` joined with
        ``os.path.join``.
    """
    if base_dir is None:
        base_dir = data_dir
    folder = file_name.split(".")[0]
    # Pass each component separately so os.path.join inserts the separator;
    # a hand-built "<folder>/<file>" string hard-codes "/" and, for an empty
    # folder name, starts with "/" and makes join drop base_dir.
    return os.path.join(base_dir, folder, file_name)

In [8]:
# Look the structure up by name instead of by a fixed position: the order of
# `pdbs` comes from os.walk and is not guaranteed to be the same everywhere.
index = pdbs.index("1M7W.pdb")
traj_1 = mdt.load(full_file_path(pdbs[index]))
print_traf_info(traj_1)

Num atoms: 7176
Num chains: 12
Num residues: 960


In [9]:
# Display the whole loaded structure in an nglview widget.
nv.show_mdtraj(traj_1)

NGLWidget()

In [10]:
# to_dataframe() returns a pair; only the first item (the per-atom table) is
# kept. The second item (the bonds array, per the mdtraj docs) is discarded.
df, _ = traj_1.topology.to_dataframe()

In [11]:
# Atoms that belong to the chain with index 4.
df.query("chainID == 4")

Unnamed: 0,serial,name,element,resSeq,resName,chainID,segmentID
7046,7051,O1,O,700,DAO,4,
7047,7052,O2,O,700,DAO,4,
7048,7053,C1,C,700,DAO,4,
7049,7054,C2,C,700,DAO,4,
7050,7055,C3,C,700,DAO,4,
7051,7056,C4,C,700,DAO,4,
7052,7057,C5,C,700,DAO,4,
7053,7058,C6,C,700,DAO,4,
7054,7059,C7,C,700,DAO,4,
7055,7060,C8,C,700,DAO,4,


In [12]:
# Keep only the atoms of chain indices 0 and 4, i.e. a single chain together
# with a ligand (chain 4 contains DAO atoms).
chains = [0, 4]
selected_atoms = [
    atom.index
    for atom in traj_1.topology.atoms
    if atom.residue.chain.index in chains
]
traj_chain_A = traj_1.atom_slice(selected_atoms)
print_traf_info(traj_chain_A)

Num atoms: 1791
Num chains: 2
Num residues: 224


In [13]:
# Display only the extracted chains in an nglview widget.
nv.show_mdtraj(traj_chain_A)

NGLWidget()

### Inspect the ligand