In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MolToSmiles, PandasTools, Descriptors
import pandas as pd
import numpy as np
from biopandas.mol2 import PandasMol2
from parsemol2 import parsemol2

In [2]:
import os
path = '/home/taejoon/jyu/scPDB'
folder_list = os.listdir(path) # 17594 개 폴더 -> 17592 '3uf9_1','3acl_1' 삭제함:sdf파일이 없었음
len(folder_list)

17592

In [3]:
aa_dic = ['ASN', 'GLU', 'ASP', 'ILE', 'CYS', 'PHE', 'ALA', 'GLY', 'THR',
       'TYR', 'TRP', 'LYS', 'LEU', 'SER', 'ARG', 'VAL', 'PRO', 'MET',
       'GLN', 'HIS']

fp_db = pd.DataFrame(columns = ['name','scpdb_id','fp'])
prt_db = pd.DataFrame(columns = ['scpdb_id','seq','binding'])
error = []


for i, folder in enumerate(folder_list):
    #ligand finger print
    ligand = PandasTools.LoadSDF('/home/taejoon/jyu/scPDB/'+folder+'/ligand.sdf')
    ligand_fp = AllChem.GetMorganFingerprintAsBitVect(ligand.ROMol[0], radius=2, nBits=1024).ToBitString()
    ligand_fp = np.array(list(ligand_fp), dtype=int)
    #리간드 핑거프린트 표 csv나 dataframe에 한줄씩 넣기
    fp_db.loc[i] = [ligand.ID[0].split('_')[1],ligand.scpdb_id[0],ligand_fp]
    
    bmol = parsemol2().read_mol2('/home/taejoon/jyu/scPDB/'+folder+'/site.mol2')
    
    pmol = parsemol2().read_mol2('/home/taejoon/jyu/scPDB/'+folder+'/protein.mol2') 
    
    if bmol.df.chain.nunique() == 1:
        Chain = bmol.df.chain[0]
    else: 
        Chain = bmol.df.chain.value_counts().index[0]
        error.append(folder)
        
    binding_site = bmol.df[bmol.df.subst_type=='RESIDUE']
    
    binding_num = binding_site.residue_num.values
    
    
    
    prt_df = pmol.df[pmol.df.chain==Chain]
    prt_seq = prt_df[prt_df.sub_type.isin(aa_dic)].sub_type.values
    binding_label = prt_df[prt_df.sub_type.isin(aa_dic)].residue_num.isin(binding_num).values*1
    
    
    #prt_seq: 단백질 시퀀스 어레이, binding_label: 위치
    prt_db.loc[i] = [folder,prt_seq,binding_label]

    

In [5]:
error


['4ylf_3',
 '3tos_4',
 '3nlp_2',
 '3fy4_4',
 '1z0a_3',
 '3uqd_1',
 '3f8d_4',
 '2vyn_2',
 '1dq8_4',
 '1shj_1',
 '2czf_2',
 '3dxj_2',
 '2i4w_1',
 '3dcr_1',
 '4hmx_2',
 '2wq6_1',
 '4zas_2',
 '1l5q_1',
 '4zol_3',
 '1ii6_2',
 '3e33_1',
 '4jdr_1',
 '1rm4_2',
 '4xx0_1',
 '1ihy_3',
 '1lwf_1',
 '4yaf_6',
 '4chq_2',
 '4w6z_2',
 '1t9a_1',
 '3ufr_2',
 '4dfg_1',
 '4ts7_2',
 '3r3c_1',
 '3slz_1',
 '2o1s_2',
 '1zxn_1',
 '1suw_3',
 '2f3m_5',
 '3c15_1',
 '4ifv_1',
 '4b5d_1',
 '3u9f_13',
 '3gga_1',
 '3l9h_2',
 '1gd1_4',
 '2w9s_5',
 '2ibs_1',
 '2el0_1',
 '4e70_1',
 '4o6m_1',
 '2cem_1',
 '4uug_1',
 '4hkk_2',
 '3oib_1',
 '3el1_1',
 '1rt5_1',
 '3hxb_1',
 '2jjx_1',
 '3esz_1',
 '4mcc_1',
 '1n4p_1',
 '2gvg_3',
 '1fw6_2',
 '4mdk_3',
 '1upc_1',
 '4cj5_1',
 '3kvv_9',
 '3ik7_4',
 '1om5_1',
 '1khh_2',
 '4fap_1',
 '1bdr_1',
 '3awh_2',
 '4a9h_3',
 '3tlk_3',
 '4rw6_1',
 '4qog_1',
 '3buz_1',
 '3o9z_4',
 '2q5o_1',
 '2doo_1',
 '4jf1_2',
 '3ufr_1',
 '2ih4_2',
 '2qi7_1',
 '3ufp_3',
 '3icr_3',
 '3r7c_4',
 '2emr_1',
 '1kf6_3'

In [7]:
fp_db.head(5)


Unnamed: 0,name,scpdb_id,fp
0,TES,1jtv_1,0000000100000000100000000000000001001000000000000000000000000000000000010000000000001000100000000000000001000000000000000000000000000000001000000000000000000000000000000000000000000010000000000001000000000000000000000000000000000000010000000010000000100000000000100000000000000000000000000000000000000100000000000010000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000001000000000000000000000000000000000000000000001000000000000000000000000010000000000000000000000001000000000000001000000000100000000000000000000000000000000100000000000000000000000000000000000000000000000000000000000100000000000000000000000000010000000000000000000000000000000000000000000000000000000000000001000000000000010000000000000000000000000000000100000000000000000000000000000000000000000000001000000000000000000000000000000000100000001000000000010000000000000000000000000010000000000000000000000000000010000000011000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000
1,FAD,4ylf_3,0100000001000000000000000000000001000010000001000000000000000010000000010101000110000000001000000000010000000000100000100000000000000000000000000001000010000000000000000000000000000000000000001010000000000000010000000000000000010100010000100000000001000000000010000000000000100000000000000000000000010000000000000010000000010000000000000000000000000000000010000000000101100000001000000000000000000000000000000000000000000000000010000101000000000000000000100000000100000000000000000000001001000000000000000000000000000001000000000000000000000000000010000100000000010000100000100100000000000000000000000000000000100000000000000000000000000000000000000010000010000000000000001000000000000000000010010011100000101000000100000100000000000000010000000000011000000000000000000000000000000001001000000000000000100001000000000000100000000101000000000000000001000010000000010000000000010000100000001001000001100000000010000000000000000000000010010000000000000000000000000010001000000000000000000000000000000000000000000000000000110000
2,SAM,3lcv_1,0100000001000000000000000000000001000000000000000000000000000000000000010001000010000000001000000000000000000000000000000000000000000000000000000001000010000000000000000000000000000000000000000010000000100000010000000000000000000100010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000011000010000000000000001000000000000000000100000000000000000000000000000010000101000000000000000000100100000100000000000000000000000001000000000000000000000000000001000000000000000000000000000010000000001000000000000000000100000000000000000000000001000000100000000010000000000000000000100000000010000010000000000000001000000000000000000100000000100000000000000100000000000000000000000100000000000000000000100010100000000000000000001000000000000000000001000000000000100000000000010000000000000001000000000000000000000000000000000000000000010001000000000000001000000000000000000010010000000000000000000000000010000000000000000000000000000000000000000000000000000000010000
3,ADP,4jl5_1,0000000001000000000000000000000000000000000000000000000000000000000000010001000010000000000000000000000000000000100000000000000000000000000000000001000010000000000000000000000000000000000001001010000000000000010000000000000000000100010000000000000000000000000000000000000000000000000000000000000000010000000000000000000000000000000000000000000000000000000010000000000000000000001000000000000000000000000000000000000000000000000010000101000000000000000000100000000100000000000000000000001001000000000000000000000000000001000000000000000000000000000010000000000000010000000000100100000000000000000000000000000000100000000000000000000000000000000000000010000010000000000000001000000000000000000010010000100000001000000100000000000000000000010000000000000000000000000000100000000000000000001000000000000000000001000000000000100000000000000000000000000001000010000000010000000000000000000000000000000001000000000000000000000000000000000010010000000000000000000000000010000000000000001000000000000000000000000000000000000000010000
4,YM8,2ym8_1,0100001000000001000000000000000011001000000000000000000000001001100010000001000010000000001000000000000000000000100000000000000010100000000000000000000000000000100000010000000000000000000000000000000100000000000000000000000000000000000000000000000000000000010010000010000000000000000100000000000000000001000000000000000000000000000001000000000000000000000010101000001000000000011000000000000000000000010000000000000000000000000010000000000000000010000000000000000000000000000000000000000000010000000000000000000100000000000000000000000010000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000100000000010000000000000000000010000000000000000000000000000001000000010000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000001000010000000000001000000000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000010000000010000000000000000100000000000000010000000000000000


In [8]:
prt_db.head(5)



Unnamed: 0,scpdb_id,seq,binding
0,1jtv_1,"[ALA, ARG, THR, VAL, VAL, LEU, ILE, THR, GLY, CYS, SER, SER, GLY, ILE, GLY, LEU, HIS, LEU, ALA, VAL, ARG, LEU, ALA, SER, ASP, PRO, SER, GLN, SER, PHE, LYS, VAL, TYR, ALA, THR, LEU, ARG, ASP, LEU, LYS, THR, GLN, GLY, ARG, LEU, TRP, GLU, ALA, ALA, ARG, ALA, LEU, ALA, CYS, PRO, PRO, GLY, SER, LEU, GLU, THR, LEU, GLN, LEU, ASP, VAL, ARG, ASP, SER, LYS, SER, VAL, ALA, ALA, ALA, ARG, GLU, ARG, VAL, THR, GLU, GLY, ARG, VAL, ASP, VAL, LEU, VAL, CYS, ASN, ALA, GLY, LEU, GLY, LEU, LEU, GLY, PRO, LEU, GLU, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...]"
1,4ylf_3,"[MET, GLY, GLY, THR, ALA, LEU, ASN, GLU, ILE, VAL, LYS, LYS, VAL, LYS, ILE, ALA, GLU, ASP, VAL, PHE, ASP, PHE, TRP, ILE, HIS, SER, PRO, SER, VAL, SER, LYS, GLU, ALA, ARG, PRO, GLY, GLN, PHE, VAL, VAL, ILE, ARG, LEU, HIS, GLU, LYS, GLY, GLU, ARG, ILE, PRO, LEU, THR, VAL, ALA, ASP, THR, LYS, PRO, GLU, GLU, GLY, LEU, PHE, ARG, MET, VAL, VAL, LYS, VAL, VAL, GLY, LYS, THR, THR, HIS, GLU, LEU, SER, LEU, LYS, LYS, GLU, GLY, ASP, THR, ILE, LEU, ASP, VAL, VAL, GLY, PRO, LEU, GLY, ASN, PRO, SER, GLU, ILE, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
2,3lcv_1,"[ASP, ARG, ILE, ASP, GLU, ILE, GLU, ARG, ALA, ILE, THR, LYS, SER, ARG, ARG, TYR, GLN, THR, VAL, ALA, PRO, ALA, THR, VAL, ARG, ARG, LEU, ALA, ARG, ALA, ALA, LEU, VAL, ALA, ALA, ARG, GLY, ASP, VAL, PRO, ASP, ALA, VAL, LYS, ARG, THR, LYS, ARG, GLY, LEU, HIS, GLU, ILE, TYR, GLY, ALA, PHE, LEU, PRO, PRO, SER, PRO, PRO, ASN, TYR, ALA, ALA, LEU, LEU, ARG, HIS, LEU, ASP, SER, ALA, VAL, ASP, ALA, GLY, ASP, ASP, GLU, ALA, VAL, ARG, ALA, ALA, LEU, LEU, ARG, ALA, MET, SER, VAL, HIS, ILE, SER, THR, ARG, GLU, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, ...]"
3,4jl5_1,"[MET, ILE, LEU, VAL, PHE, LEU, GLY, PRO, PRO, GLY, ALA, GLY, LYS, GLY, THR, GLN, ALA, LYS, ARG, LEU, ALA, LYS, GLU, LYS, GLY, PHE, VAL, HIS, ILE, SER, THR, GLY, ASP, ILE, LEU, ARG, GLU, ALA, VAL, GLN, LYS, GLY, THR, PRO, LEU, GLY, LYS, LYS, ALA, LYS, GLU, TYR, MET, GLU, ARG, GLY, GLU, LEU, VAL, PRO, ASP, ASP, LEU, ILE, ILE, ALA, LEU, ILE, GLU, GLU, VAL, PHE, PRO, LYS, HIS, GLY, ASN, VAL, ILE, PHE, ASP, GLY, PHE, PRO, ARG, THR, VAL, LYS, GLN, ALA, GLU, ALA, LEU, ASP, GLU, MET, LEU, GLU, LYS, LYS, ...]","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"
4,2ym8_1,"[ASP, TRP, ASP, LEU, VAL, GLN, THR, LEU, GLY, GLU, VAL, GLN, LEU, ALA, VAL, ASN, ARG, VAL, THR, GLU, GLU, ALA, VAL, ALA, VAL, LYS, ILE, VAL, ASN, ILE, LYS, LYS, GLU, ILE, CYS, ILE, ASN, LYS, MET, LEU, ASN, HIS, GLU, ASN, VAL, VAL, LYS, PHE, TYR, GLY, HIS, ARG, ARG, GLU, ILE, GLN, TYR, LEU, PHE, LEU, GLU, TYR, CYS, SER, GLY, GLY, GLU, LEU, PHE, ASP, ARG, ILE, GLU, PRO, ASP, ILE, GLY, MET, PRO, GLU, PRO, ASP, ALA, GLN, ARG, PHE, PHE, HIS, GLN, LEU, MET, ALA, GLY, VAL, VAL, TYR, LEU, HIS, GLY, ILE, ...]","[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"


In [10]:
prt_db.to_pickle("prt_db.pkl")
fp_db.to_pickle("fp_db.pkl")

In [3]:
import pandas as pd
import numpy as np
from biopandas.mol2 import PandasMol2
from parsemol2 import parsemol2
bmol = parsemol2().read_mol2('/home/taejoon/jyu/scPDB/'+'4hmx_2'+'/site.mol2')

In [4]:
binding_site = bmol.df[bmol.df.subst_type=='RESIDUE']
binding_num = binding_site.residue_num.values

In [6]:
binding_num


array([ 45,  48,  53,  61,  62,  63,  64,  65,  66,  67,  68,  77,  78,
        79,  80,  83,  84,  85,  86, 129, 142, 143, 100, 105, 106, 107,
       109, 183, 185, 187, 193, 195, 197, 210, 211, 212, 402, 403])