# Analyze information on targets selected from PISCES

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Notebook-wide display defaults: three-decimal floats in pandas output,
# 150-dpi figures, and seaborn's dark grid style.
pd.set_option('display.float_format', '{:.3f}'.format)
plt.rcParams.update({"figure.dpi": 150})
sns.set(style='darkgrid')

In [3]:
# Load the target table; the first CSV column is used as the row index.
df_path = '../../../pisces/20210225/multidomain_target_100_cullpdb_pc20_res2.0_R0.25.csv'
target_df = pd.read_csv(filepath_or_buffer=df_path, index_col=0)
target_df

Unnamed: 0,IDs,length,Exptl.,resolution,R-factor,FreeRvalue,PDB_ID,Chain,Domain_num,target,1,2,3
0,1BF2A,750,XRAY,2.000,0.160,0.210,1BF2,A,3,1BF2_A,107.000,323.000,344.000
1,1GU7A,364,XRAY,1.700,0.170,0.190,1GU7,A,2,1GU7_A,22.000,231.000,268.000
2,1HM9B,468,XRAY,1.750,0.180,0.220,1HM9,B,2,1HM9_B,40.000,204.000,274.000
3,1K5NA,276,XRAY,1.090,0.120,0.150,1K5N,A,2,1K5N_A,744.000,5190.000,7188.000
4,1L3KA,196,XRAY,1.100,0.160,0.190,1L3K,A,2,1L3K_A,180.000,410.000,480.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5Y9ZA,199,XRAY,1.090,0.160,0.180,5Y9Z,A,2,5Y9Z_A,149.000,285.000,408.000
96,6D0AA,431,XRAY,1.468,0.170,0.190,6D0A,A,2,6D0A_A,74.000,261.000,489.000
97,6FMEB,506,XRAY,1.510,0.170,0.190,6FME,B,3,6FME_B,33.000,204.000,326.000
98,6G7NA,318,XRAY,1.100,0.140,0.150,6G7N,A,2,6G7N_A,25.000,300.000,413.000


In [4]:
# How many targets there are for each number of domains.
target_df.loc[:, 'Domain_num'].value_counts()

2    82
3    14
4     3
5     1
Name: Domain_num, dtype: int64

## Check whether each target's PDB entry is single-chain

In [14]:
from prody import parsePDB

def getChainNum(pdb_id: str):
    """Return the unique chain identifiers of a PDB entry and their count.

    Every atom returned by ``parsePDB`` is considered; no atom selection is
    applied before the chain identifiers are collected.

    Args:
        pdb_id: Identifier handed unchanged to ``prody.parsePDB``.

    Returns:
        A ``(chain_array, chain_num)`` tuple: the unique chain identifiers as a
        NumPy array (as produced by ``np.unique``) and the size of that array.

    Raises:
        ValueError: If ``parsePDB`` returns ``None`` for ``pdb_id``.
    """
    mol = parsePDB(pdb_id)
    if mol is None:
        # Report the offending ID instead of failing later with an opaque
        # AttributeError on None.
        raise ValueError(f'parsePDB returned no structure for {pdb_id!r}')
    chain_array = np.unique(mol.getChids())
    chain_num = chain_array.size
    return chain_array, chain_num

In [15]:
import time
from pathlib import Path

# Pause (in seconds) after an entry that may have been downloaded, so that
# consecutive requests to the PDB server are spaced out.
DOWNLOAD_DELAY_SEC = 3

chain_num_list = []
chain_array_list = []
for pdb_id in target_df['PDB_ID']:
    # The ProDy log shows downloads being stored in the working directory as
    # '<lowercase id>.pdb.gz' and reused from there. If that file is already
    # present, no request is made, so the delay is skipped.
    already_cached = Path(f'{str(pdb_id).lower()}.pdb.gz').exists()
    chain_array, chain_num = getChainNum(pdb_id)
    chain_array_list.append(chain_array)
    chain_num_list.append(chain_num)
    if not already_cached:
        time.sleep(DOWNLOAD_DELAY_SEC)

@> PDB file is found in working directory (1bf2.pdb.gz).
@> 6143 atoms and 1 coordinate set(s) were parsed in 0.06s.
@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 1gu7 downloaded (1gu7.pdb.gz)
@> PDB download via FTP completed (1 downloaded, 0 failed).
@> 6605 atoms and 1 coordinate set(s) were parsed in 0.07s.
@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 1hm9 downloaded (1hm9.pdb.gz)
@> PDB download via FTP completed (1 downloaded, 0 failed).
@> 7812 atoms and 1 coordinate set(s) were parsed in 0.07s.
@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 1k5n downloaded (1k5n.pdb.gz)
@> PDB download via FTP completed (1 downloaded, 0 failed).
@> 4126 atoms and 1 coordinate set(s) were parsed in 0.05s.
@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 1l3k downloaded (1l3k.pdb.gz)
@> PDB download via FTP completed (1 downloaded, 0 failed).
@> 1525 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> Connecting wwPDB FTP server RCSB PDB (USA).
@> 1lc5 downloaded (1lc5.pdb.gz)
@> 

In [20]:
# One row per target: the PDB ID, how many distinct chain IDs were found,
# and the chain IDs themselves.
chain_df = pd.DataFrame(
    dict(
        PDB_ID=target_df['PDB_ID'],
        chain_num=chain_num_list,
        chain_array=chain_array_list,
    )
)
chain_df

Unnamed: 0,PDB_ID,chain_num,chain_array
0,1BF2,1,[A]
1,1GU7,2,"[A, B]"
2,1HM9,2,"[A, B]"
3,1K5N,3,"[A, B, C]"
4,1L3K,1,[A]
...,...,...,...
95,5Y9Z,2,"[A, B]"
96,6D0A,1,[A]
97,6FME,2,"[A, B]"
98,6G7N,2,"[A, B]"


In [21]:
# Order the entries from fewest to most chains.
chain_df.sort_values(by='chain_num')

Unnamed: 0,PDB_ID,chain_num,chain_array
0,1BF2,1,[A]
65,4JB7,1,[A]
63,4I8H,1,[A]
62,4HVK,1,[A]
61,4H27,1,[A]
...,...,...,...
60,4G8T,4,"[A, B, C, D]"
48,3VQT,4,"[A, B, C, D]"
50,3ZSS,4,"[A, B, C, D]"
59,4G68,5,"[A, B, C, D, E]"


In [22]:
def getChainNum_notHetatm(pdb_id: str):
    """Return the unique chain identifiers of a PDB entry, ignoring HETATM atoms.

    The structure is parsed with ``parsePDB`` and restricted with the ProDy
    selection ``'not hetatm'`` before the chain identifiers are collected.

    Args:
        pdb_id: Identifier handed unchanged to ``prody.parsePDB``.

    Returns:
        A ``(chain_array, chain_num)`` tuple: the unique chain identifiers of
        the selected atoms as a NumPy array and the size of that array. If the
        selection matches no atom, the array is empty and the count is 0.

    Raises:
        ValueError: If ``parsePDB`` returns ``None`` for ``pdb_id``.
    """
    structure = parsePDB(pdb_id)
    if structure is None:
        # Report the offending ID instead of failing later with an opaque
        # AttributeError on None.
        raise ValueError(f'parsePDB returned no structure for {pdb_id!r}')
    mol = structure.select('not hetatm')
    if mol is None:
        # Nothing matched the selection: use an empty slice of the full chain-ID
        # array so the result keeps the same dtype as the normal path.
        chids = structure.getChids()[:0]
    else:
        chids = mol.getChids()
    chain_array = np.unique(chids)
    chain_num = chain_array.size
    return chain_array, chain_num

# Collect, for every target, the chain IDs and chain count with HETATM atoms excluded.
nh_chain_num_list, nh_chain_array_list = [], []
for pdb_id in target_df['PDB_ID']:
    chain_array, chain_num = getChainNum_notHetatm(pdb_id)
    nh_chain_num_list += [chain_num]
    nh_chain_array_list += [chain_array]

@> PDB file is found in working directory (1bf2.pdb.gz).
@> 6143 atoms and 1 coordinate set(s) were parsed in 0.21s.
@> PDB file is found in working directory (1gu7.pdb.gz).
@> 6605 atoms and 1 coordinate set(s) were parsed in 0.07s.
@> PDB file is found in working directory (1hm9.pdb.gz).
@> 7812 atoms and 1 coordinate set(s) were parsed in 0.07s.
@> PDB file is found in working directory (1k5n.pdb.gz).
@> 4126 atoms and 1 coordinate set(s) were parsed in 0.06s.
@> PDB file is found in working directory (1l3k.pdb.gz).
@> 1525 atoms and 1 coordinate set(s) were parsed in 0.02s.
@> PDB file is found in working directory (1lc5.pdb.gz).
@> 3056 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> PDB file is found in working directory (1r6d.pdb.gz).
@> 2926 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> PDB file is found in working directory (1rkx.pdb.gz).
@> 12159 atoms and 1 coordinate set(s) were parsed in 0.11s.
@> PDB file is found in working directory (1sf9.pdb.gz).
@> 112

In [38]:
# Same per-target summary as before, but for the non-HETATM chain counts,
# shown from fewest to most chains.
nh_chain_df = pd.DataFrame(
    dict(
        PDB_ID=target_df['PDB_ID'],
        notHetatm_chain_num=nh_chain_num_list,
        notHetatm_chain_array=nh_chain_array_list,
    )
)
nh_chain_df.sort_values(by='notHetatm_chain_num')

Unnamed: 0,PDB_ID,notHetatm_chain_num,notHetatm_chain_array
0,1BF2,1,[A]
65,4JB7,1,[A]
63,4I8H,1,[A]
62,4HVK,1,[A]
61,4H27,1,[A]
...,...,...,...
40,3PJ0,4,"[A, B, C, D]"
52,4CFS,4,"[A, B, C, D]"
50,3ZSS,4,"[A, B, C, D]"
60,4G8T,4,"[A, B, C, D]"


In [32]:
# Distribution of the number of chains per entry (all atoms).
chain_df.value_counts(subset='chain_num')

chain_num
1     56
2     30
3      6
4      6
5      1
18     1
dtype: int64

In [39]:
# Distribution of the number of chains per entry (HETATM atoms excluded).
nh_chain_df.value_counts(subset='notHetatm_chain_num')

notHetatm_chain_num
1     58
2     30
4      6
3      5
18     1
dtype: int64

In [40]:
# Put the all-atom and non-HETATM chain summaries side by side.
# Both frames are built from the same target_df['PDB_ID'] column, so every ID
# should match exactly one row on each side. validate makes pandas raise a
# MergeError instead of silently multiplying rows if an ID is repeated.
concat_chain_df = pd.merge(chain_df, nh_chain_df, on='PDB_ID', validate='one_to_one')
concat_chain_df

Unnamed: 0,PDB_ID,chain_num,chain_array,notHetatm_chain_num,notHetatm_chain_array
0,1BF2,1,[A],1,[A]
1,1GU7,2,"[A, B]",2,"[A, B]"
2,1HM9,2,"[A, B]",2,"[A, B]"
3,1K5N,3,"[A, B, C]",3,"[A, B, C]"
4,1L3K,1,[A],1,[A]
...,...,...,...,...,...
95,5Y9Z,2,"[A, B]",2,"[A, B]"
96,6D0A,1,[A],1,[A]
97,6FME,2,"[A, B]",2,"[A, B]"
98,6G7N,2,"[A, B]",2,"[A, B]"


In [41]:
# Entries whose chain count changes once HETATM atoms are excluded.
concat_chain_df.loc[
    concat_chain_df['chain_num'].ne(concat_chain_df['notHetatm_chain_num'])
]

Unnamed: 0,PDB_ID,chain_num,chain_array,notHetatm_chain_num,notHetatm_chain_array
23,3A09,2,"[A, B]",1,[A]
29,3IFE,3,"[A, B, C]",1,[A]
51,4A5S,3,"[A, B, C]",2,"[A, B]"
59,4G68,5,"[A, B, C, D, E]",3,"[A, B, C]"


In [44]:
# Attach the chain summaries to the target table.
# The chain tables are derived from target_df['PDB_ID'], so the match should
# be one row to one row. validate makes pandas raise a MergeError rather than
# silently duplicating target rows if a PDB_ID occurs more than once.
target_chain_df = pd.merge(target_df, concat_chain_df, on='PDB_ID', validate='one_to_one')
target_chain_df

Unnamed: 0,IDs,length,Exptl.,resolution,R-factor,FreeRvalue,PDB_ID,Chain,Domain_num,target,1,2,3,chain_num,chain_array,notHetatm_chain_num,notHetatm_chain_array
0,1BF2A,750,XRAY,2.000,0.160,0.210,1BF2,A,3,1BF2_A,107.000,323.000,344.000,1,[A],1,[A]
1,1GU7A,364,XRAY,1.700,0.170,0.190,1GU7,A,2,1GU7_A,22.000,231.000,268.000,2,"[A, B]",2,"[A, B]"
2,1HM9B,468,XRAY,1.750,0.180,0.220,1HM9,B,2,1HM9_B,40.000,204.000,274.000,2,"[A, B]",2,"[A, B]"
3,1K5NA,276,XRAY,1.090,0.120,0.150,1K5N,A,2,1K5N_A,744.000,5190.000,7188.000,3,"[A, B, C]",3,"[A, B, C]"
4,1L3KA,196,XRAY,1.100,0.160,0.190,1L3K,A,2,1L3K_A,180.000,410.000,480.000,1,[A],1,[A]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5Y9ZA,199,XRAY,1.090,0.160,0.180,5Y9Z,A,2,5Y9Z_A,149.000,285.000,408.000,2,"[A, B]",2,"[A, B]"
96,6D0AA,431,XRAY,1.468,0.170,0.190,6D0A,A,2,6D0A_A,74.000,261.000,489.000,1,[A],1,[A]
97,6FMEB,506,XRAY,1.510,0.170,0.190,6FME,B,3,6FME_B,33.000,204.000,326.000,2,"[A, B]",2,"[A, B]"
98,6G7NA,318,XRAY,1.100,0.140,0.150,6G7N,A,2,6G7N_A,25.000,300.000,413.000,2,"[A, B]",2,"[A, B]"


In [45]:
# Save the target table with the chain-count columns next to the input CSV.
# NOTE(review): the chain-array columns hold NumPy arrays, which presumably end
# up in the CSV as their string form — confirm downstream readers expect that.
out_path = '../../../pisces/20210225/multidomain_target_100_cullpdb_pc20_res2.0_R0.25_chain_num.csv'
target_chain_df.to_csv(path_or_buf=out_path)