# Select multi-domain proteins from PISCES

## Read the PISCES chain list

In [1]:
import numpy as np
import pandas as pd
from prody import parsePDB
import os
import random
import time

In [2]:
# Load the PISCES culled chain list: a whitespace-separated table with a header row.
# `sep=r'\s+'` is the supported spelling of the deprecated `delim_whitespace=True`.
pisces_df = pd.read_csv(
    '../../../pisces/20210225/cullpdb_pc20_res2.0_R0.25_d210225_chains7584',
    sep=r'\s+',
)
# Each `IDs` entry is a 4-character PDB ID followed by the chain ID (e.g. '1A1XA').
pisces_df['PDB_ID'] = pisces_df['IDs'].str[:4]
# Take everything after the PDB ID so that a chain ID longer than one character
# is kept whole instead of being cut down to its first character.
pisces_df['Chain'] = pisces_df['IDs'].str[4:]

In [3]:
# Show the parsed PISCES table (with the derived PDB_ID and Chain columns) as the cell output.
pisces_df

Unnamed: 0,IDs,length,Exptl.,resolution,R-factor,FreeRvalue,PDB_ID,Chain
0,1A1XA,108,XRAY,2.00,0.21,0.25,1A1X,A
1,1A62A,130,XRAY,1.55,0.22,0.25,1A62,A
2,1A73A,163,XRAY,1.80,0.21,0.30,1A73,A
3,1A92A,50,XRAY,1.80,0.23,0.28,1A92,A
4,1AE9A,179,XRAY,1.90,0.20,0.23,1AE9,A
...,...,...,...,...,...,...,...,...
7579,7L4AA,229,XRAY,1.50,0.15,0.19,7L4A,A
7580,7L9UA,294,XRAY,1.55,0.16,0.17,7L9U,A
7581,7LDQA,223,XRAY,1.15,0.14,0.16,7LDQ,A
7582,7ODCA,424,XRAY,1.60,0.20,0.23,7ODC,A


## Read the CATH domain list

In [4]:
# Read the CATH domain list. Only the first column (the domain name) is needed,
# so the other columns are not loaded at all. The first 16 lines of the file are
# skipped as header text.
cath_domain_df = pd.read_csv(
    '../../../CATH/20191109/cath-domain-list.txt',
    sep=r'\s+',  # replaces the deprecated delim_whitespace=True
    skiprows=16,
    header=None,
    usecols=[0],
)
cath_domain_df = cath_domain_df.rename({0: 'CATH_Domain'}, axis=1)
# Build the chain key used for the join with PISCES: upper-cased 4-character PDB ID
# followed by the chain character. The chain character keeps its case, because
# upper-casing it would fold chains such as 'a' and 'A' of the same entry into one
# key and inflate the domain count.
# NOTE(review): assumes PISCES `IDs` also keep the chain ID's case - confirm.
cath_domain_df['IDs'] = (
    cath_domain_df['CATH_Domain'].str[:4].str.upper()
    + cath_domain_df['CATH_Domain'].str[4:5]
)
# There is one row per domain, so the group size is the number of domains per chain.
domain_num_df = cath_domain_df.groupby('IDs').size().rename('Domain_num').reset_index()
domain_num_df

Unnamed: 0,IDs,Domain_num
0,101MA,1
1,102LA,1
2,102MA,1
3,103LA,1
4,103MA,1
...,...,...
329469,9XIAA,1
329470,9XIMA,1
329471,9XIMB,1
329472,9XIMC,1


## Merge the PISCES and CATH DataFrames

In [5]:
# Inner join on the chain key: only PISCES chains that also have a CATH domain
# count survive, and each gains a `Domain_num` column.
cdf = pisces_df.merge(domain_num_df, how='inner', on='IDs')
cdf

Unnamed: 0,IDs,length,Exptl.,resolution,R-factor,FreeRvalue,PDB_ID,Chain,Domain_num
0,1A1XA,108,XRAY,2.00,0.21,0.25,1A1X,A,1
1,1A62A,130,XRAY,1.55,0.22,0.25,1A62,A,2
2,1A73A,163,XRAY,1.80,0.21,0.30,1A73,A,1
3,1A92A,50,XRAY,1.80,0.23,0.28,1A92,A,1
4,1AE9A,179,XRAY,1.90,0.20,0.23,1AE9,A,1
...,...,...,...,...,...,...,...,...,...
4924,6RF9A,288,XRAY,1.80,0.17,0.20,6RF9,A,1
4925,6RI6A,498,XRAY,0.93,0.11,0.12,6RI6,A,3
4926,7A3HA,303,XRAY,0.95,0.11,0.13,7A3H,A,1
4927,7ODCA,424,XRAY,1.60,0.20,0.23,7ODC,A,2


## Select multi-domain entries

In [6]:
# Keep the chains whose domain count is greater than one.
multidomain_df = cdf.loc[cdf['Domain_num'] > 1]
multidomain_df

Unnamed: 0,IDs,length,Exptl.,resolution,R-factor,FreeRvalue,PDB_ID,Chain,Domain_num
1,1A62A,130,XRAY,1.55,0.22,0.25,1A62,A,2
7,1AL3A,324,XRAY,1.80,0.18,0.25,1AL3,A,2
11,1ATGA,231,XRAY,1.20,0.16,0.18,1ATG,A,2
15,1B25A,619,XRAY,1.85,0.17,0.22,1B25,A,3
17,1B9WA,95,XRAY,1.80,0.21,0.28,1B9W,A,2
...,...,...,...,...,...,...,...,...,...
4917,6Q4RA,912,XRAY,1.60,0.18,0.21,6Q4R,A,4
4920,6QEJA,378,XRAY,1.62,0.17,0.18,6QEJ,A,2
4925,6RI6A,498,XRAY,0.93,0.11,0.12,6RI6,A,3
4927,7ODCA,424,XRAY,1.60,0.20,0.23,7ODC,A,2


In [7]:
# Show the ten rows with the highest domain counts (ascending sort, last ten rows).
multidomain_df.sort_values(by='Domain_num').tail(10)

Unnamed: 0,IDs,length,Exptl.,resolution,R-factor,FreeRvalue,PDB_ID,Chain,Domain_num
4370,5BNZA,564,XRAY,1.9,0.19,0.23,5BNZ,A,5
1303,2JE8A,848,XRAY,1.7,0.16,0.19,2JE8,A,5
1652,2VFRA,422,XRAY,1.1,0.14,0.17,2VFR,A,5
1949,3B34A,891,XRAY,1.3,0.18,0.19,3B34,A,5
1740,2WYHA,923,XRAY,1.9,0.18,0.2,2WYH,A,6
2881,3QEXA,903,XRAY,1.73,0.18,0.21,3QEX,A,6
2871,3Q23B,1118,XRAY,1.8,0.2,0.23,3Q23,B,6
1483,2PY5A,575,XRAY,1.6,0.17,0.19,2PY5,A,6
3315,4BBYA,658,XRAY,1.9,0.19,0.24,4BBY,A,6
2514,3K59A,786,XRAY,1.92,0.2,0.2,3K59,A,7


In [8]:
# Save the multi-domain subset as CSV in the PISCES data directory.
# The DataFrame index is written as the first column (pandas default).
multidomain_df_outpath = '../../../pisces/20210225/multidomain_cullpdb_pc20_res2.0_R0.25.csv'
multidomain_df.to_csv(path_or_buf=multidomain_df_outpath)

## Get FASTA sequences for the multi-domain entries

In [9]:
from pathlib import Path

from Bio import SeqIO

# Directory that the per-chain FASTA files are meant to go to.
out_fasta_dir = Path('../../../pisces/20210225/multi-domain_fasta/')
# FASTA file holding the PDB SEQRES sequences.
pdb_fasta_path = '../../../PDBseq/pdb_seqres.txt'
# Parse the whole file into memory as a {record id: SeqRecord} dictionary.
seqres_records = SeqIO.parse(pdb_fasta_path, 'fasta')
records_dict = SeqIO.to_dict(seqres_records)

In [10]:
# Look up the SEQRES record of every multi-domain chain and print the IDs that
# are absent from the FASTA dictionary.
for pdb_id, chain in zip(multidomain_df['PDB_ID'], multidomain_df['Chain']):
    # Key format used for the lookup: lower-case PDB ID, underscore, chain ID.
    seq_id = f'{pdb_id.lower()}_{chain}'
    if seq_id not in records_dict:
        print(seq_id)
        continue
    seq = records_dict[seq_id]
    fasta_stem = out_fasta_dir / seq_id.upper()
    out_path = fasta_stem.with_suffix('.fasta')
    # NOTE: writing the per-chain FASTA file is currently disabled.
    # SeqIO.write(seq, out_path, 'fasta')

5xli_A
