In [1]:
import numpy as np
import pandas as pd
import math
import Bio.PDB
from Bio.PDB import PDBParser
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.Polypeptide import three_to_one
import json
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from tqdm import tqdm
from scipy.stats import entropy

In [2]:
df = pd.read_pickle('Sequence_interface_dataset.pkl')

In [3]:
df

Unnamed: 0,Filename,QS_state,QS_type,Symmetry,Chain_name,Sequence,Interfaces,Area_interface,Volume_interface,Planarity_interface,Symm
0,104L.pdb,dimer,Homomer,"C2,C2",A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...,[TEGYKSPSLNAAMGVAGAKSRQ],1081.171052,1959.211619,71.578643,4.137568
1,10GS.pdb,dimer,Homomer,"C2,C2",A,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,[CLYGDLTLYQSTHQQAALDMVDGGP],1639.802349,3123.466070,83.403667,4.574406
2,117E.pdb,dimer,Homomer,C2,A,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...,[RWFPHHIGETIYFPKSIDKWFFI],1174.448255,1873.311200,34.983343,4.263864
3,11AS.pdb,dimer,Homomer,C2,A,AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...,[AYIQIIEVQAPILSRAVQVKVKALHKLRPDEDYQGVP],2877.227380,9656.993779,211.699849,5.829962
4,11BA.pdb,dimer,Homomer,C2,A,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...,[SAAAKFERQHMDSGNSPSSSYNLMMCCRTFVHESVCGGV],2326.011812,6042.335876,91.126663,6.854454
...,...,...,...,...,...,...,...,...,...,...,...
108949,5CUS.pdb,dodecamer,Heteromer,C2,M,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[QNRRYNSSPPG, STGLSSGLNRY, GEATTGDPVSPPSRNYVSY...",1285.654901,3561.140808,74.262558,3.873111
108950,5CUS.pdb,dodecamer,Heteromer,C2,N,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[GLSSSVGLNRY, GEATTGDPVSPPKSRNYVSYLTTFVQLIPTEG...",1076.897386,2497.628782,51.495621,3.280988
108951,5CUS.pdb,dodecamer,Heteromer,C2,O,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[SGLSSGLNRY, GEATGDPVAPPSSRNYVSYKLTEPFATSQLTAE...",1188.844358,3414.091866,69.186915,3.807491
108952,5D17.pdb,dodecamer,Homomer,D3,A,QDATNYNSIFANRFAAFDELLSILKTKFACRVLFEETLVLPKVGRS...,"[GLGELQVS, TYSIFANDGSP, GRSRLCKDGGVSSL]",375.821023,345.402726,24.310102,2.277767


In [4]:
amino_acid_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
def aa_composition(sequence):
    count_aa = []
    X = ProteinAnalysis(sequence)
    pa_dict = X.count_amino_acids()
    for aa in amino_acid_list:
        count_aa.append(pa_dict[aa])
    return count_aa

In [5]:
def compute_entropy(aa_array):
    total_aa = np.sum(aa_array)
    prob_aa = np.array(aa_array) / total_aa
    shannon_entropy = entropy(prob_aa, base=2)
    return shannon_entropy

In [6]:
dipeptide_list = []

for i in range(len(amino_acid_list)):
    for j in range(len(amino_acid_list)):
        dipeptide_list.append(amino_acid_list[i] + amino_acid_list[j])

In [7]:
def dipeptide_composition(sequence, dipeptide_list):
    count_dipeptide = []
    for x in dipeptide_list:
        count_dipeptide.append(sequence.count(x))
    return count_dipeptide

In [8]:
sequences = list(df['Sequence'])

In [9]:
mask = []
for seq in sequences:
    if '?' not in seq:
        mask.append(True)
    else:
        mask.append(False)

In [10]:
aa_feature = []
aa_entropy = []
aa_dipeptide = []

In [11]:
for seq in tqdm(sequences):
    aa_count = aa_composition(seq)
    aa_feature.append(aa_count)
    aa_entropy.append(compute_entropy(aa_count))
    aa_dipeptide.append(dipeptide_composition(seq, dipeptide_list))

100%|██████████| 108954/108954 [00:23<00:00, 4583.34it/s]


In [12]:
df['aa_composition'] = aa_feature
df['entropy'] = aa_entropy
df['dipeptide_composition'] = aa_dipeptide

df = df[mask].reset_index(drop=True)

In [13]:
df.isnull().sum()

Filename                 0
QS_state                 0
QS_type                  0
Symmetry                 0
Chain_name               0
Sequence                 0
Interfaces               0
Area_interface           0
Volume_interface         0
Planarity_interface      0
Symm                     0
aa_composition           0
entropy                  0
dipeptide_composition    0
dtype: int64

In [14]:
df.to_pickle('df_structand_seqfeature.pkl')

In [15]:
df1=pd.read_pickle('df_structand_seqfeature.pkl')

In [16]:
df1

Unnamed: 0,Filename,QS_state,QS_type,Symmetry,Chain_name,Sequence,Interfaces,Area_interface,Volume_interface,Planarity_interface,Symm,aa_composition,entropy,dipeptide_composition
0,104L.pdb,dimer,Homomer,"C2,C2",A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...,[TEGYKSPSLNAAMGVAGAKSRQ],1081.171052,1959.211619,71.578643,4.137568,"[36, 26, 22, 20, 0, 10, 16, 22, 2, 20, 30, 26,...",4.044585,"[10, 0, 0, 0, 0, 0, 4, 2, 0, 2, 2, 8, 0, 0, 0,..."
1,10GS.pdb,dimer,Homomer,"C2,C2",A,PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASC...,[CLYGDLTLYQSTHQQAALDMVDGGP],1639.802349,3123.466070,83.403667,4.574406,"[30, 16, 16, 26, 8, 26, 20, 36, 4, 14, 64, 24,...",4.052816,"[4, 2, 0, 4, 0, 0, 0, 2, 0, 0, 6, 0, 0, 4, 2, ..."
2,117E.pdb,dimer,Homomer,C2,A,TYTTRQIGAKNTLEYKVYIEKDGKPVSAFHDIPLYADKENNIFNMV...,[RWFPHHIGETIYFPKSIDKWFFI],1174.448255,1873.311200,34.983343,4.263864,"[22, 6, 16, 23, 1, 7, 20, 15, 6, 27, 18, 29, 2...",4.070819,"[1, 0, 0, 2, 0, 0, 0, 1, 0, 2, 3, 3, 0, 3, 2, ..."
3,11AS.pdb,dimer,Homomer,C2,A,AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...,[AYIQIIEVQAPILSRAVQVKVKALHKLRPDEDYQGVP],2877.227380,9656.993779,211.699849,5.829962,"[29, 19, 3, 24, 0, 20, 23, 29, 13, 14, 40, 14,...",4.025078,"[2, 0, 0, 1, 0, 1, 1, 4, 0, 2, 4, 4, 0, 1, 3, ..."
4,11BA.pdb,dimer,Homomer,C2,A,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...,[SAAAKFERQHMDSGNSPSSSYNLMMCCRTFVHESVCGGV],2326.011812,6042.335876,91.126663,6.854454,"[8, 4, 7, 4, 10, 6, 5, 6, 4, 3, 2, 14, 5, 3, 5...",4.051906,"[1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108844,5CUS.pdb,dodecamer,Heteromer,C2,M,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[QNRRYNSSPPG, STGLSSGLNRY, GEATTGDPVSPPSRNYVSY...",1285.654901,3561.140808,74.262558,3.873111,"[17, 6, 6, 7, 4, 9, 8, 17, 2, 5, 15, 11, 0, 4,...",3.951123,"[3, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 2, ..."
108845,5CUS.pdb,dodecamer,Heteromer,C2,N,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[GLSSSVGLNRY, GEATTGDPVSPPKSRNYVSYLTTFVQLIPTEG...",1076.897386,2497.628782,51.495621,3.280988,"[16, 5, 6, 7, 4, 8, 6, 17, 1, 5, 15, 8, 0, 4, ...",3.926629,"[3, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 2, ..."
108846,5CUS.pdb,dodecamer,Heteromer,C2,O,SVLTQPPSASGTPGQRVTISCSGSLSNIGLNYVSWYQQLPGTAPKL...,"[SGLSSGLNRY, GEATGDPVAPPSSRNYVSYKLTEPFATSQLTAE...",1188.844358,3414.091866,69.186915,3.807491,"[18, 5, 6, 7, 4, 10, 8, 16, 0, 5, 15, 10, 0, 4...",3.914308,"[3, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 3, ..."
108847,5D17.pdb,dodecamer,Homomer,D3,A,QDATNYNSIFANRFAAFDELLSILKTKFACRVLFEETLVLPKVGRS...,"[GLGELQVS, TYSIFANDGSP, GRSRLCKDGGVSSL]",375.821023,345.402726,24.310102,2.277767,"[8, 13, 8, 10, 2, 5, 12, 13, 3, 4, 17, 10, 0, ...",3.998582,"[1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ..."
