In [1]:
import numpy as np
import pandas as pd
import math
import Bio.PDB
from Bio.PDB import PDBParser
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.Polypeptide import three_to_one
import json
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from tqdm import tqdm
from scipy.stats import entropy

In [2]:
df = pd.read_pickle('df_all_final.pkl')

In [3]:
df

Unnamed: 0,Sequence,Symmetry,Qs_state,Qs_type
0,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,C6,monomer,homomer
1,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...,C2,monomer,homomer
2,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,C6,monomer,homomer
3,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTSLDAAKSELDKA...,C2,monomer,homomer
4,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...,C6,monomer,homomer
...,...,...,...,...
111540,EVQLLESGGGLVQPGGSLRLSCAASGFTFSYYYMQWVRQAPGKGLE...,C2,dodecamer,hetromer
111541,AVCPGTLNRCEVVMGNLEIVLTGHNADLSFLQWIREVTGYVLVAMN...,C2,dodecamer,hetromer
111542,EVQLLESGGGLVQPGGSLRLSCAASGFTFSYYYMQWVRQAPGKGLE...,C2,dodecamer,hetromer
111543,EVQLLESGGGLVQPGGSLRLSCAASGFTFSYYYMQWVRQAPGKGLE...,C2,dodecamer,hetromer


In [4]:
df['Qs_state'].value_counts()

Qs_state
monomer      32420
dimer        30619
tetramer     19311
trimer       10428
hexamer       7969
octamer       5146
dodecamer     2399
pentamer      1923
decamer       1330
Name: count, dtype: int64

In [58]:
df = df.dropna().reset_index(drop=True)

## Amino Acid Composition

In [59]:
amino_acid_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

In [60]:
def aa_composition(sequence):
    count_aa = []
    X = ProteinAnalysis(sequence)
    pa_dict = X.count_amino_acids()
    for aa in amino_acid_list:
        count_aa.append(pa_dict[aa])
    return count_aa

In [61]:
result = aa_composition('MQNKOUIPQRSTVWXYZACDEFGHL')

## Shannon Entropy 

In [62]:
def compute_entropy(aa_array):
    total_aa = np.sum(aa_array)
    prob_aa = np.array(aa_array) / total_aa
    shannon_entropy = entropy(prob_aa, base=2)
    return shannon_entropy

In [63]:
compute_entropy(result)

4.297079327540664

## Dipeptide Composition

In [64]:
dipeptide_list = []

for i in range(len(amino_acid_list)):
    for j in range(len(amino_acid_list)):
        dipeptide_list.append(amino_acid_list[i] + amino_acid_list[j])

In [65]:
def dipeptide_composition(sequence, dipeptide_list):
    count_dipeptide = []
    for x in dipeptide_list:
        count_dipeptide.append(sequence.count(x))
    return count_dipeptide

In [66]:
#dipeptide_composition('AARAMQNKOUIPQRSTVWXYZACDEFGHL', dipeptide_list)

## Feature Computation For Dataset

In [67]:
sequences = list(df['Sequence'])

In [68]:
mask = []
for seq in sequences:
    if '?' not in seq:
        mask.append(True)
    else:
        mask.append(False)

In [69]:
aa_feature = []
aa_entropy = []
aa_dipeptide = []

In [70]:
for seq in tqdm(sequences):
    aa_count = aa_composition(seq)
    aa_feature.append(aa_count)
    aa_entropy.append(compute_entropy(aa_count))
    aa_dipeptide.append(dipeptide_composition(seq, dipeptide_list))

  prob_aa = np.array(aa_array) / total_aa
100%|██████████| 110597/110597 [00:25<00:00, 4327.01it/s]


In [71]:
print(max(aa_entropy), min(aa_entropy))

4.2674066018716434 0.0


In [72]:
df['aa_composition'] = aa_feature
df['entropy'] = aa_entropy
df['dipeptide_composition'] = aa_dipeptide

df = df[mask].reset_index(drop=True)

In [73]:
df.isnull().sum()

Sequence                 0
Symmetry                 0
Qs_state                 0
Qs_type                  0
aa_composition           0
entropy                  0
dipeptide_composition    0
dtype: int64

In [74]:
df.to_pickle('df_all_seqfeature.pkl')