In [2]:
cd ../../src/

/Users/in-divye.singh/Documents/Projects/MIC_predictor/src


In [3]:
import biovec
import numpy as np
import pandas as pd
from itertools import chain, combinations
from collections import Counter

from utils import *

from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [4]:
# Load the peptide / virus-family table (path is relative to the src/ working
# directory selected by the first cell).
family = pd.read_csv("../data/raw/712pep_family.csv")

In [5]:
# List the distinct values of the 'Family' column.
# NOTE(review): the recorded output contains the literal value 'Family', which
# looks like a header row repeated inside the CSV data — confirm and drop such
# rows if that is the case.
family['Family'].unique()

array(['Arenaviridae', 'Arteriviridae', 'Asfarviridae', 'Bunyaviridae',
       'Coronaviridae', 'Family', 'Filoviridae', 'Flaviviridae',
       'Hepadnaviridae', 'Herpesviridae', 'Orthomyxoviridae',
       'Papillomaviridae', 'Paramyxoviridae', 'Polyomaviridae',
       'Poxviridae', 'Retroviridae'], dtype=object)

In [6]:
# Keep only the rows whose family is Flaviviridae and renumber them from 0.
flaviviridae_seq = family.loc[family['Family'].eq("Flaviviridae")].reset_index(drop=True)

In [7]:
# Show the Flaviviridae subset (pandas renders a truncated preview).
flaviviridae_seq

Unnamed: 0,Sequence,Abbreviation,Family
0,AAQRRGRVGRNPNQVGD,HCV,Flaviviridae
1,RNPSQVGD,WNV,Flaviviridae
2,RVGRNPNQVGD,HCV,Flaviviridae
3,AAQRRGRIGRNPSQVGD,HCV,Flaviviridae
4,RGRRGIYR,HCV,Flaviviridae
...,...,...,...
125,TWLRAIWDWVCTALTDFK,HCV,Flaviviridae
126,SWLRDVWDWVCTVLSDFK,HCV,Flaviviridae
127,GAIVSTALPQWRIYSYAG,HCV,Flaviviridae
128,SWLRDIWDWLCELLSDFK,HCV,Flaviviridae


In [8]:
def get_physicochemical_properties(df):
    """Compute Biopython ``ProteinAnalysis`` descriptors for every sequence.

    Parameters
    ----------
    df : object with a ``Sequence`` attribute
        Iterating ``df.Sequence`` must yield the sequences to analyse (in this
        notebook, a pandas DataFrame with a ``Sequence`` column).

    Returns
    -------
    pandas.DataFrame
        One row per sequence, in input order, with a default integer index and
        these columns: molecular_weight, aromaticity, instability_index,
        isoelectric_point, helix / turn / sheet (items 0, 1, 2 of
        ``secondary_structure_fraction()``), with_reduced_cysteines /
        with_disulfid_bridges (items 0, 1 of ``molar_extinction_coefficient()``),
        gravy, and net_charge_at_pH7point4 (``charge_at_pH(7.4)``).
    """
    column_names = [
        'molecular_weight',
        'aromaticity',
        'instability_index',
        'isoelectric_point',
        'helix',
        'turn',
        'sheet',
        'with_reduced_cysteines',
        'with_disulfid_bridges',
        'gravy',
        'net_charge_at_pH7point4',
    ]

    rows = []
    for peptide in df.Sequence:
        analysis = ProteinAnalysis(peptide)

        # Descriptors are evaluated in the same order as the output columns.
        weight = analysis.molecular_weight()
        aromatic_fraction = analysis.aromaticity()
        instability = analysis.instability_index()
        p_i = analysis.isoelectric_point()
        helix, turn, sheet = analysis.secondary_structure_fraction()
        reduced_cys, disulfide = analysis.molar_extinction_coefficient()
        hydropathy = analysis.gravy()  # grand average of hydropathy
        charge = analysis.charge_at_pH(7.4)

        rows.append([
            weight, aromatic_fraction, instability, p_i,
            helix, turn, sheet,
            reduced_cys, disulfide,
            hydropathy, charge,
        ])

    return pd.DataFrame(rows, columns=column_names)

In [9]:
# Descriptor table for the Flaviviridae peptides (one row per sequence, same order).
physicochemical_prop = get_physicochemical_properties(flaviviridae_seq)

In [11]:
# Place the peptides and their descriptors side by side (aligned on the index).
# The CSV export is left disabled:
# .to_csv("../../../flaviviridae_props.csv", index=False)
pd.concat([flaviviridae_seq, physicochemical_prop], axis="columns")

Unnamed: 0,Sequence,Abbreviation,Family,molecular_weight,aromaticity,instability_index,isoelectric_point,helix,turn,sheet,with_reduced_cysteines,with_disulfid_bridges,gravy,net_charge_at_pH7point4
0,AAQRRGRVGRNPNQVGD,HCV,Flaviviridae,1850.9960,0.000000,24.817647,11.999968,0.117647,0.352941,0.117647,0,0,-1.547059,2.609418
1,RNPSQVGD,WNV,Flaviviridae,871.8950,0.000000,48.687500,6.087706,0.125000,0.500000,0.000000,0,0,-1.700000,-0.440856
2,RVGRNPNQVGD,HCV,Flaviviridae,1211.2883,0.000000,-11.454545,9.598972,0.181818,0.454545,0.000000,0,0,-1.545455,0.559118
3,AAQRRGRIGRNPSQVGD,HCV,Flaviviridae,1837.9973,0.000000,56.905882,11.999968,0.117647,0.352941,0.117647,0,0,-1.370588,2.609418
4,RGRRGIYR,HCV,Flaviviridae,1033.1915,0.125000,27.375000,11.999968,0.250000,0.250000,0.000000,1490,1490,-1.950000,3.554847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,TWLRAIWDWVCTALTDFK,HCV,Flaviviridae,2225.5658,0.222222,19.505556,5.623957,0.444444,0.000000,0.222222,16500,16500,0.277778,-0.817749
126,SWLRDVWDWVCTVLSDFK,HCV,Flaviviridae,2255.5486,0.222222,27.855556,4.427610,0.500000,0.111111,0.111111,16500,16500,0.088889,-1.772466
127,GAIVSTALPQWRIYSYAG,HCV,Flaviviridae,1953.2003,0.166667,34.566667,8.589719,0.388889,0.277778,0.222222,8480,8480,0.344444,0.552417
128,SWLRDIWDWLCELLSDFK,HCV,Flaviviridae,2325.6385,0.222222,21.316667,4.225377,0.500000,0.111111,0.277778,16500,16500,-0.094444,-2.771346


In [56]:
# Load the generated sequences saved by PeptideGAN. header=None makes pandas
# treat the first line as data, leaving a single unnamed column (0).
epoch_50 = pd.read_csv("../../PeptideGAN/gan_for_avps/save/generated_sequences_50.txt", header=None)

In [57]:
# Name the single column (the file was read with header=None) so downstream
# code can access `.Sequence`, then preview the frame.
epoch_50.columns = ['Sequence']
epoch_50

Unnamed: 0,Sequence
0,SWLRDIWDWICEVLSDFK
1,GELGRLVYLLDGPGYDPIWL
2,SWLRDIWDWICEVLSDFK
3,SWLRDAYLVLGRIIGAL
4,SWLRDIWDWLCEVLSDFK
...,...
9979,RNPYVGRLVVFTRLWRPWTWPHLFSGMDAMFGAKQINQKKEWWVRL...
9980,SWLRDIWDWICEVLSDFK
9981,IFTSGKALHLFSKNLGKLHQIVFRRRTL
9982,SWLRDIWDWICEEVLSDFKM


In [58]:
# Descriptor table for the sequences loaded into `epoch_50`.
epoch_50_prop = get_physicochemical_properties(epoch_50)

In [59]:
# Preview the descriptors computed for `epoch_50`.
epoch_50_prop

Unnamed: 0,molecular_weight,aromaticity,instability_index,isoelectric_point,helix,turn,sheet,with_reduced_cysteines,with_disulfid_bridges,gravy,net_charge_at_pH7point4
0,2311.6119,0.222222,14.461111,4.225377,0.500000,0.111111,0.166667,16500,16500,-3.333333e-02,-2.771346
1,2246.5583,0.150000,15.725000,4.050028,0.500000,0.300000,0.300000,8480,8480,2.200000e-01,-2.445570
2,2311.6119,0.222222,14.461111,4.225377,0.500000,0.111111,0.166667,16500,16500,-3.333333e-02,-2.771346
3,1916.2695,0.117647,23.005882,8.460847,0.529412,0.176471,0.352941,6990,6990,9.235294e-01,0.251119
4,2311.6119,0.222222,21.316667,4.225377,0.500000,0.111111,0.222222,16500,16500,-7.222222e-02,-2.771346
...,...,...,...,...,...,...,...,...,...,...,...
9979,7180.4155,0.206897,13.574138,11.036939,0.396552,0.206897,0.206897,30480,30480,-4.758621e-01,7.579077
9980,2311.6119,0.222222,14.461111,4.225377,0.500000,0.111111,0.166667,16500,16500,-3.333333e-02,-2.771346
9981,3281.8982,0.107143,38.428571,11.999968,0.392857,0.178571,0.214286,0,0,-6.344132e-17,5.623114
9982,2571.9219,0.200000,46.615000,4.104708,0.450000,0.100000,0.250000,16500,16500,-1.100000e-01,-3.770225


In [60]:
# Place the `epoch_50` sequences and their descriptors side by side.
# The CSV export is left disabled:
# .to_csv("../../../epoch_50_props.csv", index=False)
pd.concat([epoch_50, epoch_50_prop], axis="columns")

In [37]:
# Load a second set of generated sequences (the file name suggests LeakGAN output).
# NOTE(review): unlike the `epoch_50` load, no header=None is passed, so the
# file's first line becomes the column header. Later cells rely on a 'Sequence'
# column — confirm the file really starts with that header line.
epoch_100 = pd.read_csv("../../PeptideGAN/gan_for_avps/data/generated_sequences_leakgan_save_at_epoch.txt")

In [42]:
# Preview the sequences loaded into `epoch_100`.
epoch_100

Unnamed: 0,Sequence
0,KFDSLVECLFSPGKMVHQVFGSANQDWH
1,HCSLAYGDATHQYHCSQGGVWNSLVLYALGWVYMLE
2,SWLRDIWDWGCEVLSDFK
3,SWLRDIWDWICEVLSDTPGKDMV
4,SWLRDIWDWICEVLSDFKTWDWDWICKVLSRFK
...,...
9979,AWDFGSVGVGDGYYMLVESLE
9980,SWLRDIWDWVCEVLSDFK
9981,SWLRDIWDNPSVDMAALVVGWSYH
9982,ACFPWGNQ


In [43]:
# Descriptor table for the sequences loaded into `epoch_100`.
epoch_100_prop = get_physicochemical_properties(epoch_100)

In [44]:
# Preview the descriptors computed for `epoch_100`.
epoch_100_prop

Unnamed: 0,molecular_weight,aromaticity,instability_index,isoelectric_point,helix,turn,sheet,with_reduced_cysteines,with_disulfid_bridges,gravy,net_charge_at_pH7point4
0,3207.5943,0.142857,77.753571,5.998696,0.321429,0.250000,0.178571,5500,5500,-0.217857,-1.396795
1,4073.5457,0.166667,38.661111,5.763781,0.388889,0.222222,0.277778,16960,17085,0.141667,-2.388037
2,2255.5056,0.222222,8.700000,4.225377,0.444444,0.166667,0.166667,16500,16500,-0.305556,-2.771346
3,2765.1229,0.130435,13.078261,4.050028,0.391304,0.173913,0.173913,16500,16500,-0.152174,-3.770899
4,4276.8908,0.242424,6.587879,4.854186,0.484848,0.090909,0.121212,33000,33125,-0.190909,-1.799992
...,...,...,...,...,...,...,...,...,...,...,...
9979,2294.4901,0.190476,29.776190,4.050028,0.428571,0.285714,0.285714,8480,8480,0.285714,-4.391982
9980,2297.5853,0.222222,9.744444,4.225377,0.500000,0.111111,0.166667,16500,16500,-0.050000,-2.771346
9981,2818.1240,0.166667,35.200000,4.412093,0.416667,0.250000,0.208333,17990,17990,0.004167,-2.711337
9982,922.0181,0.250000,51.150000,5.561548,0.250000,0.375000,0.125000,5500,5500,-0.350000,-0.416701


In [46]:
# Place the `epoch_100` sequences and their descriptors side by side.
# The CSV export is left disabled:
# .to_csv("../../../epoch_100_props.csv", index=False)
pd.concat([epoch_100, epoch_100_prop], axis="columns")

### Di/Tri/Tetra-peptide fraction

In [12]:
def dipeptide_present(seq, n):
    """Map every length-``n`` window of ``seq`` to a presence flag.

    Despite the name, this works for any window size ``n`` (k-mers in general).

    Parameters
    ----------
    seq : str
        Sequence to scan.
    n : int
        Window (k-mer) length.

    Returns
    -------
    dict
        ``{kmer: int(kmer in seq)}`` for each consecutive window, keyed in
        order of first appearance. Empty when ``seq`` is shorter than ``n``
        or when the first element of ``seq`` ends with a comma.
    """
    residues = list(seq)
    # n shifted copies zipped together yield the sliding windows.
    windows = zip(*(residues[offset:] for offset in range(n)))

    presence = {}
    for window in windows:
        # Checked per window, so it is never evaluated when there are no windows.
        if residues[0][-1] == ',':
            continue
        kmer = ''.join(window)
        presence[kmer] = int(kmer in seq)
    return presence

In [13]:
def reduce_by_kmer_presence(data, kmer=1):
    """Build a sequence-by-k-mer presence table with a trailing 'Fraction' row.

    Parameters
    ----------
    data : object with a ``Sequence`` attribute
        Iterating ``data.Sequence`` must yield the sequences (in this notebook,
        a pandas DataFrame with a ``Sequence`` column).
    kmer : int, default 1
        Window length passed to ``dipeptide_present``.

    Returns
    -------
    pandas.DataFrame
        One row per sequence and one column per k-mer seen in any sequence
        (missing entries filled with 0), plus a final row labelled 'Fraction'
        holding each column's sum divided by the number of sequences.

    Side effects
    ------------
    Prints the largest value in the 'Fraction' row.
    """
    per_sequence = [dipeptide_present(sequence, kmer) for sequence in data.Sequence]
    presence = pd.DataFrame(per_sequence).fillna(0)

    # Capture the row count before the summary row is appended.
    n_sequences = presence.shape[0]
    presence.loc['Fraction'] = presence.sum(axis=0) / n_sequences

    print(f"Max fraction value: {presence.loc['Fraction'].max()}")
    return presence

In [14]:
def drop_column_less_than_threshold(df, threshold=0):
    """Return ``df`` without the columns whose 'Fraction' value is below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a row labelled 'Fraction'.
    threshold : number, default 0
        Columns with a 'Fraction' value strictly less than this are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    fractions = df.loc['Fraction']
    below_cutoff = fractions.loc[fractions < threshold].index
    return df.drop(columns=below_cutoff)

In [38]:
# Presence table of 4-mers over the `epoch_100` sequences; the call also prints
# the largest per-k-mer fraction.
dd = reduce_by_kmer_presence(epoch_100, kmer=4)

Max fraction value: 0.43669871794871795


In [39]:
# Preview the 4-mer presence table (one column per 4-mer).
dd

Unnamed: 0,KFDS,FDSL,DSLV,SLVE,LVEC,VECL,ECLF,CLFS,LFSP,FSPG,...,MLVE,IWDN,WDNP,DNPS,VDMA,MAAL,AALV,VVGW,GWSY,WSYH
0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9980,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
9981,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000
9982,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
9983,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


In [41]:
# Per-4-mer fractions (the 'Fraction' summary row of `dd`) as a NumPy array.
aaa = dd.loc['Fraction'].to_numpy()

In [42]:
# Largest fraction over all 4-mers.
aaa.max()

0.43669871794871795

In [43]:
# 99th percentile of the per-4-mer fractions.
np.percentile(aaa, 99)

0.006310096153846154

In [47]:
# Keep only the 4-mers whose fraction is at least the 99.9th percentile of all
# fractions and show them as {4-mer: fraction}. Note the cutoff here is the
# 99.9th percentile, not the 99th percentile inspected in the previous cell.
drop_column_less_than_threshold(dd, threshold=np.percentile(aaa, 99.9)).loc['Fraction'].to_dict()

{'SWLR': 0.4365985576923077,
 'WLRD': 0.43669871794871795,
 'LRDI': 0.4085536858974359,
 'RDIW': 0.4088541666666667,
 'DIWD': 0.40845352564102566,
 'IWDW': 0.3926282051282051,
 'WDWG': 0.10266426282051282,
 'DWGC': 0.07732371794871795,
 'WGCE': 0.06530448717948718,
 'GCEV': 0.06540464743589744,
 'CEVL': 0.28926282051282054,
 'EVLS': 0.31069711538461536,
 'VLSD': 0.324619391025641,
 'LSDF': 0.3252203525641026,
 'SDFK': 0.3278245192307692,
 'WDWI': 0.2189503205128205,
 'DWIC': 0.19310897435897437,
 'WICE': 0.18068910256410256,
 'ICEV': 0.15795272435897437,
 'RRGR': 0.06830929487179487,
 'RKLR': 0.054286858974358976,
 'VRLG': 0.05669070512820513,
 'RLGR': 0.052884615384615384,
 'LGRY': 0.05128205128205128,
 'GRYL': 0.05278445512820513,
 'RYLL': 0.051682692307692304,
 'ASHL': 0.05178285256410257,
 'SHLR': 0.052383814102564104,
 'HLRK': 0.056991185897435896,
 'LRKL': 0.05538862179487179,
 'LRKR': 0.05148237179487179,
 'RKRL': 0.05228365384615385}

In [47]:
# k-mer table with kmer=1; the recorded output shows one row per sequence and
# one column per amino acid.
# NOTE(review): neither `reduce_by_kmer_frequency` nor `df_flaviviridae` is
# defined in the visible cells — presumably they come from `from utils import *`
# or from an earlier/deleted cell. Confirm this cell survives Restart & Run All
# (the Flaviviridae frame built above is named `flaviviridae_seq`).
aa_freq_flaviviridae = reduce_by_kmer_frequency(df_flaviviridae, kmer=1)

In [50]:
# Preview the kmer=1 table.
aa_freq_flaviviridae

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.117647,0.000000,0.058824,0.000000,0.000000,0.176471,0.000000,0.000000,0.000000,0.000000,0.000000,0.117647,0.058824,0.117647,0.235294,0.000000,0.000000,0.117647,0.000000,0.000000
1,0.000000,0.000000,0.125000,0.000000,0.000000,0.125000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.125000,0.125000,0.125000,0.125000,0.000000,0.125000,0.000000,0.000000
2,0.000000,0.000000,0.090909,0.000000,0.000000,0.181818,0.000000,0.000000,0.000000,0.000000,0.000000,0.181818,0.090909,0.090909,0.181818,0.000000,0.000000,0.181818,0.000000,0.000000
3,0.117647,0.000000,0.058824,0.000000,0.000000,0.176471,0.000000,0.058824,0.000000,0.000000,0.000000,0.058824,0.058824,0.117647,0.235294,0.058824,0.000000,0.058824,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.250000,0.000000,0.125000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,0.111111,0.055556,0.111111,0.000000,0.055556,0.000000,0.000000,0.055556,0.055556,0.111111,0.000000,0.000000,0.000000,0.000000,0.055556,0.000000,0.166667,0.055556,0.166667,0.000000
126,0.000000,0.055556,0.166667,0.000000,0.055556,0.000000,0.000000,0.000000,0.055556,0.111111,0.000000,0.000000,0.000000,0.000000,0.055556,0.111111,0.055556,0.166667,0.166667,0.000000
127,0.166667,0.000000,0.000000,0.000000,0.000000,0.111111,0.000000,0.111111,0.000000,0.055556,0.000000,0.000000,0.055556,0.055556,0.055556,0.111111,0.055556,0.055556,0.055556,0.111111
128,0.000000,0.055556,0.166667,0.055556,0.055556,0.000000,0.000000,0.055556,0.055556,0.222222,0.000000,0.000000,0.000000,0.000000,0.055556,0.111111,0.000000,0.000000,0.166667,0.000000
