In [2]:
import joypy
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter

In [3]:
# Show the kernel's working directory; the relative paths used by the read/write
# cells in this notebook are resolved against it.
pwd

'/Users/in-divye.singh/Documents/Projects/PeptideGAN/antiviral-peptide-predictions-using-gan/notebooks'

In [4]:
# Load the raw sequence table (path is relative to the notebook's working directory).
data = pd.read_csv("../data/raw/avp_sequences_tw_avpdb_avppred.csv")

In [5]:
# Display the loaded table (pandas truncates the middle rows in the rendered output).
data

Unnamed: 0,Sequence
0,FLPLIGRVLSGIL
1,VVCACRRALCLPRERRAGFCRIRGRIHPLCCRR
2,RRCICTTRTCRFPYRRLGTCLFQNRVYTFCC
3,GRFKRFRKKFKKLFKKLSPVIPLLHLG
4,GIGTKILGGVKTALKGALKELASTYAN
...,...
2305,KRWRKRWRKWRWRKRWRK
2306,RTRKRGRKRTRKRGRK
2307,RGGKIAGKIAKIAGKIAKIAGKIA
2308,KDLLFK


In [6]:
# Collect the index *labels* of rows whose sequence contains a space.
# `data.drop` removes rows by label, not by position, so selecting labels through
# a boolean mask keeps this list valid even when the index is no longer a pristine
# 0..n-1 range (for example when this cell is re-run after rows were dropped).
# `regex=False` matches a literal space; `na=False` treats a missing sequence as
# "contains no space" instead of raising.
to_drop = data.index[data.Sequence.str.contains(' ', regex=False, na=False)].tolist()

In [7]:
# Remove the flagged rows; `to_drop` is interpreted as row (index) labels.
data = data.drop(index=to_drop)

In [8]:
def dipeptide_encoding(seq, n):
    """
    Count every contiguous motif of length ``n`` (n-gram) found in ``seq``.

    Despite the name, the motif length is not fixed to two: ``n=1`` gives
    single-symbol counts, ``n=2`` gives dipeptide counts, and so on.
    Reference: https://www.biorxiv.org/content/10.1101/170407v1.full.pdf

    Parameters
    ----------
    seq : iterable of str
        The sequence to scan; it is materialised with ``list(seq)``.
    n : int
        Motif length. Values smaller than 1, or larger than the sequence
        length, produce an empty result.

    Returns
    -------
    dict
        Maps each motif (its symbols joined into one string) to the number of
        times it occurs. Motifs whose first symbol ends with ``','`` are
        omitted.
    """
    residues = list(seq)
    # Zipping n progressively shifted copies of the sequence yields every
    # sliding window of length n; zip stops at the shortest copy.
    shifted_copies = [residues[offset:] for offset in range(n)]
    window_counts = Counter(zip(*shifted_copies))

    encoding = {}
    for window, occurrences in window_counts.items():
        # Skip motifs that start with a comma.
        if window[0][-1] == (','):
            continue
        encoding[''.join(window)] = occurrences
    return encoding

In [9]:
# One dict of single-symbol counts (n=1) per sequence.
seq_vec = [dipeptide_encoding(sequence, 1) for sequence in data.Sequence]
# Rows are sequences, columns are the symbols seen; a symbol absent from a
# sequence comes out as NaN and is replaced by 0.
df = pd.DataFrame(seq_vec).fillna(0)

In [10]:
# Display the per-sequence count table built in the previous cell.
df

Unnamed: 0,F,L,P,I,G,R,V,S,C,A,E,H,T,Y,Q,N,K,M,D,W
0,1.0,4.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,3.0,2.0,2.0,2.0,10.0,2.0,0.0,6.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,2.0,1.0,1.0,1.0,7.0,1.0,0.0,6.0,0.0,0.0,0.0,5.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0
3,4.0,5.0,2.0,1.0,2.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
4,0.0,4.0,0.0,2.0,5.0,0.0,1.0,1.0,0.0,4.0,1.0,0.0,3.0,1.0,0.0,1.0,4.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0
2306,0.0,0.0,0.0,0.0,2.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
2307,0.0,0.0,0.0,6.0,5.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0
2308,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0


In [11]:
# Inspect which symbols became columns and how many distinct ones there are.
df.columns, len(df.columns)

(Index(['F', 'L', 'P', 'I', 'G', 'R', 'V', 'S', 'C', 'A', 'E', 'H', 'T', 'Y',
        'Q', 'N', 'K', 'M', 'D', 'W'],
       dtype='object'),
 20)

In [13]:
import matplotlib as mpl
import matplotlib.pylab as plt

In [14]:
# Start from matplotlib's built-in "jet" colormap.
base_cmap = plt.cm.jet
# Sample every lookup-table entry of jet except the first one (index 0 is
# skipped, not recoloured).
cmaplist = [base_cmap(i) for i in range(1, base_cmap.N)]

# Rebuild a colormap from the trimmed colour list, keeping jet's resolution.
cmap = mpl.colors.LinearSegmentedColormap.from_list('Custom cmap', cmaplist, base_cmap.N)

In [15]:
# Order the columns by their labels so later plots and exports use a stable order.
df = df.sort_index(axis="columns")

In [15]:
# Select matplotlib's default GUI backend for this kernel (the recorded output
# reports MacOSX), so figures open in separate windows.
%matplotlib

Using matplotlib backend: MacOSX


In [16]:
# Joyplot over every column of the count table.
# x_range is derived from the largest per-column 90th percentile of the counts.
fig, axes = joypy.joyplot(
    df,
    column=df.columns.tolist(),
    figsize=(8, 8),
    fade=True,
    colormap=cmap,
    x_range=range(int(df.quantile(0.9).max())),
    grid=True,
    ylabelsize=15,
)

#### Fraction of AA per sequence

In [16]:
# Divide each row by its own total so every row holds per-symbol fractions.
df_fraction = df.divide(df.sum(axis="columns"), axis="index")

In [17]:
# Display the row-normalised table computed in the previous cell.
df_fraction

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.000000,0.000000,0.000000,0.000000,0.076923,0.153846,0.000000,0.153846,0.000000,0.307692,0.0,0.000000,0.076923,0.000000,0.076923,0.076923,0.000000,0.076923,0.000000,0.000000
1,0.090909,0.181818,0.000000,0.030303,0.030303,0.060606,0.030303,0.060606,0.000000,0.090909,0.0,0.000000,0.060606,0.000000,0.303030,0.000000,0.000000,0.060606,0.000000,0.000000
2,0.000000,0.193548,0.000000,0.000000,0.096774,0.032258,0.000000,0.032258,0.000000,0.064516,0.0,0.032258,0.032258,0.032258,0.225806,0.000000,0.161290,0.032258,0.000000,0.064516
3,0.000000,0.000000,0.000000,0.000000,0.148148,0.074074,0.037037,0.037037,0.259259,0.185185,0.0,0.000000,0.074074,0.000000,0.111111,0.037037,0.000000,0.037037,0.000000,0.000000
4,0.148148,0.000000,0.000000,0.037037,0.000000,0.185185,0.000000,0.074074,0.148148,0.148148,0.0,0.037037,0.000000,0.000000,0.000000,0.037037,0.111111,0.037037,0.000000,0.037037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2305,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.277778,0.000000,0.0,0.000000,0.000000,0.000000,0.444444,0.000000,0.000000,0.000000,0.277778,0.000000
2306,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.000000,0.000000,0.250000,0.000000,0.0,0.000000,0.000000,0.000000,0.500000,0.000000,0.125000,0.000000,0.000000,0.000000
2307,0.250000,0.000000,0.000000,0.000000,0.000000,0.208333,0.000000,0.250000,0.250000,0.000000,0.0,0.000000,0.000000,0.000000,0.041667,0.000000,0.000000,0.000000,0.000000,0.000000
2308,0.000000,0.000000,0.166667,0.000000,0.166667,0.000000,0.000000,0.000000,0.333333,0.333333,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [19]:
# Joyplot over every column of the fraction table.
# x_range runs from 0 in steps of 0.01 up to (but not including) the largest
# per-column 90th percentile of the fractions.
fig, axes = joypy.joyplot(
    df_fraction,
    column=df_fraction.columns.tolist(),
    figsize=(8, 8),
    fade=True,
    colormap=cmap,
    x_range=np.arange(0, df_fraction.quantile(0.9).max(), 0.01),
    grid=True,
    ylabelsize=15,
)

In [None]:
# Largest per-column 90th percentile of the fractions: the same expression that
# bounds x_range in the fraction joyplot cell.
df_fraction.quantile(0.90).max()

In [18]:
# Persist the per-sequence fractions; index=False leaves the row index out of the file.
# NOTE(review): the target climbs three directories above the working directory,
# which, going by the recorded `pwd` output, is outside the project tree. Confirm
# this location is intended.
df_fraction.to_csv("../../../aa_propensity_fraction.csv", index=False)

In [20]:
# Persist the per-sequence counts; index=False leaves the row index out of the file.
# NOTE(review): the target climbs three directories above the working directory,
# which, going by the recorded `pwd` output, is outside the project tree. Confirm
# this location is intended.
df.to_csv("../../../aa_propensity.csv", index=False)

In [21]:
import seaborn as sns

In [23]:
# Re-selects the GUI backend; the same magic was already issued before the first
# joyplot cell, and the recorded output again reports MacOSX.
%matplotlib

Using matplotlib backend: MacOSX


In [27]:
# One box per column, with the fractions rescaled to percentages.
ax = sns.boxplot(data=df_fraction * 100)
ax.set_ylabel("Amino Acid %")

Text(0, 0.5, 'Amino Acid %')