# Assignment 1: Dataset Description and Exploration

(c) 2021 Tom Röschinger and Linlin Chen. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

***

In [1]:
from tdc.multi_pred import PPI
import pandas as pd
import numpy as np

import tensorflow as tf
import sklearn

import iqplot

import panel as pn

import bokeh.io
import bokeh.plotting

# Initialise Panel and route Bokeh figures to inline notebook output, so that
# every `bokeh.io.show(...)` call below renders inside the notebook.
pn.extension()
bokeh.io.output_notebook()

### Dataset Description

Source: https://tdcommons.ai/multi_pred_tasks/ppi/

Literature:



Goal: Binary classification. Given the amino acid sequences of two proteins, predict whether the two proteins interact (`Y = 1`) or not (`Y = 0`).

Let's have a look at the data set!

In [24]:
# Load the HuRI protein-protein interaction dataset through TDC.
data = PPI(name = 'HuRI')
# Add sampled negative (non-interacting) pairs to the positive pairs.
# NOTE(review): frac=1 presumably means "as many negatives as positives", and
# the sampling is presumably random -- confirm, and fix a seed if it is.
data = data.neg_sample(frac=1)
# Use TDC's default split; only the training portion is explored below.
split = data.get_split()
df = split['train']
# NOTE(review): the label column `Y` looks like a binary interaction label
# (the preview below shows 1); it is not a "log IC50" value, so the stale
# rename that used to be commented out here was dropped.
df.head()

Found local copy...
Loading...
Done!


Unnamed: 0,Protein1_ID,Protein1,Protein2_ID,Protein2,Y
0,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000061656,MRRSSRPGSASSSRKHTPNFFSENSSMSITSEDSKGLRSAEPGPGE...,1
1,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000104765,MSSHLVEPPPPLHNNNNNCEENEQSLPPPAGLNSSWVELPMNSSNG...,1
2,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000105383,MPLLLLLPLLWAGALAMDPNFWLQVQESVTVQEGLCVLVPCTFFHP...,1
3,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000124103,MWTLKSSLVLLLCLTCSYAFMFSSLRQKTSEPQGKVQYGEHFRIRQ...,1
4,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000139637,MGHQFLRGLLTLLLPPPPLYTRHRMLGPESVPPPKRSRSKLMAPPR...,1


### Number of amino acids per sequence

In [70]:
# Length (in characters) of every distinct sequence in the `Protein1` column.
# NOTE(review): any '*' characters in a sequence are counted as well.
unique_protein1_seqs = df['Protein1'].unique()
protein_lengths = np.array(list(map(len, unique_protein1_seqs)))

length_ecdf = iqplot.ecdf(
    data=protein_lengths,
    title="Number of Amino Acids per Sequence",
)
bokeh.io.show(length_ecdf)

### Number of Proteins per sequence

In [72]:
# Number of '*' characters in every distinct sequence of the `Protein1` column.
# NOTE(review): '*' presumably separates several protein products of one gene;
# confirm whether the number of proteins is this count or this count + 1.
star_counts = [sequence.count("*") for sequence in df['Protein1'].unique()]
protein_numbers = np.array(star_counts)

number_ecdf = iqplot.ecdf(
    data=protein_numbers,
    title='Number of Proteins per Sequence',
)
bokeh.io.show(number_ecdf)

In [73]:
# Relate the number of '*' characters in a sequence to its total length.
# Axis labels are set so the figure can be read without the surrounding code.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label="Number of '*' characters per sequence",
    y_axis_label="Sequence length (characters)",
)

p.scatter(protein_numbers, protein_lengths)

bokeh.io.show(p)

### One-hot encoding of Amino Acid Sequences

In [25]:
# Row index used for every sequence character in the one-hot encoding.
# The position of a character in this string is its index: '*' -> 0,
# 'A' -> 1, ..., 'Y' -> 20, 'X' -> 21, 'U' -> 22.
_AA_ORDER = '*ACDEFGHIKLMNPQRSTVWYXU'
AA_dict = {residue: index for index, residue in enumerate(_AA_ORDER)}

In [26]:
def one_hot_protein(seq_list, alphabet=None, dtype=np.float64):
    """One-hot encode sequences, zero-padded on the right to a common length.

    Parameters
    ----------
    seq_list : iterable of str
        Sequences to encode. All returned matrices share the width of the
        longest sequence in this iterable.
    alphabet : dict, optional
        Maps each sequence character to its row index. Defaults to the
        notebook-level ``AA_dict``.
    dtype : numpy dtype, optional
        Element type of the returned matrices. The default (``np.float64``)
        keeps the original output; a one-byte type such as ``np.uint8`` needs
        an eighth of the memory, which matters for long sequences.

    Returns
    -------
    list of numpy.ndarray
        One ``(n_rows, L)`` matrix per sequence, where ``n_rows`` is the
        largest alphabet index plus one and ``L`` is the longest sequence
        length. Column ``i`` has a single 1 in the row of character ``i``;
        padding columns are all zero. An empty input gives an empty list.

    Raises
    ------
    KeyError
        If a sequence contains a character that is not in the alphabet.
    """
    if alphabet is None:
        alphabet = AA_dict

    sequences = list(seq_list)
    if not sequences:
        # np.max/max of an empty collection would raise; nothing to encode.
        return []

    n_rows = max(alphabet.values(), default=-1) + 1
    max_len = max(len(seq) for seq in sequences)

    matrix_list = []
    for seq in sequences:
        try:
            rows = [alphabet[char] for char in seq]
        except KeyError as err:
            raise KeyError(
                f"character {err.args[0]!r} is not in the encoding alphabet"
            ) from None

        matrix = np.zeros((n_rows, max_len), dtype=dtype)
        if rows:
            # Set all (row, column) pairs of this sequence in one indexed write.
            matrix[rows, list(range(len(rows)))] = 1
        matrix_list.append(matrix)
    return matrix_list

# Store one encoded matrix per row for both proteins of every pair.
# Each column is padded to the longest sequence within that same column.
_one_hot_sources = {
    'Protein1_one_hot': 'Protein1',
    'Protein2_one_hot': 'Protein2',
}
for _target_col, _source_col in _one_hot_sources.items():
    df[_target_col] = one_hot_protein(df[_source_col])

In [27]:
# Preview the frame with the two added one-hot columns.
df.head()

Unnamed: 0,Protein1_ID,Protein1,Protein2_ID,Protein2,Y,Protein1_one_hot,Protein2_one_hot
0,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000061656,MRRSSRPGSASSSRKHTPNFFSENSSMSITSEDSKGLRSAEPGPGE...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000104765,MSSHLVEPPPPLHNNNNNCEENEQSLPPPAGLNSSWVELPMNSSNG...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000105383,MPLLLLLPLLWAGALAMDPNFWLQVQESVTVQEGLCVLVPCTFFHP...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000124103,MWTLKSSLVLLLCLTCSYAFMFSSLRQKTSEPQGKVQYGEHFRIRQ...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000139637,MGHQFLRGLLTLLLPPPPLYTRHRMLGPESVPPPKRSRSKLMAPPR...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


## EDA

First, let's compute, for each protein, how many interactions were tested and what fraction of them are positive.

In [35]:
# Per `Protein1_ID`: `mean` is the fraction of pairs labelled positive
# (mean of `Y`) and `count` is the number of pairs in which the protein appears
# in the first position.
# Only `Y` is aggregated: the frame also holds string and array columns, and
# taking their mean fails on recent pandas versions. Selecting the column also
# keeps the `Protein1_ID` column name, which dropping a column level erased.
tested_interactions = (
    df.groupby('Protein1_ID')['Y']
    .agg(['mean', 'count'])
    .reset_index()
)
tested_interactions.head()

Unnamed: 0,Unnamed: 1,mean,count
0,ENSG00000000005,0.8125,16
1,ENSG00000000419,0.166667,6
2,ENSG00000000457,0.5,8
3,ENSG00000000460,0.333333,3
4,ENSG00000001036,0.6,15


In [37]:
# `mean` is the per-protein mean of the label `Y`, i.e. the fraction of tested
# pairs that are positive -- not a number of interactions.
bokeh.io.show(
    iqplot.ecdf(
        data=tested_interactions,
        q="mean",
        x_axis_label="Fraction of positive interactions",
        title="Fraction of positive interactions per Protein",
    )
)

In [38]:
# `count` is the number of pairs per protein; the previous title was copied
# from the ECDF of `mean` and described the wrong quantity.
bokeh.io.show(
    iqplot.ecdf(
        data=tested_interactions,
        q="count",
        x_axis_label="Tested interactions",
        title="Number of tested interactions per Protein",
    )
)

In [69]:
# Compare how often a protein was tested with how often the pair was positive.
# Axis labels are set so the figure can be read without the surrounding code.
p = bokeh.plotting.figure(
    frame_width=400,
    frame_height=300,
    x_axis_label="Number of tested interactions",
    y_axis_label="Fraction of positive interactions",
)

p.scatter(tested_interactions['count'], tested_interactions['mean'])

bokeh.io.show(p)

Now let's compute how the number of proteins per gene relates to the number of interactions.

In [41]:
def count_stars(seqs1, seqs2):
    """Multiply the '*' counts of positionally paired sequences.

    Pairs are formed by position, not by label, so a pandas Series with a
    non-default index (for example after filtering or shuffling) is handled
    the same way as a plain list.

    Parameters
    ----------
    seqs1, seqs2 : iterable of str
        Sequences to pair up. ``seqs2`` must provide at least as many items as
        ``seqs1``; surplus items in ``seqs2`` are ignored.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per item of ``seqs1``:
        ``seqs1[i].count("*") * seqs2[i].count("*")``.

    Raises
    ------
    IndexError
        If ``seqs2`` has fewer items than ``seqs1``.
    """
    first = list(seqs1)
    second = list(seqs2)
    if len(second) < len(first):
        raise IndexError(
            f"seqs2 has {len(second)} items but seqs1 has {len(first)}"
        )

    # zip stops at the shorter list, which is `first` after the check above.
    return np.array(
        [a.count("*") * b.count("*") for a, b in zip(first, second)],
        dtype=float,
    )

# Product of the '*' counts of both sequences for every pair.
df['possible_interactions'] = count_stars(df['Protein1'], df['Protein2'])

# Mean label per value of `possible_interactions`. Only `Y` is averaged: the
# frame also holds string and array columns, and taking their mean fails on
# recent pandas versions.
total_interactions = (
    df.groupby('possible_interactions')['Y']
    .mean()
    .reset_index()
)
total_interactions.head()

Unnamed: 0,possible_interactions,Y
0,0.0,0.0
1,1.0,0.601385
2,2.0,0.534085
3,3.0,0.511369
4,4.0,0.507717


In [43]:
# Mean label as a function of the product of '*' counts of both sequences.
# Axis labels are set so the figure can be read without the surrounding code.
p = bokeh.plotting.figure(
    x_axis_label="Possible interactions ('*' count product)",
    y_axis_label="Mean of Y (fraction of positive pairs)",
)

p.scatter(total_interactions['possible_interactions'], total_interactions['Y'])

bokeh.io.show(p)

### Tensorflow

In [44]:
# Work on the first 1000 pairs only, to keep the arrays manageable.
_first_rows = slice(0, 1000)

classes = df['Y'].values[_first_rows]
p1 = df['Protein1_one_hot'].values[_first_rows]
p2 = df['Protein2_one_hot'].values[_first_rows]


Here we need to stack the per-pair matrices into a single array with an extra leading dimension. However, when I try this, the notebook kernel dies (most likely because it runs out of memory).

In [46]:
# Stack the per-pair matrices of `p1` into one array with a leading pair axis.
# Memory-heavy: the recorded result has shape (1000, 23, 33472), which is about
# 6 GB if the matrices are float64 (the default dtype of np.zeros). This can
# exhaust memory and kill the kernel.
np.stack(p1).shape

(1000, 23, 33472)

In [None]:
# `p1` and `p2` are object arrays holding one 2-D matrix per pair; they have to
# be stacked into regular 3-D arrays before TensorFlow can slice them along
# the first axis. Each matrix is cast to uint8 *before* stacking so the stacked
# copy is an eighth of the float64 size (the values are only 0 and 1).
# The previous version referenced `p2_stack` without ever defining it.
p1_stack = np.stack([matrix.astype(np.uint8) for matrix in p1])
p2_stack = np.stack([matrix.astype(np.uint8) for matrix in p2])

flow_ob = tf.data.Dataset.from_tensor_slices((p1_stack, p2_stack, classes))

In [None]:
# Peek at the dataset.
# NOTE(review): `take(1)` presumably returns another Dataset rather than the
# element itself, so this shows a dataset repr -- iterate over it to see values.
flow_ob.take(1)

## Computing Environment

In [None]:
# Record the Python version and the versions of the packages imported above,
# so the notebook's environment can be reproduced.
%load_ext watermark
%watermark -v -p tdc,pandas,numpy,tensorflow,sklearn,iqplot,panel,bokeh