# Assignment 1: Dataset Description and Exploration

(c) 2021 Tom Röschinger and Linlin Chen. This work is licensed under a [Creative Commons Attribution License CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/). All code contained herein is licensed under an [MIT license](https://opensource.org/licenses/MIT).

***

In [11]:
from tdc.multi_pred import PPI
import pandas as pd
import numpy as np

import tensorflow as tf
import sklearn

import iqplot

import panel as pn

import bokeh.io
import bokeh.plotting

pn.extension()
bokeh.io.output_notebook()

### Dataset Description

Source: https://tdcommons.ai/multi_pred_tasks/ppi/

Literature:



Goal: Regression. Given the gene expression of cell lines and the SMILES of drug, predict the drug sensitivity level.

Let's have a look at the data set!

In [12]:
data = PPI(name = 'HuRI')
data = data.neg_sample(frac=1)
split = data.get_split()
df = split['train']
#df.rename(columns={"Y" :"log IC50"}, inplace=True)
df.head()

Found local copy...
Loading...
Done!


Unnamed: 0,Protein1_ID,Protein1,Protein2_ID,Protein2,Y
0,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000061656,MRRSSRPGSASSSRKHTPNFFSENSSMSITSEDSKGLRSAEPGPGE...,1
1,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000104765,MSSHLVEPPPPLHNNNNNCEENEQSLPPPAGLNSSWVELPMNSSNG...,1
2,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000105383,MPLLLLLPLLWAGALAMDPNFWLQVQESVTVQEGLCVLVPCTFFHP...,1
3,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000124103,MWTLKSSLVLLLCLTCSYAFMFSSLRQKTSEPQGKVQYGEHFRIRQ...,1
4,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000139637,MGHQFLRGLLTLLLPPPPLYTRHRMLGPESVPPPKRSRSKLMAPPR...,1


### One-hot encoding of Amino Acid Sequences

In [13]:
AA_dict = {
    '*':0,'A':1,'C':2,'D':3,'E':4,'F':5,'G':6,'H':7,'I':8,'K':9,'L':10,
    'M':11,'N':12,'P':13,'Q':14,'R':15,'S':16,'T':17,'V':18,'W':19,'Y':20, 'X':21, 'U':22
}

In [14]:
def one_hot_protein(seq_list):
    L = np.max([len(seq) for seq in seq_list])
    matrix_list = []
    
    for seq in seq_list:
        matrix = np.zeros([23, L])
        for i in range(len(seq)):
            matrix[AA_dict[seq[i]], i] = 1
        matrix_list.append(matrix)
    return matrix_list

df['Protein1_one_hot'] = one_hot_protein(df['Protein1'])
df['Protein2_one_hot'] = one_hot_protein(df['Protein2'])

In [15]:
df.head()

Unnamed: 0,Protein1_ID,Protein1,Protein2_ID,Protein2,Y,Protein1_one_hot,Protein2_one_hot
0,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000061656,MRRSSRPGSASSSRKHTPNFFSENSSMSITSEDSKGLRSAEPGPGE...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000104765,MSSHLVEPPPPLHNNNNNCEENEQSLPPPAGLNSSWVELPMNSSNG...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000105383,MPLLLLLPLLWAGALAMDPNFWLQVQESVTVQEGLCVLVPCTFFHP...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000124103,MWTLKSSLVLLLCLTCSYAFMFSSLRQKTSEPQGKVQYGEHFRIRQ...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,ENSG00000000005,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,ENSG00000139637,MGHQFLRGLLTLLLPPPPLYTRHRMLGPESVPPPKRSRSKLMAPPR...,1,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


### Tensorflow

In [16]:
classes = df['Y'].values
p1 = df['Protein1_one_hot'].values
p2 = df['Protein2_one_hot'].values


Here we need to stack the array into a a single one with an extra dimension. However, if I try, the notebook dies :(

In [None]:
# This kills my notebook (probably too much memory needed)
np.stack(p1)

In [9]:
flow_ob = tf.data.Dataset.from_tensor_slices((p1, p2_stack, classes))

NameError: name 'p2_stack' is not defined

In [None]:
flow_ob.take(1)

## Computing Environment

In [None]:
%load_ext watermark
%watermark -v -p tdc,pandas,numpy,tensorflow,sklearn,iqplot,panel,bokeh