# Combining omics for classification of phenotypes

Ritchie et al. (*Methods of integrating data to uncover genotype-phenotype interactions*) describe the following ways to integrate multi-omic data:

* Pathway or knowledge-based integration:
* Concatenation-based: combine all datasets
* Model-based: create a model per dataset, then combine the models
* Transformation-based:..

In addition to the approaches listed above, we present two further multi-omic methods:
* Reduced normalised concatenation
* Model-based inter-omic transformation

Per sub-omic we collect important features by:
* comparing the non-parametric distributions over the different classifications
* simply counting the occurrences and setting a cut-off point
* using the importances of the classification models as filters
* checking the summed weights of linear and non-linear dimensionality reducers

We then have the choice to collect these features
*  greedily: all remaining sub-omic features
*  non-greedily: only overlapping features (by gene)

To find inter **and** intra-omic connections we can resort to a similarity measure. 

Suggestions:
* provide ontologies to better interpret the data: dictionaries connecting genes, proteins, metabolism, etc.
* add immune-system measurements (Leukocytes, etc.) 
* omic data from healthy patients

## Load libraries..

In [1]:
import seaborn as sns
from ggplot import *
from matplotlib import pyplot as plt
import bokeh

import pandas as pd
import dask.dataframe as dd
import numpy as np
import scipy as sc
import statsmodels as sm
import networkx as nx

import sklearn as sk
import tensorflow as tf
import keras
import lightgbm as lgbm
import tpot

import sys
import os
import gc


# Notebook-wide pandas display settings: cap how many rows/columns a frame
# renders and print floats with a single decimal place.
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.1f}'.format)

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load in data..

In [2]:
def read_table(name, loc="gc"):
    """Read one tab-separated data file into a DataFrame.

    Parameters
    ----------
    name : str
        File name; it is appended verbatim to the root selected by ``loc``.
    loc : {"gc", "local"}
        ``"gc"`` reads from the Google Cloud Storage URL, ``"local"`` from
        the hard-coded directory on disk.

    Returns
    -------
    The object returned by ``pd.read_table`` for ``root + name``.

    Raises
    ------
    ValueError
        If ``loc`` is not one of the supported locations.
    """
    roots = {
        "gc": "https://storage.googleapis.com/genx_2018/",
        "local": "/media/koekiemonster/DATA-FAST/genetic_expression/hackathon_2/Melanoma/",
    }
    # Fail with a clear message instead of an UnboundLocalError further down.
    if loc not in roots:
        raise ValueError(
            "Unknown loc %r; expected one of %s" % (loc, sorted(roots))
        )
    return pd.read_table(roots[loc] + name, sep="\t")

In [6]:
def _clean(x, default='float'):   
    non_default = 'int' if default=='float' else 'float'
    try:
        x.replace([np.inf, -np.inf], np.nan, inplace=True)
        x.dropna(how='all', axis=1, inplace=True)
        if default=='float':
            x = x * 1.0
        else:
            x = x * 1
    except Exception as e:
        print(e)
        for col in tqdm.tqdm(x.columns):
            if 'object' in str(x[col].dtypes):
                try:
                    x[col] = x[col].astype(default)
                except:
                    try:
                        x[col] = x[col].astype(non_default)
                    except:
                        print(col)
                        x[col] = x[col].astype('category')
    return x
      

In [51]:
# Load the methylation table from the local data directory (loc="local").
data_methylation = read_table("Melanoma_Methylation.txt", loc="local")

In [50]:
# Load the remaining omic tables (mutation, copy-number, gene expression,
# miRNA, proteome) from the local data directory, one frame per file.
data_mutation = read_table("Melanoma_Mutation.txt", loc="local")
data_cnv = read_table("Melanoma_CNV.txt", loc="local")
data_RNA = read_table("Melanoma_GeneExpression.txt", loc="local")
data_miRNA = read_table("Melanoma_miRNA.txt", loc="local")
data_protein = read_table("Melanoma_Proteome.txt", loc="local")

  if self.run_code(code, result):
  if self.run_code(code, result):


## create sub-omics 

### Methylation data

In [52]:
# Keep only probes whose Start and Stop coordinates are both finite.
data_methylation = data_methylation[
    np.isfinite(data_methylation.Start) & np.isfinite(data_methylation.Stop)
]

# Turn the four identifying columns into text; the coordinates pass through
# int first so their text form carries no fractional part.
data_methylation['Start'] = data_methylation['Start'].astype(int).astype(str)
data_methylation['Stop'] = data_methylation['Stop'].astype(int).astype(str)
data_methylation['Chr'] = data_methylation['Chr'].astype(str)
data_methylation['Gene'] = data_methylation['Gene'].astype(str)

# GenX is the dot-joined key "Gene.Chr.Start.Stop".
data_methylation['GenX'] = data_methylation[['Gene', 'Chr', 'Start', 'Stop']].apply(
    lambda row: '.'.join(row), axis=1)

# The probe id and the columns folded into GenX are no longer needed.
data_methylation.drop(['probeID', 'Chr', 'Start', 'Stop', 'Gene'], axis=1, inplace=True)
# Drop rows that have fewer than four non-NaN values.
data_methylation.dropna(thresh=4, axis=0, inplace=True)

In [53]:
def get_transposed(df, NameRow='GenX', prefix='GenX'):
    """Transpose ``df`` and use the values of column ``NameRow`` as headers.

    After transposing, the row labelled ``NameRow`` supplies the new column
    names (each prepended with ``prefix + '_'`` unless ``prefix`` is None);
    that row is then removed from the returned frame.
    """
    flipped = df.T
    labels = flipped.loc[[NameRow]].values.tolist()[0]
    if prefix is not None:
        labels = [prefix + '_' + label for label in labels]
    flipped.columns = labels
    return flipped.drop(NameRow, axis=0, inplace=False)

In [55]:
# One cleaned, transposed table per (strand, CpG-island relation) pair.
# Keys follow 'Methylation_Strand_<plus|min>_CpG_<relation tag>'; a relation
# of None selects the rows whose Relation_CpG_Island is missing.
dict_methylation = {
    'Methylation_Strand_' + strand_tag + '_CpG_' + relation_tag: _clean(get_transposed(
        data_methylation.loc[
            (data_methylation.Strand == strand)
            & (data_methylation.Relation_CpG_Island.isna() if relation is None
               else data_methylation.Relation_CpG_Island == relation)
        ].drop(['Strand', 'Relation_CpG_Island'], axis=1)))
    for strand_tag, strand in (('plus', '+'), ('min', '-'))
    for relation_tag, relation in (
        ('Island', 'Island'),
        ('Nshelf', 'N_Shelf'),
        ('Nshore', 'N_Shore'),
        ('Sshelf', 'S_Shelf'),
        ('Sshore', 'S_Shore'),
        ('NaN', None),
    )
}
# The raw table is no longer needed once the sub-omic tables exist.
del data_methylation
gc.collect()

41

### Mutation data

In [56]:
# Filter out rows with missing/infinite coordinates BEFORE the integer cast:
# casting first fails on NaN and makes the finiteness filter pointless.
# This is the same order the methylation, RNA and miRNA cells use.
data_mutation = data_mutation[np.isfinite(data_mutation.Start)]
data_mutation = data_mutation[np.isfinite(data_mutation.Stop)]

# Coordinates pass through int so their text form has no fractional part.
data_mutation.Start = data_mutation.Start.astype(int).astype(str)
data_mutation.Stop = data_mutation.Stop.astype(int).astype(str)
data_mutation.Chr = data_mutation.Chr.astype(str)
data_mutation.Gene = data_mutation.Gene.astype(str)

# GenX is the dot-joined key "Gene.Chr.Start.Stop".
data_mutation['GenX'] = data_mutation[['Gene', 'Chr', 'Start', 'Stop']].apply(lambda x: '.'.join(x), axis=1)

In [57]:
# Lookup table of the distinct (GenX, Ref, Alt, Amino_Acid_Change, Effect)
# combinations, taken while those columns are still present in data_mutation.
_map_mutation = data_mutation[['GenX', 'Ref', 'Alt', 'Amino_Acid_Change', 'Effect']].drop_duplicates()


In [58]:
# Remove, in place, columns the visible downstream cells do not read:
# Chr/Start/Stop are already folded into GenX, and Ref/Alt/Amino_Acid_Change
# are kept in _map_mutation.
data_mutation.drop(['Chr', 'Start', 'Stop', 'DNA_VAF', 
                    'RNA_VAF', 'Amino_Acid_Change', 'Ref', 'Alt'], axis=1, inplace=True)

In [59]:
# Values of the mutation 'Effect' column, split into three groups that are
# each turned into their own sub-omic table further down.
rare_effects = [
    'Frame_Shift_Del',
    'Frame_Shift_Ins',
    'In_Frame_Del',
    'In_Frame_Ins',
    'Nonstop_Mutation',
    'Translation_Start_Site',
]
intermediate_effects = [
    'Splice_Site',
    'Nonsense_Mutation',
]
common_effects = [
    'Missense_Mutation',
    'Silent',
]

In [40]:
def cat_encode(src, target, col, unique_genes=None):
    """Append indicator columns for ``src[col]`` to ``target``.

    Indicator columns are named ``col + '_' + value``. When ``unique_genes``
    is given, only the indicator columns for those values are appended, in
    the order given; a value without an indicator column raises KeyError.
    """
    indicators = pd.get_dummies(src[col], prefix=col, prefix_sep='_')
    if unique_genes is None:
        return pd.concat([target, indicators], axis=1)
    wanted = [col + '_' + gene for gene in unique_genes]
    return pd.concat([target, indicators[wanted]], axis=1)

def drop_nan_rows(table, col):
    """Return ``table`` without the rows where ``col`` is NaN, and report the counts."""
    kept = table.dropna(axis=0, how='any', subset=[col])
    n_before = len(table)
    n_after = len(kept)
    print('Dropping %d nan rows for %s, sz before: %d, sz after: %d' % (n_before - n_after, col, n_before, n_after))
    return kept

def filter(table, col, value):
    """Return the rows of ``table`` whose ``col`` differs from ``value``.

    Prints how many rows were removed. Note that this name shadows the
    Python builtin ``filter`` for the rest of the module.
    """
    keep = table[col] != value
    res = table.loc[keep]
    n_before = len(table)
    n_after = len(res)
    print('Filtering %d rows with "%s" for %s, sz before: %d, sz after: %d' % (n_before - n_after, value, col, n_before, n_after))
    return res

def make_x(table, groups, count_filter = 0, greedy=False):
    """Build a per-sample matrix of one-hot encoded group labels.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain a 'Sample' column and every column named in ``groups``.
    groups : list of str
        Columns whose values are joined with '|' into one label per row.
    count_filter : int
        When greater than 0, only labels occurring more than this many times
        become feature columns. 0 (the default) keeps every label.
    greedy : bool
        Only relevant when ``count_filter > 0``. If False, rows whose label
        was filtered out are removed before encoding. If True, all rows are
        kept, so every sample stays in the result, while the feature columns
        are still restricted to the retained labels.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Sample', one 'GenX_<label>' column per retained label,
        holding the per-sample sum of the indicator values.
    """
    mg = table \
            .groupby(groups) \
            .size() \
            .reset_index(name='Count') \
            .sort_values(['Count'], ascending=False)
    print('Most mutated genes within {}:\n'.format(groups))
    print(mg.head(10))

    # Join groups: Gene_AADACL3|chr1|Translation_Start_Site
    tmp = pd.DataFrame()
    tmp['Sample'] = table['Sample']
    tmp['GenX'] = table[groups].apply(lambda x: '|'.join(x).strip(), axis=1)

    # None tells cat_encode to keep every indicator column. It must be bound
    # here: with the default count_filter=0 the branch below is skipped and
    # the name would otherwise be undefined at the cat_encode call.
    unique_genes = None
    if count_filter > 0:
        mg['GenX'] = mg[groups].apply(lambda x: '|'.join(x).strip(), axis=1)
        mg = mg.loc[mg.Count > count_filter]
        unique_genes = mg['GenX']

        print('\nUsing count_filter > %d' % count_filter)
        print('Before: %d' % len(tmp))
        if not greedy:
            tmp = tmp.loc[tmp.GenX.isin(unique_genes)]
        print('After: %d' % len(unique_genes))

    print('\nUnique features: %d' % len(tmp['GenX'].unique()))

    # Extract features
    tmp = cat_encode(tmp, tmp, 'GenX', unique_genes=unique_genes)
    tmp = tmp.loc[:, tmp.columns != 'GenX']

    # Group by Sample: several rows of one sample collapse into one row.
    tmp = tmp.groupby('Sample', as_index=False).sum()

    x = tmp.set_index('Sample')
    print('Features: ', x.shape)

    return x

In [39]:
# Minimum occurrence thresholds (exclusive) per effect group.
rare_count = 5
inter_count = 15
common_count = 30

# One cleaned per-sample feature table per effect group: select the rows of
# that group, drop the 'Effect' column and encode by 'Gene'.
dict_mutation = {
    name: _clean(make_x(
        data_mutation.loc[data_mutation.Effect.isin(effects)].drop(['Effect'], axis=1),
        ['Gene'], count_filter=threshold, greedy=True))
    for name, effects, threshold in (
        ('Mutation_Rare', rare_effects, rare_count),
        ('Mutation_Intermediate', intermediate_effects, inter_count),
        ('Mutation_Common', common_effects, common_count),
    )
}
# The raw table is no longer needed once the sub-omic tables exist.
del data_mutation
gc.collect()

Most mutated genes within ['Gene']:

        Gene  Count
2108    PTEN     16
483   CDKN2A     12
1740  NOTCH2      9
1780  NUDT11      8
2243   RRP36      8
1793   OBSCN      8
2762     TTN      8
463    CDC27      8
2546    SUCO      8
188    ARID2      7

Using count_filter > 5
Before: 3954
After: 3954

Unique features: 3031
Features:  (469, 17)
Most mutated genes within ['Gene']:

       Gene  Count
8590    TTN    168
2294  DNAH5     82
5223    NF1     63
4508  LRP1B     50
2296  DNAH7     49
5009  MUC16     47
2297  DNAH8     47
585   ARID2     41
1946  CSMD3     38
1944  CSMD1     37

Using count_filter > 15
Before: 25501
After: 25501

Unique features: 9382
Features:  (466, 92)
Most mutated genes within ['Gene']:

        Gene  Count
16241    TTN   3080
9435   MUC16   2238
4184   DNAH5    886
11048   PCLO    726
655     ANK3    499
8400   LRP1B    491
3522   CSMD1    483
6248   GPR98    457
4186   DNAH7    437
3523   CSMD2    434

Using count_filter > 30
Before: 377456
After: 3774

457

### CNV data

In [60]:
# Filter out rows with missing/infinite coordinates first, then cast.
# Casting before filtering fails on NaN; when that failure was swallowed the
# coordinates stayed float and GenX read "...123.0.456.0", which does not
# match the integer-formatted keys built for the other omic tables.
data_cnv = data_cnv[np.isfinite(data_cnv.Start)]
data_cnv = data_cnv[np.isfinite(data_cnv.Stop)]

# Coordinates pass through int so their text form has no fractional part.
data_cnv.Start = data_cnv.Start.astype(int).astype(str)
data_cnv.Stop = data_cnv.Stop.astype(int).astype(str)
data_cnv.Chr = data_cnv.Chr.astype(str)
data_cnv.Gene = data_cnv.Gene.astype(str)

# GenX is the dot-joined key "Gene.Chr.Start.Stop".
data_cnv['GenX'] = data_cnv[['Gene', 'Chr', 'Start', 'Stop']].apply(lambda x: '.'.join(x), axis=1)
# Keep the Gene <-> GenX mapping before the source columns are dropped.
_map_cnv = data_cnv[['Gene', 'GenX']]
data_cnv = data_cnv.drop(['Gene', 'Chr', 'Start', 'Stop'], axis=1)

In [61]:
# One transposed table per strand; the 'Strand' column is dropped after the
# rows have been selected.
dict_cnv = {
    'CNV_Strand' + tag: get_transposed(
        data_cnv.loc[data_cnv.Strand == sign].drop(['Strand'], axis=1))
    for tag, sign in (('Plus', '+'), ('Min', '-'))
}
# The raw table is no longer needed once the sub-omic tables exist.
del data_cnv
gc.collect()

130

### Gene expression data

In [63]:
# Keep only rows whose Start and Stop coordinates are both finite.
data_RNA = data_RNA[np.isfinite(data_RNA.Start) & np.isfinite(data_RNA.Stop)]

# Identifying columns become text; coordinates pass through int first so
# their text form carries no fractional part.
data_RNA['Start'] = data_RNA['Start'].astype(int).astype(str)
data_RNA['Stop'] = data_RNA['Stop'].astype(int).astype(str)
data_RNA['Chr'] = data_RNA['Chr'].astype(str)
data_RNA['Gene'] = data_RNA['Gene'].astype(str)

# GenX is the dot-joined key "Gene.Chr.Start.Stop".
data_RNA['GenX'] = data_RNA[['Gene', 'Chr', 'Start', 'Stop']].apply(
    lambda row: '.'.join(row), axis=1)
# Keep the Gene <-> GenX mapping before the source columns are dropped.
_map_RNA = data_RNA[['Gene', 'GenX']]
data_RNA = data_RNA.drop(['Gene', 'Chr', 'Start', 'Stop'], axis=1)


In [64]:
# One transposed expression table per strand; the 'Strand' column is dropped
# after the rows have been selected.
dict_RNA = {
    'RNA_Strand' + tag: get_transposed(
        data_RNA.loc[data_RNA.Strand == sign].drop(['Strand'], axis=1))
    for tag, sign in (('Plus', '+'), ('Min', '-'))
}
# The raw table is no longer needed once the sub-omic tables exist.
del data_RNA
gc.collect()

156

### miRNA data

In [65]:
# Keep only rows whose Start and Stop coordinates are both finite.
data_miRNA = data_miRNA[np.isfinite(data_miRNA.Start) & np.isfinite(data_miRNA.Stop)]

# Identifying columns become text; coordinates pass through int first so
# their text form carries no fractional part.
data_miRNA['Start'] = data_miRNA['Start'].astype(int).astype(str)
data_miRNA['Stop'] = data_miRNA['Stop'].astype(int).astype(str)
data_miRNA['Chr'] = data_miRNA['Chr'].astype(str)
data_miRNA['Name'] = data_miRNA['Name'].astype(str)

# GenX is the dot-joined key "Name.Chr.Start.Stop".
data_miRNA['GenX'] = data_miRNA[['Name', 'Chr', 'Start', 'Stop']].apply(
    lambda row: '.'.join(row), axis=1)
# Keep the Name <-> GenX mapping before the source columns are dropped.
_map_miRNA = data_miRNA[['Name', 'GenX']]
data_miRNA = data_miRNA.drop(['MIMATID', 'Name', 'Chr', 'Start', 'Stop'], axis=1)


In [66]:
# One transposed miRNA table per strand; the 'Strand' column is dropped after
# the rows have been selected.
dict_miRNA = {
    'miRNA_Strand' + tag: get_transposed(
        data_miRNA.loc[data_miRNA.Strand == sign].drop(['Strand'], axis=1))
    for tag, sign in (('Plus', '+'), ('Min', '-'))
}
# The raw table is no longer needed once the sub-omic tables exist.
del data_miRNA
gc.collect()

102

### Proteomic data

In [67]:
# Transpose the proteome table, using the 'ProteinID' values (prefixed with
# "Protein_") as column names.
data_protein = get_transposed(data_protein, NameRow='ProteinID', prefix="Protein")
# Replace infinities, drop all-NaN columns and coerce the values to float.
data_protein = _clean(data_protein, default='float')

# Merging with phenotype data

In [74]:
# Phenotype metadata is read from the cloud bucket (loc="gc"), unlike the
# omic tables above, which are read with loc="local".
data_phenotype = read_table("Melanoma_Phenotype_Metadata.txt", loc="gc")

In [96]:
# Column holding the outcome, and its encoding as a binary target:
# 1 for the two response categories, 0 for the other three.
target_variable = 'Response To Therapy'
target_map = {
    "Complete Response": 1,
    "Clinical Progressive Disease": 0,
    "Radiographic Progressive Disease": 0,
    "Stable Disease": 0,
    "Partial Response": 1,
}
# NOTE(review): `conditions` is defined here but not used in this cell.
conditions = {
    "Drug Therapy Type": "Immunotherapy",
    "Vital Status": "*",
    "Gender": "*",
}

# Missing outcomes stay NaN; any other value must be a key of target_map.
data_phenotype["target"] = data_phenotype[target_variable].apply(
    lambda outcome: target_map[outcome] if not pd.isnull(outcome) else np.nan)

# Sample counts per (target, therapy type) combination.
print(
    data_phenotype[["target", "Drug Therapy Type", "SampleID"]]
    .groupby(by=["target", "Drug Therapy Type"])
    .count()
)

# Only samples with a known target are kept for merging with the omics.
data_phenotype_to_merge = data_phenotype.loc[
    ~data_phenotype.target.isna(), ["target", "SampleID"]]
data_phenotype_to_merge["target"] = data_phenotype_to_merge["target"].astype(int)

                                   SampleID
target Drug Therapy Type                   
0.0    Chemotherapy                      14
       Immunotherapy                     19
       Targeted Molecular therapy         2
       Vaccine                            6
1.0    Ancillary                          3
       Chemotherapy                      13
       Hormone Therapy                    1
       Immunotherapy                     19
       Vaccine                            2


## sub-omics

In [98]:
# Inner-join every sub-omic table with the phenotype targets: the table's
# index is matched against the 'SampleID' column of the phenotype frame.
merged = {}
for sub_omics in (dict_RNA, dict_cnv, dict_methylation, dict_miRNA, dict_mutation):
    for key, frame in sub_omics.items():
        merged[key] = frame.merge(
            data_phenotype_to_merge, how='inner', left_index=True, right_on='SampleID')

# The proteome is a single table rather than a dict of sub-omics.
merged['protein'] = data_protein.merge(
    data_phenotype_to_merge, how='inner', left_index=True, right_on='SampleID')

## Dimension reduction

# Classification

In [70]:
from xgboost import XGBClassifier as xgb
from lightgbm import LGBMClassifier as lgbm
from sklearn import metrics, model_selection
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA as pca
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
from sklearn.feature_selection import SelectFdr as fdr
from sklearn.manifold import TSNE
import umap

In [69]:
def benchmark_classifier(clf,x,y,splitter):
    """Cross-validate ``clf`` and return its out-of-fold predictions.

    Parameters
    ----------
    clf : estimator with ``fit(x, y)`` and ``predict(x)``
        Refitted in place on every fold.
    x, y : array-like or pandas objects
        Features and targets. pandas inputs are sliced positionally.
    splitter : object with ``split(x, y)``
        Its ``random_state`` attribute is overwritten with 111.
        Assumes ``split`` yields integer position arrays, as the
        scikit-learn splitters do.

    Returns
    -------
    numpy.ndarray
        Array of ``y.shape`` holding the prediction made for each sample
        while it was in a test fold; samples never tested keep 0.

    Accuracy and the confusion matrix are printed for every fold.
    """
    def _rows(data, index):
        # Plain [] on a DataFrame selects columns, and on a Series it looks
        # up labels; .iloc selects rows by position for both.
        if hasattr(data, 'iloc'):
            return data.iloc[index]
        return data[index]

    splitter.random_state = 111
    pred = np.zeros(shape=y.shape)

    for train_index, test_index in splitter.split(x, y):
        x_train, x_test = _rows(x, train_index), _rows(x, test_index)
        y_train, y_test = _rows(y, train_index), _rows(y, test_index)

        clf.fit(x_train,y_train)
        pred[test_index] = clf.predict(x_test)

        print(metrics.accuracy_score(y_test,pred[test_index]))
        print(metrics.confusion_matrix(y_test,pred[test_index]))

    return pred

### Per sub omic

## Combined

## Prediction on non-chemo, non-immuno metastasis patients

# Feature analysis

## Ontological

## Parallel Coordinates

## Lower dimensional embedding

## Clustering