# Mutations

In [2]:
import sys
!{sys.executable} -m pip install ggplot
!{sys.executable} -m pip install lightgbm



In [0]:
import pandas as pd
import numpy as np

# Notebook-wide display settings: cap printed rows/columns and show floats
# with a single decimal place.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.1f}'.format)

from sklearn import preprocessing

from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns

import gc

In [0]:
def read_table(name, file_root="https://storage.googleapis.com/genx_2018/"):
    """Load a tab-separated table into a DataFrame.

    Parameters
    ----------
    name : str
        File name; it is appended verbatim to ``file_root``.
    file_root : str, optional
        Location prefix (URL or directory path) the file is read from.
        It is joined to ``name`` by plain string concatenation, so it must
        end with the appropriate separator. Defaults to the bucket URL the
        notebook originally hard-coded, so existing calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_table(file_root + name, sep="\t")

def cat_encode(src, target, col):
    """Append indicator (dummy) columns for ``src[col]`` to ``target``.

    The new columns are named ``<col>_<value>`` and are concatenated
    column-wise to ``target``; the combined frame is returned and neither
    input is modified.
    """
    indicator_columns = pd.get_dummies(src[col], prefix=col, prefix_sep='_')
    combined = pd.concat([target, indicator_columns], axis=1)
    return combined

def drop_nan_rows(table, col):
    """Return ``table`` without the rows whose ``col`` value is missing.

    A one-line summary (rows dropped, size before, size after) is printed.
    The input frame is left untouched.
    """
    kept = table.dropna(subset=[col])
    n_before, n_after = len(table), len(kept)
    summary = 'Dropping %d nan rows for %s, sz before: %d, sz after: %d'
    print(summary % (n_before - n_after, col, n_before, n_after))
    return kept

def filter(table, col, value):
    """Return the rows of ``table`` whose ``col`` differs from ``value``.

    A one-line summary (rows removed, size before, size after) is printed.
    Note: this name shadows the builtin ``filter`` for the rest of the
    notebook.
    """
    keep_mask = table[col].ne(value)
    kept = table.loc[keep_mask]
    n_before, n_after = len(table), len(kept)
    summary = 'Filtering %d rows with "%s" for %s, sz before: %d, sz after: %d'
    print(summary % (n_before - n_after, value, col, n_before, n_after))
    return kept

# Load the phenotype metadata and the mutation calls (in that order).
phenotype_meta_table, mutation_table = map(
    read_table,
    ("Melanoma_Phenotype_Metadata.txt", "Melanoma_Mutation.txt"),
)

### Prepare target

In [5]:
target_variable = 'Response To Therapy'

# Binary encoding of the recorded therapy response:
# complete/partial response -> 0, progressive or stable disease -> 1.
target_map = {
    "Complete Response": 0,
    "Clinical Progressive Disease": 1,
    "Radiographic Progressive Disease": 1,
    "Stable Disease": 1,
    "Partial Response": 0,
}

# Keep only the sample identifier and the raw response label.
pmt = phenotype_meta_table[['SampleID', target_variable]].rename(columns={'SampleID': 'Sample'})

# Samples without a recorded response cannot serve as targets.
pmt = drop_nan_rows(pmt, target_variable)

# Direct dict lookup: a label missing from target_map raises KeyError.
pmt = pmt.assign(**{target_variable: pmt[target_variable].apply(lambda label: target_map[label])})

# Number of samples per encoded class.
pmt[['Sample', target_variable]].groupby(by=target_variable, as_index=False).count()

Dropping 375 nan rows for Response To Therapy, sz before: 476, sz after: 101


Unnamed: 0,Response To Therapy,Sample
0,0,51
1,1,50


In [6]:
# Preview the first rows of the prepared target table (sample id + encoded response).
pmt.head(3)

Unnamed: 0,Sample,Response To Therapy
27,TCGA-D3-A1Q1-06,1
28,TCGA-D3-A1Q3-06,0
35,TCGA-D3-A1Q9-06,1


### Features

In [7]:
# List the distinct mutation effect categories present in the mutation table.
mutation_table.Effect.unique()

array(['Missense_Mutation', 'Silent', 'RNA', 'Translation_Start_Site',
       'Splice_Site', 'Nonsense_Mutation', 'Nonstop_Mutation',
       'Frame_Shift_Del', 'In_Frame_Ins', 'Frame_Shift_Ins',
       'In_Frame_Del'], dtype=object)

In [8]:
# Remove mutation rows whose Effect is 'Silent' or 'RNA', one effect at a time
# so each removal is logged separately.
for excluded_effect in ('Silent', 'RNA'):
    mutation_table = filter(mutation_table, 'Effect', excluded_effect)

Filtering 136359 rows with "Silent" for Effect, sz before: 422553, sz after: 286194
Filtering 15642 rows with "RNA" for Effect, sz before: 286194, sz after: 270552


In [9]:
# Attach the encoded response to every mutation row. The inner join keeps only
# mutations of samples that are present in the target table.
mutation_table = pd.merge(mutation_table, pmt, how='inner', on='Sample')
mutation_table.head(3)

Unnamed: 0,Sample,Chr,Start,Stop,Ref,Alt,Gene,Effect,DNA_VAF,RNA_VAF,Amino_Acid_Change,Response To Therapy
0,TCGA-D3-A3ML-06,chr5,140182973,140182973,G,A,PCDHA3,Missense_Mutation,0.5,,p.D731N,1
1,TCGA-D3-A3ML-06,chr2,133541884,133541884,C,T,NCKAP5,Missense_Mutation,0.5,,p.E834K,1
2,TCGA-D3-A3ML-06,chr19,51217544,51217544,C,T,SHANK1,Missense_Mutation,0.2,,p.G179R,1


In [10]:
# Effects the analysis labels as "important".
important_effects = [
    'Translation_Start_Site',
    'Frame_Shift_Ins',
    'Frame_Shift_Del',
    'Nonsense_Mutation',
    'Nonstop_Mutation',
]

# Row mask computed once and reused for both halves of the split.
is_important = mutation_table.Effect.isin(important_effects)

# Important:
mutation_table_imp = mutation_table.loc[is_important]
print(mutation_table_imp.shape)

# Less important (every remaining effect):
#   In_Frame_Ins, In_Frame_Del, Missense_Mutation, Splice_Site
mutation_table_less_imp = mutation_table.loc[~is_important]
print(mutation_table_less_imp.shape)

(5494, 12)
(77466, 12)


In [0]:
def make_xy(table, groups, targets=None, target_col=None):
    """Build a per-sample mutation-count feature matrix and its target vector.

    Every row of ``table`` is one mutation. The values of the ``groups``
    columns are joined with ``'|'`` into a single feature key, and the result
    counts, for each sample, how many mutations carry each key.

    Parameters
    ----------
    table : pandas.DataFrame
        Mutation rows; must contain a ``'Sample'`` column and every column
        listed in ``groups`` (string-valued, since they are joined as text).
    groups : list of str
        Columns that together define one feature, e.g. ``['Gene', 'Effect']``.
    targets : pandas.DataFrame, optional
        Frame with a ``'Sample'`` column and the target column. Defaults to
        the notebook-level ``pmt`` table.
    target_col : str, optional
        Name of the target column in ``targets``. Defaults to the
        notebook-level ``target_variable``.

    Returns
    -------
    (x, y) : tuple
        ``x`` is a DataFrame with one row per sample found in both ``table``
        and ``targets`` and one integer count column per feature, named
        ``'Gene_<key>'``; ``y`` is the matching target Series.
    """
    # Fall back to the notebook globals only when the caller did not supply
    # them, so existing two-argument calls keep working unchanged.
    if targets is None:
        targets = pmt
    if target_col is None:
        target_col = target_variable

    most_mutated = (
        table
        .groupby(groups)
        .size()
        .reset_index(name='Count')
        .sort_values(['Count'], ascending=False)
    )
    print('Most mutated genes within {}:\n'.format(groups))
    print(most_mutated.head(10))

    # Join groups into one key, e.g. AADACL3|Translation_Start_Site
    features = pd.DataFrame({
        'Sample': table['Sample'],
        'Gene': table[groups].apply(lambda row: '|'.join(row).strip(), axis=1),
    })

    print('\nUnique features: %d' % len(features['Gene'].unique()))

    # Count mutations per (sample, feature) directly. This avoids building a
    # dense one-hot frame with one row per mutation, which only gets summed
    # per sample afterwards anyway.
    counts = (
        features
        .groupby(['Sample', 'Gene'])
        .size()
        .unstack(fill_value=0)
        .add_prefix('Gene_')
        .rename_axis(columns=None)
        .reset_index()
    )

    # Join targets; the inner join drops samples without a target.
    merged = counts.merge(targets, how='inner', on='Sample')

    y = merged[target_col]
    print('Target: ', y.shape)

    x = merged.drop(columns=['Sample', target_col])
    print('Features: ', x.shape)

    return (x, y)

#### LGBM

In [12]:
# Important: build features/target from the rows whose Effect is in
# important_effects, one feature per (Gene, Effect) combination.
x, y = make_xy(mutation_table_imp, ['Gene', 'Effect'])

Most mutaed genes within ['Gene', 'Effect']:

        Gene             Effect  Count
3332     TTN  Nonsense_Mutation     29
891    DNAH5  Nonsense_Mutation     18
1934   MUC16  Nonsense_Mutation     16
1862    MGAM  Nonsense_Mutation     12
893    DNAH8  Nonsense_Mutation     11
2035     NF1  Nonsense_Mutation     11
1744   LRP1B  Nonsense_Mutation     11
2019     NEB  Nonsense_Mutation     10
3265  TRANK1  Nonsense_Mutation      9
156     ANK3  Nonsense_Mutation      9

Unique features: 3659
Target:  (98,)
Features:  (98, 3659)


In [0]:
# Less Important: build features/target from the remaining effects.
# NOTE: this rebinds x and y, replacing the "important" matrices built in the
# previous cell; any later cell using x/y sees this less-important set.
x, y = make_xy(mutation_table_less_imp, ['Gene', 'Effect'])

In [0]:
# Fit a LightGBM classifier (boosting_type='goss') on the current x/y;
# fit() returns the fitted estimator, so it can be chained.
model = LGBMClassifier(boosting_type='goss').fit(x, y)

# Per-feature importance scores of the fitted model.
model.feature_importances_