In [1]:
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from multiprocessing import Pool, cpu_count
import functools

# Parse pathway features

In [2]:
entrez2symbol = {}
with open('data/Homo_sapiens.gene_info') as f:
    for line in f.read().splitlines():
        row = line.split('\t')
        entrez2symbol[row[1]]=row[2]

These are the pathways related to cancer hallmarks, which will be used as edge features.  
Referance: http://www.cell.com/abstract/S0092-8674(11)00127-9

In [3]:
gene2pathway = {}
pathways = set()
with open('data/hallmarks.txt') as f:
    for line in f.read().splitlines():
        row = line.split('\t')
        if len(row) > 1:
            pathway = row[0].split('|')[1]
            pathways.add(pathway)
            for entrez in row[2:]:
                if entrez not in entrez2symbol:
                    continue
                gene = entrez2symbol[entrez]
                if gene not in gene2pathway:
                    gene2pathway[gene]=set()
                gene2pathway[gene].add(pathway)

# Parse TCGA BRCA subtypes

In [10]:
pat2subtype = {}
def parse_subtypes(fn, pat2subtype):
    with open(fn) as f:
        for line in f.read().rstrip().splitlines()[1:]:
            row = line.split("\t")
            pat = row[0][:12]
            if len(row[0])>12:
                tissue_code = row[0][13:15]
                if int(tissue_code) >= 10:
                    continue
            # Remove "Normal" subtype
            if row[1] in ['NA', 'Normal']:
                continue
            pat2subtype[pat] = row[1]
        return pat2subtype
pat2subtype = parse_subtypes('data/TCGABRCA2PAM50_nature547.txt', pat2subtype)
pat2subtype = parse_subtypes('data/TCGABRCA2PAM50_nature522.txt', pat2subtype)
pat2subtype = parse_subtypes('data/TCGABRCA2PAM50_cell871.txt', pat2subtype)
pat2subtype = parse_subtypes('data/TCGABRCA2PAM50_cell817.txt', pat2subtype)

# Parse mutation and CNA profile

Genes were classified as wild type (0) or altered (1) in each of the tumors with alterations defined as follows:
1. Most oncogenes (e.g., EGFR) were considered altered (activated) if impacted by a missense mutation, in-frame indel or copy number amplification. 
2. For the subset of oncogenes typically altered only by amplification (CCND1, LMO1, MDM2, MDM4, MYC, MYCL, MYCN, NCOA3, NKX2-1 and SKP2), only copy number amplifications were considered as alterations and not SNVs or indels. 
3. All other genes including tumor suppressors (e.g., CDKN2A) were considered altered (inactivated) if there was any type of non-silent mutation or a copy number deletion.

In [12]:
def parse_maf(fn, oncogene_tsg):
    df = pd.read_table(fn, low_memory=False)
    df = df.loc[(df.loc[:,'is_flank']==0) & (df.loc[:,'is_silent']==0),:]
    df['pat'] = df.loc[:,'Tumor_Sample_Barcode'].str[:12]
    
    filter_rows = []
    genes = set()
    for index, row in df.iterrows():
        gene = row['Hugo_Symbol']
        genes.add(gene)
        VC = row['Variant_Classification']
        if gene in oncogene_tsg:
            if oncogene_tsg[gene] in ['Oncogene']:
                if VC not in ['Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins', 'De_novo_Start_InFrame']:
                    filter_rows.append(index)
            if oncogene_tsg[gene] == 'Amplification_Oncogene':
                filter_rows.append(index)
    df = df.drop(filter_rows)
    
    df = df.loc[:,['pat','Hugo_Symbol']]
    df['counter'] = 1
    df.set_index(['pat','Hugo_Symbol'], inplace=True)
    df = df.counter.groupby(level=[0,1]).min().unstack()
    df.fillna(0, inplace=True)
    return df, genes

In [13]:
oncogene_tsg={}
with open('data/oncogene_tsg.txt') as f:
    for line in f.read().splitlines():
        row = line.split("\t")
        oncogene_tsg[row[0]] = row[1]

In [14]:
fn = '/cellar/data/users/wzhang1984/Firehose/Firehose__2016_01_28/analyses/BRCA/Mutation_Assessor/BRCA-TP.maf.annotated'
df_mut, genes = parse_maf(fn, oncogene_tsg)

In [15]:
def parse_CNA(fn, genes):
    df = pd.read_table(fn,low_memory=False,index_col=0)
    df = df[df.index.isin(genes)]
    df = df.iloc[:,2:]
    df = (df/2.).round(0)
    df.columns = df.columns.str[:12]
    
    nonOncogene_rows = []
    for index, row in df.iterrows():
        gene = index
        if not (gene in oncogene_tsg and oncogene_tsg[gene] in ['Oncogene', 'Amplification_Oncogene']):
            nonOncogene_rows.append(index)
    df.loc[nonOncogene_rows,:] = df.loc[nonOncogene_rows,:] * (-1)
    df = df.clip(lower=0)
    return df

In [16]:
coding_genes = set()
with open("/cellar/data/users/wzhang1984/bcbio/genomes/Hsapiens/GRCh37/rnaseq-2014-07-14/ref-transcripts.gtf") as f:
    for line in f.read().splitlines():
        row = line.split("\t")
        if row[1] != "protein_coding" or row[2] != "transcript":
            continue
        gene_name = row[-1].split('gene_name "')[1].split('"')[0]
        if gene_name:
            coding_genes.add(gene_name)

In [17]:
coding_genes = coding_genes | genes

In [18]:
fn = '/cellar/data/users/wzhang1984/Firehose/Firehose__2016_01_28/analyses/BRCA/CopyNumber_Gistic2/all_thresholded.by_genes.txt'
df_CNA = parse_CNA(fn, coding_genes)

In [19]:
df_mut_CNA = pd.concat([df_mut, df_CNA.transpose()], axis=1,
                       join='inner').transpose().groupby(level=0).sum().clip(upper=1.).transpose()
df_mut_CNA = df_mut_CNA.loc[set(df_mut.index) & set(df_CNA.columns) & set(pat2subtype.keys()),df_CNA.index]

# Seperating training , validation and testing samples

In [20]:
training_set = df_mut_CNA.sample(frac=2./3)
training_set.sort_index(inplace=True)

In [21]:
validation_set = df_mut_CNA.drop(training_set.index).sample(frac=1./2)
validation_set.sort_index(inplace=True)

In [22]:
testing_set = df_mut_CNA.drop(training_set.index).drop(validation_set.index)
testing_set.sort_index(inplace=True)

# Or load existing datasets

In [20]:
training_set = pd.read_table('data/BRCA_training_data.txt', index_col=0)
validation_set = pd.read_table('data/BRCA_validation_data.txt', index_col=0)
testing_set = pd.read_table('data/BRCA_testing_data.txt', index_col=0)

# Parse PathwayCommons

In [23]:
PathwayCommons = pd.read_table('/cellar/users/wzhang1984/Data/PathwayCommons/PathwayCommons9.All.hgnc.txt')

In [24]:
PathwayCommons = PathwayCommons.loc[PathwayCommons.loc[:,'INTERACTION_TYPE'].isin(['controls-state-change-of',
                                                                                   'controls-transport-of',
                                                                                   'controls-phosphorylation-of',
                                                                                   'controls-expression-of',
                                                                                   'catalysis-precedes',
                                                                                   'in-complex-with',
                                                                                   'interacts-with', 
                                                                                   'neighbor-of']),:]

In [25]:
def parse_edge_features(mutrates, df):
    edge2features = {}
    features = set()
    for index, row in df.iterrows():
        ty = row['INTERACTION_TYPE']
        ty_d = ty + '_d'
        sources = row['INTERACTION_DATA_SOURCE'].split(';')
        g1 = row['PARTICIPANT_A']
        g2 = row['PARTICIPANT_B']
        edge = g1 + '\t' + g2
        edge_rev = g2 + '\t' + g1
        if edge not in edge2features:
            edge2features[edge] = {}
        if edge_rev not in edge2features:
            edge2features[edge_rev] = {}

        # Parse edge type features
        edge2features[edge][ty] = 1.
        edge2features[edge_rev][ty] = 1.
        features.add(ty)
        if ty not in ['in-complex-with','interacts-with','neighbor-of']:
            edge2features[edge][ty_d] = 1.
            features.add(ty_d)

        # Parse edge source features
        for source in sources:
            edge2features[edge][source] = 1.
            edge2features[edge_rev][source] = 1.
            features.add(source)

        # Parse pathway features. 
        # If one node is in the pathway, the score is 0.5
        # If both nodes are in the pathway, the score is 1
        if g1 in gene2pathway or g2 in gene2pathway:
            for pathway in pathways:
                edge2features[edge][pathway] = 0.
                edge2features[edge_rev][pathway] = 0.
                features.add(pathway)
            for g in [g1, g2]:
                if g in gene2pathway:
                    for pathway in gene2pathway[g]:
                        edge2features[edge][pathway] += 0.5
                        edge2features[edge_rev][pathway] += 0.5

        # Parse mutation features from the training set
        # Calculate mutation rates
        mutrate_g1 = 0
        mutrate_g2 = 0
        if g1 in training_set:
            mutrate_g1 = mutrates.loc[g1]
        if g2 in training_set:
            mutrate_g2 = mutrates.loc[g2]
        edge2features[edge]['mutrate_source'] = mutrate_g1
        edge2features[edge]['mutrate_target'] = mutrate_g2
        edge2features[edge_rev]['mutrate_source'] = mutrate_g2
        edge2features[edge_rev]['mutrate_target'] = mutrate_g1
        features.add('mutrate_source')
        features.add('mutrate_target')

        # Calculate mutual exclusivity / co-occurrence
        ME = 0.
        if g1 in training_set and g2 in training_set:
            if training_set.loc[:,g1].sum() < 7 or training_set.loc[:,g2].sum() < 7:
                continue
            tab = pd.crosstab(training_set.loc[:,g1],training_set.loc[:,g2])
            if tab.shape != (2, 2):
                continue
            if tab.iloc[1,1] * tab.iloc[0,0] >= tab.iloc[0,1] * tab.iloc[1,0]:
                continue
            oddsratio, pvalue = fisher_exact(tab, alternative='less')
            logp = np.log10(pvalue)
            ME = -logp
            if pvalue < 0.05:
                print g1, g2, oddsratio, pvalue
        edge2features[edge]['mutual_exclusive'] = ME
        edge2features[edge_rev]['mutual_exclusive'] = ME
        features.add('mutual_exclusive')

    return edge2features, features

In [26]:
training_set_mutrate = training_set.sum() / training_set.shape[0]

n_processes = cpu_count()
pool = Pool(processes=n_processes)

df_split = np.array_split(PathwayCommons, n_processes, axis=0)
parse_edge_features_partial = functools.partial(parse_edge_features, training_set_mutrate)
edge2features_list = pool.map(parse_edge_features_partial, df_split)

pool.close()
pool.join()

CUL7 TP53 0.0 0.0471120433161
CCND1 RB1 0.236645299145 0.0186571242676
CCND1 RB1 0.236645299145 0.0186571242676
CCND1 RB1 0.236645299145 0.0186571242676
FAS PIK3CA 0.0 0.0413151922752
PTEN CCND1 0.376031263569 0.0354478911648
PTEN PIK3CA 0.482412060302 0.0188172715341
PTEN PIK3CA 0.482412060302 0.0188172715341
MAP3K1 TP53 0.501903284929 0.0352992292863
ERBB2 PTEN 0.206816059757 0.00814980642656
PIK3CA N4BP2L1 0.170952380952 0.0488316310707
PIK3CA PTEN 0.482412060302 0.0188172715341
PIK3CA RB1 0.469214108911 0.0310205777451
PIK3CA TSC22D1 0.262053735738 0.0476015628313
PIK3R1 CCND1 0.0 0.0495757098996
TP53 NUP93 0.0 0.0471120433161


In [27]:
edge2features_union = {}
features_union = set()
for i in range(len(edge2features_list)):
    edge2features_union.update(edge2features_list[i][0])
    features_union |= edge2features_list[i][1]

In [28]:
features_sorted = sorted(features_union)
edge2features_line_out = ''
for gene_pair in sorted(edge2features_union):
    edge2features_line_out += gene_pair
    for feature in features_sorted:
        if feature in edge2features_union[gene_pair]:
            edge2features_line_out += '\t{}'.format(edge2features_union[gene_pair][feature])
        else:
            edge2features_line_out += '\t0.0'
    edge2features_line_out += '\n'

# Output dataframes to files

In [29]:
training_set.to_csv('data/BRCA_training_data.txt', sep='\t')
validation_set.to_csv('data/BRCA_validation_data.txt', sep='\t')
testing_set.to_csv('data/BRCA_testing_data.txt', sep='\t')

In [30]:
training_set.index.to_series().map(pat2subtype).to_csv('data/BRCA_training_lables.txt', sep='\t')
validation_set.index.to_series().map(pat2subtype).to_csv('data/BRCA_validation_lables.txt', sep='\t')
testing_set.index.to_series().map(pat2subtype).to_csv('data/BRCA_testing_lables.txt', sep='\t')

In [31]:
with open('data/BRCA_edge2features.txt', 'w') as f:
    f.write(edge2features_line_out)

In [32]:
with open('data/BRCA_feature_names.txt', 'w') as f:
    f.write('\n'.join(features_sorted))

## Select features (optional)

In [40]:
features_sig = set()
with open('data/BRCA_edge_feature_weights_9.txt') as f:
    for line in f.read().rstrip().splitlines()[1:]:
        w = line.split('\t')
        if float(w[1]) > 0.9:
            features_sig.add(w[0])

In [41]:
features_sorted = sorted(features_union)
edge2features_line_out = ''
for gene_pair in sorted(edge2features_union):
    edge2features_line_out += gene_pair
    for feature in features_sorted:
        if feature not in features_sig:
            continue
        if feature in edge2features_union[gene_pair]:
            edge2features_line_out += '\t{}'.format(edge2features_union[gene_pair][feature])
        else:
            edge2features_line_out += '\t0.0'
    edge2features_line_out += '\n'

In [42]:
with open('data/BRCA_edge2features_logistic.txt', 'w') as f:
    f.write(edge2features_line_out)
with open('data/BRCA_feature_names_logistic.txt', 'w') as f:
     f.write('\n'.join(sorted(set(features_sorted) & features_sig)))

# Scratch

In [143]:
df_edge2features.loc['SLC38A10\tGABPB2',df_edge2features.loc['SLC38A10\tGABPB2',:]>0]

gene_1                     SLC38A10
gene_2                       GABPB2
MSigDB                            1
controls-expression-of            1
mutrate_source            0.0201342
mutrate_target            0.0033557
Name: SLC38A10\tGABPB2, dtype: object