In [1]:
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from multiprocessing import Pool, cpu_count
import functools

# Parse pathway features

In [2]:
# Build the lookup from the ID in column 1 to the symbol in column 2 (0-based)
# of the tab-separated gene_info table; a later row wins if an ID repeats.
with open('data/Homo_sapiens.gene_info') as f:
    entrez2symbol = {
        fields[1]: fields[2]
        for fields in (record.split('\t') for record in f.read().splitlines())
    }

These are the pathways related to cancer hallmarks, which will be used as edge features.  
Reference: http://www.cell.com/abstract/S0092-8674(11)00127-9

In [3]:
# gene2pathway: gene symbol -> set of hallmark pathway names it belongs to.
# pathways: every pathway name seen in the file (even if none of its genes map).
gene2pathway = {}
pathways = set()
with open('data/hallmarks.txt') as f:
    for record in f.read().splitlines():
        fields = record.split('\t')
        # Lines without a tab carry no gene list; skip them.
        if len(fields) <= 1:
            continue
        # The pathway name is the second '|'-separated token of the first field.
        hallmark = fields[0].split('|')[1]
        pathways.add(hallmark)
        # Gene IDs start at the third field; IDs without a known symbol are ignored.
        for entrez_id in fields[2:]:
            if entrez_id in entrez2symbol:
                gene2pathway.setdefault(entrez2symbol[entrez_id], set()).add(hallmark)

# Parse TCGA BRCA subtypes

In [4]:
pat2subtype = {}
def parse_subtypes(fn, pat2subtype):
    """Add patient -> subtype assignments from one tab-separated file.

    The first line of ``fn`` is treated as a header and skipped. For each
    remaining line the first field is a sample barcode and the second field is
    the subtype label.

    * The patient key is the first 12 characters of the barcode.
    * If the barcode is longer than 12 characters, characters 13-14 are read
      as an integer tissue code and samples with a code >= 10 are skipped.
    * Rows whose subtype is the literal string 'NA' are skipped.

    ``pat2subtype`` is updated in place (existing keys are overwritten) and the
    same dict object is returned.
    """
    with open(fn) as f:
        records = f.read().rstrip().splitlines()

    for record in records[1:]:
        fields = record.split("\t")
        barcode = fields[0]
        # Only full-length barcodes carry a tissue code to filter on.
        if len(barcode) > 12 and int(barcode[13:15]) >= 10:
            continue
        if fields[1] != 'NA':
            pat2subtype[barcode[:12]] = fields[1]
    return pat2subtype
# Merge the four PAM50 assignment tables in this order; because parse_subtypes
# overwrites existing keys, a later file wins when a patient appears in several.
for subtype_fn in ('data/TCGABRCA2PAM50_nature547.txt',
                   'data/TCGABRCA2PAM50_nature522.txt',
                   'data/TCGABRCA2PAM50_cell871.txt',
                   'data/TCGABRCA2PAM50_cell817.txt'):
    pat2subtype = parse_subtypes(subtype_fn, pat2subtype)

# Parse mutation and CNA profile

Genes were classified as wild type (0) or altered (1) in each of the tumors with alterations defined as follows:
1. Most oncogenes (e.g., EGFR) were considered altered (activated) if impacted by a missense mutation, in-frame indel or copy number amplification. 
2. For the subset of oncogenes typically altered only by amplification (CCND1, LMO1, MDM2, MDM4, MYC, MYCL, MYCN, NCOA3, NKX2-1 and SKP2), only copy number amplifications were considered as alterations and not SNVs or indels. 
3. All other genes including tumor suppressors (e.g., CDKN2A) were considered altered (inactivated) if there was any type of non-silent mutation or a copy number deletion.

In [5]:
def parse_maf(fn, oncogene_tsg):
    """Build a patient-by-gene mutation indicator matrix from an annotated MAF.

    Parameters
    ----------
    fn : str
        Path to a tab-separated MAF that has at least the columns
        ``is_flank``, ``is_silent``, ``Tumor_Sample_Barcode``, ``Hugo_Symbol``
        and ``Variant_Classification``.
    oncogene_tsg : dict
        Gene symbol -> class label. Only the labels ``'Oncogene'`` and
        ``'Amplification_Oncogene'`` change the filtering; every other gene
        keeps all of its non-silent, non-flank records.

    Returns
    -------
    (pandas.DataFrame, set)
        * DataFrame indexed by patient (first 12 characters of the tumor
          barcode) with one column per gene; 1 where the patient has at least
          one retained record for that gene, 0 otherwise.
        * Set of every gene symbol seen among the non-silent, non-flank
          records, *before* the oncogene-specific filtering.
    """
    # Variant classes that count as an alteration for genes labelled 'Oncogene'.
    activating_classes = ['Missense_Mutation', 'In_Frame_Del', 'In_Frame_Ins',
                          'De_novo_Start_InFrame']

    maf = pd.read_table(fn, low_memory=False)
    keep = (maf['is_flank'] == 0) & (maf['is_silent'] == 0)
    # Copy so the new column is written to an independent frame rather than
    # to a slice of the table that was just read.
    maf = maf.loc[keep].copy()
    maf['pat'] = maf['Tumor_Sample_Barcode'].str[:12]

    genes = set(maf['Hugo_Symbol'])

    # Class label per record; genes missing from oncogene_tsg map to NaN and
    # therefore match neither mask below.
    gene_class = maf['Hugo_Symbol'].map(oncogene_tsg)
    non_activating_oncogene = (
        (gene_class == 'Oncogene')
        & ~maf['Variant_Classification'].isin(activating_classes)
    )
    # Amplification-only oncogenes never count point mutations or indels.
    amplification_only = gene_class == 'Amplification_Oncogene'
    retained = maf.loc[~(non_activating_oncogene | amplification_only),
                       ['pat', 'Hugo_Symbol']]

    matrix = (
        retained
        .assign(counter=1)
        .set_index(['pat', 'Hugo_Symbol'])['counter']
        .groupby(level=[0, 1]).min()   # collapse repeated (patient, gene) records
        .unstack()
        .fillna(0)
    )
    return matrix, genes

In [6]:
# Gene symbol (first field) -> class label (second field) from a tab-separated
# file; a later line wins if a gene is listed more than once.
with open('data/oncogene_tsg.txt') as f:
    oncogene_tsg = {
        fields[0]: fields[1]
        for fields in (record.split("\t") for record in f.read().splitlines())
    }

In [7]:
# Annotated BRCA MAF from the Firehose 2016_01_28 analyses directory.
# NOTE(review): absolute, user-specific path -- this cell only runs where that
# filesystem is mounted.
fn = '/cellar/data/users/wzhang1984/Firehose/Firehose__2016_01_28/analyses/BRCA/Mutation_Assessor/BRCA-TP.maf.annotated'
# df_mut: patient x gene mutation indicator matrix.
# genes: every gene symbol seen in the non-silent, non-flank MAF records.
df_mut, genes = parse_maf(fn, oncogene_tsg)

In [8]:
def parse_CNA(fn, genes, gene_classes=None):
    """Build a gene-by-patient copy-number alteration indicator matrix.

    Parameters
    ----------
    fn : str
        Path to a tab-separated gene-level copy-number table. The first column
        is used as the gene index; the next two columns are dropped and all
        remaining columns are treated as samples.
    genes : collection
        Gene symbols to keep; rows whose index is not in ``genes`` are dropped.
    gene_classes : dict, optional
        Gene symbol -> class label. Defaults to the notebook-level
        ``oncogene_tsg`` mapping, which is what the original implementation
        always used.

    Returns
    -------
    pandas.DataFrame
        Genes in rows, patients (first 12 characters of the sample barcode) in
        columns. Values are halved and rounded, then interpreted by class:
        genes labelled ``'Oncogene'`` or ``'Amplification_Oncogene'`` keep
        positive values (gains), every other gene has its sign flipped so that
        losses become positive. Negative values are clipped to 0.

    Notes
    -----
    Assumes thresholded calls in {-2, ..., 2}, so that only +/-2 survives the
    halve-and-round step as +/-1 -- TODO confirm against the input file.
    The sign flip is applied positionally, so repeated gene symbols in the
    index are handled row by row instead of through an ambiguous label-based
    assignment.
    """
    if gene_classes is None:
        gene_classes = oncogene_tsg
    oncogene_labels = ('Oncogene', 'Amplification_Oncogene')

    cna = pd.read_table(fn, low_memory=False, index_col=0)
    cna = cna[cna.index.isin(genes)]
    cna = cna.iloc[:, 2:]
    cna = (cna / 2.).round(0)
    cna.columns = cna.columns.str[:12]

    # +1 keeps amplifications for oncogenes; -1 turns deletions into positive
    # values for every other gene.
    sign = np.array([1. if gene_classes.get(gene) in oncogene_labels else -1.
                     for gene in cna.index])
    return cna.mul(sign, axis=0).clip(lower=0)

In [9]:
# Collect the gene_name of every protein-coding transcript record in the GTF.
coding_genes = set()
with open("/cellar/data/users/wzhang1984/bcbio/genomes/Hsapiens/GRCh37/rnaseq-2014-07-14/ref-transcripts.gtf") as f:
    # Stream the file line by line instead of loading the whole annotation
    # into memory at once.
    for line in f:
        # Blank lines and '#' comment/header lines have no tab-separated
        # fields to inspect.
        if not line.strip() or line.startswith('#'):
            continue
        row = line.rstrip('\r\n').split("\t")
        # Field 1 is compared to the biotype and field 2 to the feature type.
        if row[1] != "protein_coding" or row[2] != "transcript":
            continue
        # The attribute column is last; pull the quoted value after gene_name.
        gene_name = row[-1].split('gene_name "')[1].split('"')[0]
        if gene_name:
            coding_genes.add(gene_name)

In [10]:
# Add every gene symbol returned by parse_maf to the protein-coding set, so
# mutated genes are not dropped when this set is used to filter the
# copy-number table.
coding_genes = coding_genes | genes

In [11]:
# GISTIC2 thresholded gene-level copy-number table from the Firehose
# 2016_01_28 analyses directory.
# NOTE(review): absolute, user-specific path -- this cell only runs where that
# filesystem is mounted.
fn = '/cellar/data/users/wzhang1984/Firehose/Firehose__2016_01_28/analyses/BRCA/CopyNumber_Gistic2/all_thresholded.by_genes.txt'
# df_CNA: gene x patient copy-number alteration indicators, restricted to coding_genes.
df_CNA = parse_CNA(fn, coding_genes)

In [12]:
# Put mutation and copy-number indicators side by side for the patients present
# in both tables, then sum columns that share a gene name and cap at 1 so a
# gene altered by either mechanism is marked once.
df_mut_CNA = pd.concat([df_mut, df_CNA.transpose()], axis=1,
                       join='inner').transpose().groupby(level=0).sum().clip(upper=1.).transpose()
# Keep patients that have mutation data, copy-number data and a subtype label,
# and only the genes present in the copy-number table. The patient labels are
# passed as a sorted list: a set is not an accepted .loc indexer in newer
# pandas, and its iteration order would make the row order arbitrary.
df_mut_CNA = df_mut_CNA.loc[sorted(set(df_mut.index) & set(df_CNA.columns) & set(pat2subtype.keys())),
                            df_CNA.index]

In [13]:
# Draw one third of the patients for training. The seed is fixed so that a
# fresh Restart & Run All reproduces the same split (and the same files
# written at the end of the notebook).
training_set = df_mut_CNA.sample(frac=1./3, random_state=0).sort_index()

In [14]:
# Draw half of the non-training patients for validation. The seed is fixed so
# the draw is reproducible for a given training_set.
validation_set = df_mut_CNA.drop(training_set.index).sample(frac=1./2, random_state=0).sort_index()

In [15]:
# The test split is every patient left after removing the training and
# validation patients, ordered by patient ID.
testing_set = (
    df_mut_CNA
    .drop(training_set.index)
    .drop(validation_set.index)
    .sort_index()
)

# Parse PathwayCommons

In [16]:
# Load the PathwayCommons edge list (tab-separated). Later cells read the
# columns PARTICIPANT_A, INTERACTION_TYPE, PARTICIPANT_B and
# INTERACTION_DATA_SOURCE from it.
# NOTE(review): absolute, user-specific path -- this cell only runs where that
# filesystem is mounted.
PathwayCommons = pd.read_table('/cellar/users/wzhang1984/Data/PathwayCommons/PathwayCommons9.All.hgnc.txt')

In [17]:
# Keep only rows whose interaction type is one of the eight listed below;
# each retained type later becomes an edge-type feature.
PathwayCommons = PathwayCommons[
    PathwayCommons['INTERACTION_TYPE'].isin([
        'controls-state-change-of',
        'controls-transport-of',
        'controls-phosphorylation-of',
        'controls-expression-of',
        'catalysis-precedes',
        'in-complex-with',
        'interacts-with',
        'neighbor-of',
    ])
]

In [18]:
def _mutual_exclusivity(training, g1, g2, min_altered=7):
    """Score mutual exclusivity of two genes as -log10(one-sided Fisher p).

    Returns 0. when the pair is not testable: either gene is not a column of
    ``training``, either gene is altered in fewer than ``min_altered``
    samples, the contingency table is not 2x2, or the table does not lean
    towards exclusivity (product of the diagonal >= product of the
    off-diagonal).
    """
    if g1 not in training or g2 not in training:
        return 0.
    altered_1 = training.loc[:, g1]
    altered_2 = training.loc[:, g2]
    if altered_1.sum() < min_altered or altered_2.sum() < min_altered:
        return 0.
    tab = pd.crosstab(altered_1, altered_2)
    if tab.shape != (2, 2):
        return 0.
    if tab.iloc[1, 1] * tab.iloc[0, 0] >= tab.iloc[0, 1] * tab.iloc[1, 0]:
        return 0.
    oddsratio, pvalue = fisher_exact(tab, alternative='less')
    if pvalue < 0.05:
        # Formatted so the same line is printed by Python 2 and Python 3.
        print('%s %s %s %s' % (g1, g2, oddsratio, pvalue))
    return -np.log10(pvalue)


def parse_edge_features(mutrates, df, training=None, gene_pathways=None, all_pathways=None):
    """Build a feature table with one row per directed gene pair.

    Every row of ``df`` contributes to two entries, ``'A\\tB'`` and
    ``'B\\tA'``, so each interaction is represented in both directions.

    Parameters
    ----------
    mutrates : pandas.Series
        Alteration rate per gene, indexed by gene symbol.
    df : pandas.DataFrame
        Edge list with columns ``PARTICIPANT_A``, ``PARTICIPANT_B``,
        ``INTERACTION_TYPE`` and ``INTERACTION_DATA_SOURCE`` (';'-separated).
    training : pandas.DataFrame, optional
        Sample x gene alteration matrix used for the mutation features.
        Defaults to the notebook-level ``training_set``.
    gene_pathways : dict, optional
        Gene symbol -> set of pathway names. Defaults to ``gene2pathway``.
    all_pathways : iterable, optional
        All pathway names. Defaults to ``pathways``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'gene_a\\tgene_b'``. Columns are created on demand, so
        features that were never set for an edge are NaN:

        * ``<interaction type>``: 1 for both directions.
        * ``<interaction type>_directed``: 1 for A->B only, and only for types
          other than in-complex-with / interacts-with / neighbor-of.
        * ``<data source>``: 1 for both directions.
        * ``<pathway>``: 0.5 per endpoint that is in the pathway (so 1 when
          both are); only set when at least one endpoint has pathway
          annotations. A self-loop counts its single gene as both endpoints.
        * ``mutrate_source`` / ``mutrate_target``: alteration rate of the two
          endpoints (0 for genes absent from ``training``).
        * ``mutual_exclusive``: see ``_mutual_exclusivity``; always set.
    """
    if training is None:
        training = training_set
    if gene_pathways is None:
        gene_pathways = gene2pathway
    if all_pathways is None:
        all_pathways = pathways

    undirected_types = ('in-complex-with', 'interacts-with', 'neighbor-of')
    edge2features = {}
    for _, row in df.iterrows():
        ty = row['INTERACTION_TYPE']
        g1 = row['PARTICIPANT_A']
        g2 = row['PARTICIPANT_B']
        forward = edge2features.setdefault(g1 + '\t' + g2, {})
        reverse = edge2features.setdefault(g2 + '\t' + g1, {})

        # Edge type features
        forward[ty] = 1.
        reverse[ty] = 1.
        if ty not in undirected_types:
            forward[ty + '_directed'] = 1.

        # Edge source features
        for source in row['INTERACTION_DATA_SOURCE'].split(';'):
            forward[source] = 1.
            reverse[source] = 1.

        # Pathway features: 0.5 if one node is in the pathway, 1 if both are.
        # The score is computed once and then assigned, so a self-loop (where
        # forward and reverse are the same dict) is not counted twice.
        if g1 in gene_pathways or g2 in gene_pathways:
            pathways_1 = gene_pathways.get(g1, ())
            pathways_2 = gene_pathways.get(g2, ())
            for pathway in all_pathways:
                score = 0.5 * (pathway in pathways_1) + 0.5 * (pathway in pathways_2)
                forward[pathway] = score
                reverse[pathway] = score

        # Mutation rates from the training set (0 for genes it does not contain)
        mutrate_g1 = mutrates.loc[g1] if g1 in training else 0
        mutrate_g2 = mutrates.loc[g2] if g2 in training else 0
        forward['mutrate_source'] = mutrate_g1
        forward['mutrate_target'] = mutrate_g2
        reverse['mutrate_source'] = mutrate_g2
        reverse['mutrate_target'] = mutrate_g1

        # Mutual exclusivity; written for every edge so the column always exists.
        ME = _mutual_exclusivity(training, g1, g2)
        forward['mutual_exclusive'] = ME
        reverse['mutual_exclusive'] = ME

    return pd.DataFrame.from_dict(edge2features, orient='index')

In [19]:
# Fraction of training samples in which each gene is altered.
training_set_mutrate = training_set.sum() / training_set.shape[0]

n_processes = cpu_count()

# Split the edge list into one contiguous chunk of rows per worker. The split
# is done on row positions and applied with .iloc, so it does not depend on
# how numpy handles a DataFrame argument.
chunk_positions = np.array_split(np.arange(PathwayCommons.shape[0]), n_processes)
df_split = [PathwayCommons.iloc[positions] for positions in chunk_positions]
parse_edge_features_partial = functools.partial(parse_edge_features, training_set_mutrate)

pool = Pool(processes=n_processes)
try:
    # One feature DataFrame per chunk, in chunk order.
    df_edge2features = pool.map(parse_edge_features_partial, df_split)
finally:
    # Always release the worker processes, even if a chunk raised.
    pool.close()
    pool.join()

HSD17B2 TP53 0.0 0.0461435378519
CCND1 RB1 0.0 0.0241218594707
CCND1 RB1 0.0 0.0241218594707
CCND1 RB1 0.0 0.0241218594707
MYC RUNX1 0.0 0.0318363234718
PIK3CA RB1 0.309260337798 0.040019576776


In [20]:
# Stack the per-chunk feature tables produced by the worker pool.
df_edge2features = pd.concat(df_edge2features)
# The same gene pair can be processed in several chunks, which leaves several
# rows for it, each holding only the features seen in that chunk. Merge them
# with a per-column max (NaN is ignored) so every edge has exactly one row and
# the result does not depend on the number of worker processes.
df_edge2features = df_edge2features.groupby(level=0).max()
df_edge2features.sort_index(inplace=True)
df_edge2features.sort_index(axis=1, inplace=True)
# Features that were never set for an edge count as 0.
df_edge2features.fillna(0., inplace=True)
# The index is 'gene_1<TAB>gene_2'; expose both genes as leading columns.
gene_pairs = df_edge2features.index.str.split('\t')
df_edge2features.insert(0, 'gene_2', gene_pairs.str[1])
df_edge2features.insert(0, 'gene_1', gene_pairs.str[0])

# Output dataframes to files

In [21]:
# Write each split's patient x gene alteration matrix as a tab-separated file.
for split_frame, split_path in ((training_set, 'data/BRCA_training_data.txt'),
                                (validation_set, 'data/BRCA_validation_data.txt'),
                                (testing_set, 'data/BRCA_testing_data.txt')):
    split_frame.to_csv(split_path, sep='\t')

In [22]:
# Write each split's patient -> subtype labels, looked up in pat2subtype and
# kept in the same row order as the corresponding data file.
for split_frame, labels_path in ((training_set, 'data/BRCA_training_lables.txt'),
                                 (validation_set, 'data/BRCA_validation_lables.txt'),
                                 (testing_set, 'data/BRCA_testing_lables.txt')):
    split_labels = split_frame.index.to_series().map(pat2subtype)
    split_labels.to_csv(labels_path, sep='\t')

In [23]:
# Write the edge feature table. The index is omitted because the gene_1 and
# gene_2 columns inserted earlier already identify each edge.
df_edge2features.to_csv('data/BRCA_edge2features.txt', sep='\t', index=False)

In [24]:
# Write the feature names: every column after the first two (gene_1, gene_2).
df_edge2features.columns[2:].to_series().to_csv('data/BRCA_feature_names.txt', sep='\t', index=False)