-
Notifications
You must be signed in to change notification settings - Fork 13
/
Firehose.py
278 lines (240 loc) · 11.1 KB
/
Firehose.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""
Created on Sep 4, 2012
Set of functions to read in data that has been formated in the
BROAD GDAC Firehose data processing pipeline.
Nothing in here should depend on any other modules.
I am relying heavily on pandas for this project, so the main
goal of these functions is get data from the Firehose's tables
into Pandas data-structures that I can work with.
There is a little bit of pre-processing that is done to get the
files from Firehose into the local file-system in a reasonably
organized hierarchy, that is done at the time of data download
and in the run's initialization. These dependencies should
eventually be circumvented.
@author: agross
"""
import os as os
import numpy as np
import pandas as pd
def fix_barcode_columns(df, patients=None, tissue_code='All', get_batch=False):
"""
Takes TCGA barcode and reformats it into a MultiIndex if all tissue_codes
are desired, or just pulls the correct tissue codes and filteres the
DataFrame.
df: pandas DataFrame
patients: patient list to filter on
tissue_code: ['01','11','All'] #if all returns MultiIndex
"""
if get_batch is False:
df.columns = pd.MultiIndex.from_tuples([(i[:12], i[13:15]) for i
in df.columns])
else:
df.columns = pd.MultiIndex.from_tuples([(i[:12], i[13:15], i[21:24]) for i
in df.columns])
if patients is not None:
df = df.ix[:, patients]
if tissue_code != 'All':
try:
df = df.T.xs(tissue_code, level=1).T # pandas bug
df = df.groupby(axis=1, level=0).first()
except KeyError: # try different cross-seciton
new_code = pd.value_counts(df.columns.get_level_values(1)).idxmax()
df = df.T.xs(new_code, level=1).T # pandas bug
df = df.groupby(axis=1, level=0).first()
else:
df = df.groupby(axis=1, level=[0, 1]).first()
return df
def get_dataset_path(data_path, cancer, data_type, ext):
"""
This is a helper to get paths to a particular data-set.
In processing the data, we develop relatively complicated file hierarchies
to not have a ton of folders at the top level and make it easier to get
to files manually. This makes it a little tough to track down files
automatically so this function fills that role.
data_type: the top-level data-type of the file (i.e. rnaseqv2,
transcriptome,...)
ext: the file you are looking for (i.e. RSEM_genes_normalized,
junction_quantification, ...)
"""
stddata_path = data_path + 'stddata/' + cancer
if not os.path.isdir(stddata_path):
return
data_types = filter(lambda g: g.startswith(data_type),
os.listdir(stddata_path))
if data_type in data_types: # get the paths
paths = [f[0] for f in list(os.walk(stddata_path + '/' + data_type)) if
(ext + '/data') in f[0]]
else:
return
f = [path + '/' + f for path in paths for f in os.listdir(path)
if 'data' in f] # pull the data file
return f
def get_mutation_matrix(data_path, cancer, tissue_code='01'):
"""
Get gene by patient mutation matrix.
Here I filter by the is_silent column in the MAF file,
so I am returning only non-silent mutations.
"""
path = '{}/analyses/{}/Mutation_Assessor/'.format(data_path, cancer)
f = [f for f in os.listdir(path) if f.endswith('.maf.annotated')][0]
maf = pd.read_table(path + f, low_memory=False)
maf = maf.dropna(how='all', axis=[0, 1])
maf = maf.set_index(['Hugo_Symbol', 'Tumor_Sample_Barcode'])
non_silent = maf[maf.is_silent == 0]
non_silent['counter'] = 1
hit_matrix = non_silent.counter.groupby(level=[0, 1]).sum().unstack()
hit_matrix = fix_barcode_columns(hit_matrix, tissue_code=tissue_code)
return hit_matrix
def get_submaf(data_path, cancer, genes='All', fields='basic'):
"""
Pull a sub-section of the MAF file for analysis.
genes: list of genes for which to return data
fields: ['basic', 'all']: if basic, returns reduced version of MAF
"""
path = '{}/analyses/{}/Mutation_Assessor/'.format(data_path, cancer)
f = [f for f in os.listdir(path) if f.endswith('.maf.annotated')][0]
maf = pd.read_table(path + f, low_memory=False)
maf = maf.dropna(how='all', axis=[0, 1])
maf['Tissue_Type'] = maf.Tumor_Sample_Barcode.map(lambda s: s[13:15])
maf.Tumor_Sample_Barcode = maf.Tumor_Sample_Barcode.map(lambda s: s[:12])
if genes != 'All':
maf = maf[maf.Hugo_Symbol.isin(genes)]
def get_allele(s):
alleles = [s['Tumor_Seq_Allele1'], s['Tumor_Seq_Allele2']]
return [a for a in alleles if a != s['Reference_Allele']][0]
maf['Alt_Allele'] = maf.apply(get_allele, 1)
if fields == 'basic':
maf = maf[['Hugo_Symbol', 'NCBI_Build', 'Chromosome', 'Start_position',
'End_position', 'Strand', 'Reference_Allele',
'Alt_Allele', 'Tumor_Sample_Barcode']]
maf = maf.set_index('Hugo_Symbol', append=True)
maf.index = maf.index.swaplevel(0, 1)
return maf
def get_gistic_gene_matrix(data_path, cancer, tissue_code='01'):
"""
Reads in gene by patient copy-number alteration matrix.
Index is MultiIndex with ['Cytoband', 'Locus ID', 'Gene Symbol']
on the levels.
"""
path = '{}/analyses/{}/CopyNumber_Gistic2/'.format(data_path, cancer)
gistic = pd.read_table(path + 'all_thresholded.by_genes.txt',
index_col=[2, 1, 0],
low_memory=False)
gistic = fix_barcode_columns(gistic, tissue_code=tissue_code)
return gistic
def get_gistic_arm_values(data_path, cancer, tissue_code='01'):
"""
Reads in arm by patient copy-number alteration matrix.
"""
path = '{}/analyses/{}/CopyNumber_Gistic2/'.format(data_path, cancer)
gistic = pd.read_table(path + 'broad_values_by_arm.txt', index_col=0,
low_memory=False)
gistic = fix_barcode_columns(gistic, tissue_code=tissue_code)
return gistic
def get_gistic_lesions(data_path, cancer, patients=None, tissue_code='01'):
"""
Reads in lesion by patient CNA matrix.
Returns thresholded calls as made by GISTIC2 in the Firehose pipeline.
"""
path = '{}/analyses/{}/CopyNumber_Gistic2/'.format(data_path, cancer)
gistic = pd.read_table(path + 'all_lesions.conf_99.txt', index_col=[0, 1],
low_memory=False)
lesions = gistic.select(lambda s: 'TCGA' in s, axis=1)
lesions = lesions.select(lambda s: 'values' not in s[0], axis=0)
from_tuples = pd.MultiIndex.from_tuples
lesions.index = from_tuples([(s[0].split(' ')[0], s[1].strip(), 'Lesion')
for s in lesions.index])
lesions = lesions.groupby(level=[0, 1, 2]).first()
if 'Deletion' in lesions.index.get_level_values(0):
lesions.T['Deletion'] = (lesions.T['Deletion'] * -1).replace(-0, 0)
lesions = fix_barcode_columns(lesions, patients, tissue_code)
return lesions
def get_gistic(cancer_name, data_path, filter_with_rna=True,
collapse_on_bands=True, min_patients=5):
lesions = get_gistic_lesions(cancer_name, data_path)
return lesions
def read_rppa(data_path, cancer, patients=None, tissue_code='01'):
"""
Reads in antibody by patient reverse-phase protein array matrix.
Index is MultiIndex with ['protien','antibody'] on the levels.
"""
path = '{}/stddata/{}/RPPA_AnnotateWithGene/'.format(data_path, cancer)
rppa = pd.read_table(path + cancer + '.rppa.txt', index_col=0,
low_memory=False)
rppa['protien'] = rppa.index.map(lambda s: s.split('|')[0])
rppa['antibody'] = rppa.index.map(lambda s: s.split('|')[1])
rppa = rppa.set_index(['protien', 'antibody'])
rppa = fix_barcode_columns(rppa, tissue_code=tissue_code)
return rppa
def read_rnaSeq(data_path, cancer, patients=None, average_on_genes=True,
tissue_code='01', get_batch=False):
"""
Reads in gene by patient rnaSeq mRNA expression matrix.
Data is log-transformed and a lower bound of -3 (1/8 read per million)
is set.
"""
files = get_dataset_path(data_path, cancer, 'rnaseqv2',
'RSEM_genes_normalized')
if files is None:
return
rnaSeq = pd.concat([pd.read_table(f, index_col=0, skiprows=[1],
low_memory=False)
for f in files])
rnaSeq = np.log2(rnaSeq).replace(-np.inf, -3.) # close enough to 0
if average_on_genes: # Pretty much all duplicates are unknown ('?')
rnaSeq = rnaSeq.groupby(by=lambda n: n.split('|')[0]).mean()
rnaSeq = fix_barcode_columns(rnaSeq, patients, tissue_code, get_batch)
return rnaSeq
def read_rnaSeq_splice_junctions(data_path, cancer, patients=None,
tissue_code='01'):
"""
Reads in gene by patient rnaSeq mRNA splice junction matrix.
Values are raw counts.
"""
files = get_dataset_path(data_path, cancer, 'rnaseqv2',
'junction_quantification')
if files is None:
return
rnaSeq = pd.concat([pd.read_table(f, index_col=0, skiprows=[1])
for f in files])
rnaSeq = fix_barcode_columns(rnaSeq, patients, tissue_code)
return rnaSeq
def read_mrna(data_path, cancer, patients=None, tissue_code='01'):
"""
Reads in gene by patient microarray gene expression.
"""
files = get_dataset_path(data_path, cancer, 'transcriptome',
'unc_lowess_normalization_gene_level')
if files is None:
return
mrna = pd.concat([pd.read_table(f, index_col=0, skiprows=[1],
na_values=['null'],
low_memory=False)
for f in files])
mrna = fix_barcode_columns(mrna, patients, tissue_code)
return mrna
def read_miRNASeq(data_path, cancer, patients=None, tissue_code='01'):
"""
Reads in miRNA by patient miRNASeq matrix.
Data is log-transformed and a lower bound of -3 (1/8 read per million)
is set.
They often use both HiSeq and GA2 for this, so I'm merging the two,
not entirely sure if that's Kosher.
"""
stddata_path = data_path + 'stddata/' + cancer
paths = [f[0] for f in list(os.walk(stddata_path + '/mirnaseq'))
if 'miR_gene_expression/data' in f[0]]
data = []
for path in paths:
f = [f for f in os.listdir(path) if 'data' in f][0]
mirna = pd.read_table(path + '/' + f, index_col=0, header=None,
low_memory=False)
data.append(mirna)
mirna = pd.concat(data, axis=1)
mirna = mirna.T.set_index(['miRNA_ID', 'Hybridization REF'])
mirna = mirna.sortlevel(level=0).ix['reads_per_million_miRNA_mapped']
mirna = np.log2(mirna.astype(float)).replace(-np.inf, -3.) # close to 0
mirna = mirna.T
mirna = mirna.groupby(level=0, axis=1).last() # Use the HiSeq data over GA2
mirna = fix_barcode_columns(mirna, patients, tissue_code)
return mirna