# Initialize miRNA Storage Structures

Storing expression data in .csv files is great for portability but has the disadvantage of being very slow to read into memory.  Here I go through the most recent Firehose run, read in the miRNA expression data for each cancer, and save each table into a single HDF5 store.  This should make working with this data in pandas much easier.

In [3]:
import os as os
import pandas as pd

In [4]:
import Data.Firehose as FH

In [5]:
# Root directory of the Firehose run to process. The trailing slash matters:
# the loading cell builds the data directory as `path + 'stddata'`.
# The commented-out line is the location of the earlier 2014_07_15 run.
#path = '/cellar/users/agross/TCGA_Code/TCGA/Data/Firehose__2014_07_15/'
path = '/cellar/users/agross/TCGA_Code/CancerData/Data/Firehose__2015_04_02/'

In [6]:
# Open the HDF5 store that the following cells write into; it stays open
# until the explicit store.close() in the last cell.
store = pd.HDFStore('/data_ssd/miRNASeq_2015_04_02.h5')

In [7]:
# Read the miRNASeq table for every entry under <path>/stddata, cache each
# table in the HDF5 store, then build the matched tumor/normal matrix.
rna = {}
for c in os.listdir(path + 'stddata'):
    try:
        rna[c] = FH.read_miRNASeq(path, c, tissue_code='All')
        # HDFStore.keys() returns absolute names (e.g. '/ACC'), so testing the
        # bare name against that list never matches and a re-run would append
        # duplicate rows.  HDFStore's own membership test accepts the key with
        # or without the leading slash.
        if c not in store:
            store.append(c, rna[c])
            store.create_table_index(c, optlevel=9, kind='full')
    except Exception:
        # Best effort: report the entry that could not be loaded/stored and
        # continue with the rest.  Catching Exception (not a bare except)
        # keeps Ctrl-C / SystemExit working during the long-running loop.
        print(c)
rna_df = pd.concat(list(rna.values()), axis=1)

cols = list(rna_df.columns)
# Set lookup keeps the pairing test linear in the number of columns.
col_set = set(cols)
# Keep only '01'/'11' columns whose first-level label has BOTH a '01' and a
# '11' column.  NOTE(review): presumably column labels are
# (patient, sample-type code) with '01' = tumor and '11' = normal -- confirm
# against FH.read_miRNASeq.
pts = [c for c in cols
       if c[1] in ('01', '11')
       and (c[0], '01') in col_set
       and (c[0], '11') in col_set]
matched_tn = rna_df[pts]
# Collapse columns that share the same (level 0, level 1) label, keeping the
# first value per row.
matched_tn = matched_tn.groupby(axis=1, level=[0,1]).first()

In [8]:
# Build a lookup Series from the combined column index: values come from
# level 0 (the keys of `rna`), the index comes from level 1.
# NOTE(review): level 1 is presumably the patient identifier -- confirm.
codes = pd.concat(rna, axis=1).columns
codes = pd.Series(data=codes.get_level_values(0),
                  index=codes.get_level_values(1))
# Drop entries labelled with these three names (presumably the merged
# multi-cohort runs -- confirm), then keep one value per index label.
codes = codes[~codes.isin(['KIPAN','GBMLGG','COADREAD'])].groupby(level=0).first()
codes.name = 'codes'

In [9]:
# Quick sanity check: display the dimensions of the matched matrix.
matched_tn.shape

(1071, 1284)

In [10]:
# Persist the `codes` Series in the open store under the key 'codes'.
store['codes'] = codes

In [11]:
# Write the matched matrix into the same open store under the key 'matched_tn'.
matched_tn.to_hdf(store, 'matched_tn')

In [12]:
# Close the store opened at the top of the notebook; nothing below may use it.
store.close()