In [1]:
import io
import os
import subprocess

import numpy as np
import pandas as pd
import requests
from oauth2client.client import GoogleCredentials

# Resolve the application-default Google credentials and build the bearer
# Authorization header from their current access token.
credentials = GoogleCredentials.get_application_default()
token = credentials.get_access_token().access_token
headers = dict(Authorization="bearer {0}".format(token))

# Download from Terra
We download the outputs from running the Molecular Oncology Almanac and PHIAL on the retrospective cohorts listed below (including Van Allen 2015 and Robinson 2015) using the Terra (FireCloud) API.

In [2]:
class Download(object):
    """Helpers for pulling workflow outputs out of Terra (FireCloud) workspaces.

    The class is used as a namespace: every method is a classmethod or
    staticmethod, so no instance is required.
    """

    # NOTE: these run once, when the class body is executed, so the access
    # token is fetched at definition time and is not refreshed afterwards.
    credentials = GoogleCredentials.get_application_default()
    token = credentials.get_access_token().access_token
    headers = {"Authorization" : "bearer {}".format(token)}

    root = 'https://api.firecloud.org/api'

    @classmethod
    def request_data_model(cls, namespace, workspace, entity):
        """Fetch one entity table of a workspace and parse it as a DataFrame.

        Parameters
        ----------
        namespace, workspace : str
            Path components identifying the workspace in the request URL.
        entity : str
            Entity type whose TSV export is requested (e.g. 'pair').

        Returns
        -------
        pandas.DataFrame
            The tab-separated response body parsed with ``pd.read_csv``.

        Raises
        ------
        requests.HTTPError
            If the API answers with a 4xx/5xx status; without this check an
            error payload would be silently parsed as if it were the table.
        """
        request = '/'.join([cls.root, 'workspaces', namespace, workspace, 'entities', entity, 'tsv'])
        r = requests.get(request, headers=cls.headers)
        r.raise_for_status()
        return pd.read_csv(io.BytesIO(r.content), encoding='utf-8', sep='\t')

    @staticmethod
    def download(series, target_folder):
        """Copy every path in ``series`` into ``target_folder`` with gsutil.

        Missing values and empty / whitespace-only strings are skipped. When
        nothing is left to copy, gsutil is not invoked at all.

        Returns
        -------
        int or None
            The gsutil exit code, or None when there was nothing to copy.
        """
        blobs = [str(blob).strip() for blob in series.dropna()]
        blobs = [blob for blob in blobs if blob]
        if not blobs:
            return None
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim.
        cmd = ['gsutil', '-m', 'cp'] + blobs + [os.path.expanduser(target_folder)]
        return subprocess.call(cmd)

    @staticmethod
    def mkdir(folder_name):
        """Create ``folder_name`` and any missing parents; no-op if it exists."""
        os.makedirs(os.path.expanduser(folder_name), exist_ok=True)

    @staticmethod
    def split_array(series, n_per_split):
        """Split ``series`` into ``n_per_split`` roughly equal chunks.

        Despite its name, ``n_per_split`` is the NUMBER of chunks (it is used
        as the section count of ``np.array_split``), not the chunk size.

        Returns
        -------
        list
            The chunks, in order. Pandas inputs yield pandas chunks.
        """
        if hasattr(series, 'iloc'):
            # Split positions rather than the pandas object itself so that the
            # chunks stay pandas objects regardless of how numpy handles them;
            # the chunk boundaries are the ones np.array_split would produce.
            positions = np.array_split(np.arange(len(series)), n_per_split)
            return [series.iloc[chunk] for chunk in positions]
        return np.array_split(series, n_per_split)

    @classmethod
    def split_download(cls, series, n_per_split, target_folder):
        """Download ``series`` in ``n_per_split`` separate gsutil invocations.

        ``n_per_split`` is forwarded to ``split_array`` and therefore is the
        number of chunks, not the number of files per chunk.
        """
        for chunk in cls.split_array(series, n_per_split):
            cls.download(chunk, target_folder)

    @staticmethod
    def unpack_tar(tar, target_folder):
        """Extract the archive ``tar`` into ``target_folder`` using ``tar -xf``.

        Returns
        -------
        int
            The exit code of ``tar``; a failure (e.g. missing archive) is
            reported through this value rather than raised.
        """
        cmd = ['tar', '-xf', os.path.expanduser(tar), '-C', os.path.expanduser(target_folder)]
        return subprocess.call(cmd)

In [3]:
# NOTE: machine-specific absolute path; adjust before running elsewhere.
root_path = '/Users/brendan/Github/moalmanac-paper/analyses/retrospective-cohorts'
namespace = 'vanallen-firecloud-nih'
# (workspace name, output folder relative to root_path)
workspace_path_pairs = [
    ('VanAllen2015_dev-MOVED', '2015-VanAllen/data'),
    ('Robinson2015_dev-MOVED', '2015-Robinson/data'),
    ('TCGA-KIRP-MOAlmanac', '2016-TCGA/data'),
    ('2014-Perry-MOAlmanac', '2014-Perry/data')
]

phial_column = 'phialScoredDetailed'
almanac_column = 'moalmanac_tarGz'

# Number of chunks each column is split into for download; this value is
# passed as the section count to np.array_split by Download.split_array.
n_chunks = 14

for workspace, path in workspace_path_pairs:
    datamodel = Download.request_data_model(namespace, workspace, 'pair')

    # Build every folder under root_path so nothing is created relative to
    # the notebook's current working directory.
    data_folder = '{}/{}'.format(root_path, path)
    phial_folder = '{}/phial'.format(data_folder)
    almanac_folder = '{}/almanac'.format(data_folder)
    almanac_tars_folder = '{}/almanac-tars'.format(data_folder)
    Download.mkdir(data_folder)
    Download.mkdir(phial_folder)
    Download.mkdir(almanac_folder)
    Download.mkdir(almanac_tars_folder)

    Download.split_download(datamodel.loc[:, phial_column], n_chunks, phial_folder)
    Download.split_download(datamodel.loc[:, almanac_column], n_chunks, almanac_tars_folder)

    # Each archive is looked up as <participant>.almanac.tar.gz in the tars
    # folder and unpacked into a folder named after the participant.
    for individual in datamodel.loc[:, 'participant']:
        individual_tar = '{}/{}.almanac.tar.gz'.format(almanac_tars_folder, individual)
        individual_folder = '{}/{}'.format(almanac_folder, individual)
        Download.mkdir(individual_folder)
        Download.unpack_tar(individual_tar, individual_folder)