In [None]:
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display
import os

In [None]:
# Download the pan-cancer mutations MAF (gzip, ~753 MB) unless it is already present.
# Expect roughly 1-5 minutes depending on connection speed.
filename = "pancancer_mutations.maf.gz"
if os.path.isfile(filename):
    print("Skipping download, as file %s is present" %(filename))
else:
    print('Downloading mutation data. 753 MB ...')
    url = 'https://api.gdc.cancer.gov/data/1c8cfe5f-e52d-41ba-94da-f15ea1337efc'
    # Fetch into a temporary name and rename only once the transfer has finished, so an
    # interrupted download is never mistaken for a complete file by the check above.
    partial = filename + ".part"
    urllib.request.urlretrieve(url, partial)
    os.replace(partial, filename)
print("done.")

In [None]:
# Download the clinical data file (~18 MB) unless it is already present.
filename = "pancancer_clinical.txt"
if os.path.isfile(filename):
    print("Skipping download, as file %s is present" %(filename))
else:
    print('Downloading clinical data ...')
    url = 'https://api.gdc.cancer.gov/data/0fc78496-818b-4896-bd83-52db1f533c5c'
    # Fetch into a temporary name and rename only once the transfer has finished, so an
    # interrupted download is never mistaken for a complete file by the check above.
    partial = filename + ".part"
    urllib.request.urlretrieve(url, partial)
    os.replace(partial, filename)
print("done.")

In [None]:
# Read the gzipped, tab-separated mutations file, keeping only the listed columns.
print('Loading mutations dataframe ...')
mutations = pd.read_csv(
    "pancancer_mutations.maf.gz",
    sep='\t',
    compression='gzip',
    usecols=[
        'Tumor_Sample_Barcode', 'Hugo_Symbol',
        'Variant_Classification', 'Variant_Type',
        'Chromosome', 'Start_Position',
        'End_Position', 'Strand', 'CENTERS',
    ],
)
print("done.")
# count() tallies the non-null sample barcodes.
print("Mutations count", mutations['Tumor_Sample_Barcode'].count())
display(mutations.head())

In [None]:
# Preview the first five rows of the mutations frame.
mutations.head(5)

In [None]:
# Number the mutations 0..n-1 in a 'row' column. It is deliberately kept as a regular
# column rather than the index, because the labels are later merged onto it by name.
# (A bare `mutations.set_index('row')` returns a new frame and leaves `mutations`
# unchanged, so it is not used here.)
mutations['row'] = np.arange(len(mutations))
mutations.head()

In [None]:
# Read the tab-separated clinical file, keeping only the listed columns.
print('Loading clinical dataframe ...')
clinical = pd.read_csv(
    "pancancer_clinical.txt",
    sep='\t',
    usecols=[
        'bcr_patient_uuid', 'bcr_patient_barcode', 'gender',
        'vital_status', 'days_to_birth', 'days_to_death',
        'age_at_initial_pathologic_diagnosis', 'pathologic_stage',
        'height', 'weight',
    ],
)

# count() tallies the non-null patient UUIDs.
print('Clinical count', clinical['bcr_patient_uuid'].count())
display(clinical.head())

In [None]:
# Read the cancer types. These are the matching labels for the mutations file; they are
# paired with mutations by position through a 0-based 'row' column.
# NOTE(review): with these arguments read_csv takes the first line of labels.txt as a
# header — confirm the file has a header row, otherwise the first label is lost and
# every label is shifted by one relative to the mutations.
print("Loading labels dataframe ...")
labels = pd.read_csv("labels.txt", sep='\t')
labels.columns = ["cancer_type"]
# 'row' stays a regular column (not the index): the merge with mutations joins on it.
labels['row'] = np.arange(len(labels))
print('Labels count', labels.cancer_type.count())

In [None]:
# Attach the cancer-type label to each mutation by joining on the shared 'row' column
# (default inner join: only rows present in both frames are kept).
# Label columns left over from a previous run of this cell are dropped first, so
# re-running it does not produce suffixed duplicates (cancer_type_x / cancer_type_y).
label_columns = labels.columns.difference(['row'])
mutations = mutations.drop(columns=label_columns, errors='ignore').merge(labels, on='row')
display(mutations.head())

In [None]:
# Get the patient barcode.  This is what we will use to join the mutations to the clinical data
def parsePatientBarcode(tumor_sample_barcode):
    """Return the first 12 characters of ``tumor_sample_barcode``.

    Inputs shorter than 12 characters are returned unchanged.
    """
    return tumor_sample_barcode[:12]

# Derive the clinical join key from every sample barcode, element by element.
mutations['bcr_patient_barcode'] = mutations['Tumor_Sample_Barcode'].map(parsePatientBarcode)

In [None]:
# Join each mutation to clinical data on the shared patient barcode. This is the
# default inner join, so mutations without a matching clinical row are not kept.
merged = mutations.merge(clinical, on='bcr_patient_barcode')
display(merged.head())
# count() tallies the non-null sample barcodes in the merged frame.
print('Merged mutations count', merged.Tumor_Sample_Barcode.count())

In [None]:
# Persist the merged frame as CSV. With to_csv's default arguments the frame's index
# is written as well, as the first column of the file.
print("Writing new csv file with merged mutations, clinical, and cancer types ...")
merged.to_csv(path_or_buf="pancancer_mutations_merged.csv")
print("done.")