In [12]:
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display
import os

In [17]:
# This downloads a 753 MB mutations gzip file.  This will take about 1-5 mins depending on your
# connection speed.
filename = "pancancer_mutations.maf.gz"
if os.path.isfile(filename):
    print("Skipping download, as file %s is present" %(filename))
else:
    print('Downloading mutation data. 753 MB ...')
    url = 'http://api.gdc.cancer.gov/data/1c8cfe5f-e52d-41ba-94da-f15ea1337efc'  
    urllib.request.urlretrieve(url, './pancancer_mutations.maf.gz')  
print("done.")

Skipping download, as file pancancer_mutations.maf.gz is present
done.


In [19]:
# This downloads an 18 MB clinical data file
filename = "pancancer_clinical.txt"
if os.path.isfile(filename):
    print("Skipping download, as file %s is present" %(filename))
else:
    print('Downloading clinical data ...')
    url = 'http://api.gdc.cancer.gov/data/0fc78496-818b-4896-bd83-52db1f533c5c'  
    urllib.request.urlretrieve(url, './pancancer_clinical.txt')  
print("done.")

Skipping download, as file pancancer_clinical.txt is present
done.


In [55]:
# Load the mutations dataframe
print('Loading mutations dataframe ...')
mutations = pd.read_csv("pancancer_mutations.maf.gz", compression='gzip',
                        sep='\t',
                        usecols=['Tumor_Sample_Barcode','Hugo_Symbol', 
                                 'Variant_Classification', 'Variant_Type',
                                'Chromosome', 'CENTERS'])
print("done.")
print("Mutations count", mutations.Tumor_Sample_Barcode.count())
display(mutations.head())

Loading mutations dataframe ...


  interactivity=interactivity, compiler=compiler, result=result)


done.
Mutations count 3600963


Unnamed: 0,Hugo_Symbol,Chromosome,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,CENTERS
0,TACC2,10,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
1,JAKMIP3,10,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
2,PANX3,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
3,SPI1,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,RADIA|MUSE
4,NAALAD2,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS


In [56]:
mutations.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,CENTERS
0,TACC2,10,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
1,JAKMIP3,10,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
2,PANX3,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
3,SPI1,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,RADIA|MUSE
4,NAALAD2,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS


In [57]:
# Set mutations index
mutations['row'] = np.arange(len(mutations))
mutations.set_index('row')

Unnamed: 0_level_0,Hugo_Symbol,Chromosome,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,CENTERS
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,TACC2,10,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
1,JAKMIP3,10,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
2,PANX3,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
3,SPI1,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,RADIA|MUSE
4,NAALAD2,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
5,FAT3,11,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS
6,MTERFD3,12,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
7,BTBD11,12,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS
8,NOS1,12,5'Flank,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|SOMATICSNIPER|MUSE|VARSCANS
9,SLC2A14,12,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,SOMATICSNIPER|RADIA|MUTECT|MUSE|VARSCANS


In [59]:
# Load the clinical data
print('Loading clinical dataframe ...')
clinical = pd.read_csv("pancancer_clinical.txt", sep='\t',
                        usecols=['bcr_patient_uuid','bcr_patient_barcode', 'gender', 
                                 'vital_status', 'days_to_birth', 'days_to_death', 
                                 'age_at_initial_pathologic_diagnosis', 'pathologic_stage', 
                                 'height', 'weight'])

print('Clinical count', clinical.bcr_patient_uuid.count())
display(clinical.head())

Loading clinical dataframe ...
Clinical count 10956


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,bcr_patient_uuid,bcr_patient_barcode,gender,vital_status,days_to_birth,days_to_death,age_at_initial_pathologic_diagnosis,pathologic_stage,height,weight
0,B3164F7B-C826-4E08-9EE6-8FF96D29B913,TCGA-OR-A5J1,MALE,Dead,-21496,1355.0,58,Stage II,,
1,8E7C2E31-D085-4B75-A970-162526DD07A0,TCGA-OR-A5J2,FEMALE,Dead,-16090,1677,44,Stage IV,,
2,DFD687BC-6E69-42F7-AF94-D17FC150D1A1,TCGA-OR-A5J3,FEMALE,Alive,-8624,[Not Applicable],23,Stage III,,
3,5F3E2974-F1DF-47A2-8A8A-29BB525EEEF6,TCGA-OR-A5J4,FEMALE,Dead,-8451,423,23,Stage IV,,
4,802DBD0D-EF07-4C91-AB8D-1DD39532E947,TCGA-OR-A5J5,MALE,Dead,-11171,365,30,Stage III,,


In [60]:
# Read the cancertypes.  These are the matching labels for the mutations file
print("Loading labels dataframe ...")
labels = pd.read_csv("labels.txt", sep='\t')
labels.columns = ["cancer_type"]
labels['row'] = np.arange(len(labels))
labels.set_index('row')
print('Labels count', labels.cancer_type.count())

Loading labels dataframe ...
Labels count 3600963


In [62]:
# Merge the labels with the mutations
mutations = mutations.merge(labels, left_on='row', right_on='row')
display(mutations.head())

Unnamed: 0,Hugo_Symbol,Chromosome,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,CENTERS,row,cancer_type
0,TACC2,10,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,0,GBM
1,JAKMIP3,10,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,1,GBM
2,PANX3,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,2,GBM
3,SPI1,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,RADIA|MUSE,3,GBM
4,NAALAD2,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,4,GBM


In [63]:
# Get the patient barcode.  This is what we will use to join the mutations to the clinical data
def parsePatientBarcode(tumor_sample_barcode):
        return tumor_sample_barcode[0:12]

mutations['bcr_patient_barcode'] = mutations['Tumor_Sample_Barcode'].apply(parsePatientBarcode)

In [64]:
merged = mutations.merge(clinical, left_on='bcr_patient_barcode', right_on='bcr_patient_barcode')
display(merged.head())
merged.Tumor_Sample_Barcode.count()
print('Merged mutations count', merged.Tumor_Sample_Barcode.count())

Unnamed: 0,Hugo_Symbol,Chromosome,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,CENTERS,row,cancer_type,bcr_patient_barcode,bcr_patient_uuid,gender,vital_status,days_to_birth,days_to_death,age_at_initial_pathologic_diagnosis,pathologic_stage,height,weight
0,TACC2,10,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,0,GBM,TCGA-02-0003,df3c1d61-79c1-43e9-971a-8029497ffeab,MALE,Dead,-18341,144.0,50,,,
1,JAKMIP3,10,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,1,GBM,TCGA-02-0003,df3c1d61-79c1-43e9-971a-8029497ffeab,MALE,Dead,-18341,144.0,50,,,
2,PANX3,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,2,GBM,TCGA-02-0003,df3c1d61-79c1-43e9-971a-8029497ffeab,MALE,Dead,-18341,144.0,50,,,
3,SPI1,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,RADIA|MUSE,3,GBM,TCGA-02-0003,df3c1d61-79c1-43e9-971a-8029497ffeab,MALE,Dead,-18341,144.0,50,,,
4,NAALAD2,11,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,MUTECT|RADIA|SOMATICSNIPER|MUSE|VARSCANS,4,GBM,TCGA-02-0003,df3c1d61-79c1-43e9-971a-8029497ffeab,MALE,Dead,-18341,144.0,50,,,


Merged mutations count 3570876


In [65]:
# Write out new merged dataframe as csv file
print("Writing new csv file with merged mutations, clinical, and cancer types ...")
merged.to_csv("pancancer_mutations_merged.csv")
print("done.")

Writing new csv file with merged mutations, clinical, and cancer types ...
done.
