In [1]:
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display
import os

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression statement in a cell, not just the last
# one.  Cells below that list several expressions in a row rely on this setting.
InteractiveShell.ast_node_interactivity = "all"

# Data Collection #

For our analysis of cancer prediction using gene mutation and clinical data from patients, we will gather data from multiple sources. First we obtain the gene mutation data from the $cancer.gov$ website. Next we get the patient clinical data, also from the $cancer.gov$ website. In our notebook, we store this data locally so that it does not have to be downloaded again if the notebook kernel is restarted and run multiple times.

In [2]:
# This downloads a 753 MB mutations gzip file.  This will take about 1-5 mins depending on your
# connection speed.  The download is skipped when the file is already present locally.
filename = "pancancer_mutations.maf.gz"
if os.path.isfile(filename):
    print("Skipping download, as file %s is present" %(filename))
else:
    print('Downloading mutation data. 753 MB (may take a few minutes)...')
    url = 'http://api.gdc.cancer.gov/data/1c8cfe5f-e52d-41ba-94da-f15ea1337efc'
    # Download under a temporary name and rename only after the transfer has finished.
    # Writing straight to `filename` would leave a truncated file behind if the download
    # were interrupted, and the isfile() check above would then accept it as complete.
    partial_filename = filename + ".part"
    urllib.request.urlretrieve(url, partial_filename)
    os.replace(partial_filename, filename)
print("done.")

Skipping download, as file pancancer_mutations.maf.gz is present
done.


In [3]:
# This downloads an 18 MB clinical data file.  The download is skipped when the file is
# already present locally.
filename = "pancancer_clinical.txt"
if os.path.isfile(filename):
    print("Skipping download, as file %s is present" %(filename))
else:
    print('Downloading clinical data ...')
    url = 'http://api.gdc.cancer.gov/data/0fc78496-818b-4896-bd83-52db1f533c5c'
    # Download under a temporary name and rename only after the transfer has finished.
    # Writing straight to `filename` would leave a truncated file behind if the download
    # were interrupted, and the isfile() check above would then accept it as complete.
    partial_filename = filename + ".part"
    urllib.request.urlretrieve(url, partial_filename)
    os.replace(partial_filename, filename)
print("done.")

Skipping download, as file pancancer_clinical.txt is present
done.


## Loading Gene Mutation Data ##

Here we read the gene mutation data. This data file contains many columns, but after careful curation, we have decided to consider the following columns:

1. **Tumor_Sample_Barcode**: this contains the barcode with the first 12 characters identifying the patient
2. **Hugo_Symbol**: this is the actual gene that has been mutated (e.g., TACC2, JAKMIP3, PANX3)
3. **BIOTYPE**: this indicates if the gene is protein coding or not.
4. **Chromosome**  **Start_Position** **End_Position** **Strand**: the chromosome, start position and end position tells us the location of the gene where the mutation is seen.  Strand indicates if it is on the forward or reverse strand of the DNA.
5. **Variant_Type**: this indicates if it is a single nucleotide substitution (SNP), a small deletion (DEL), a small insertion (INS), a two nucleotide substitution (DNP), a three nucleotide substitution (TNP), or a substitution of more than three nucleotides (ONP)
6. **Variant_Classification**: this indicates what kind of molecular effect that this mutation will have on the protein.  The most common classes indicate if the substitution causes a change to the amino acid (missense vs silent).  Nonsense mutations cause premature termination of the protein; frameshift mutations cause a misreading of the amino acid sequence.
7. **IMPACT**: this indicates how damaging the mutation is -- HIGH, MODERATE, MODIFIER, or LOW.


In [4]:
# Load the mutations dataframe, keeping only the curated columns described above.
print('Loading mutations dataframe ...')
mutations = pd.read_csv("pancancer_mutations.maf.gz", compression='gzip',
                        sep='\t',
                        usecols=['Tumor_Sample_Barcode','Hugo_Symbol', 'BIOTYPE',
                                'Chromosome', 'Start_Position',  'End_Position', 'Strand',
                                'Variant_Type',  'Variant_Classification', 'IMPACT' ],
                        # Read chromosome names as text so every row gets the same type
                        # instead of a type inferred separately for each parsed chunk.
                        # NOTE(review): the recorded run emitted a warning here, presumably
                        # pandas' mixed-dtype warning for this column -- confirm.
                        dtype={'Chromosome': str})
print("done.")
print("Mutations count", mutations.Tumor_Sample_Barcode.count())

Loading mutations dataframe ...


  interactivity=interactivity, compiler=compiler, result=result)


done.
Mutations count 3600963


In [5]:
# Preview the first five mutation records.
mutations.head(5)

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,BIOTYPE,IMPACT
0,TACC2,10,123810032,123810032,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
1,JAKMIP3,10,133967449,133967449,+,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,LOW
2,PANX3,11,124489539,124489539,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
3,SPI1,11,47380512,47380512,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
4,NAALAD2,11,89868837,89868837,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE


In [6]:
# Number of distinct gene symbols, followed by the distinct values of each categorical
# column.  Every expression below is rendered because ast_node_interactivity is "all".
len(mutations.Hugo_Symbol.unique())
mutations.BIOTYPE.unique().tolist()
mutations.IMPACT.unique().tolist()
mutations.Variant_Classification.unique().tolist()
mutations.Variant_Type.unique().tolist()

21332

['protein_coding',
 'transcribed_unprocessed_pseudogene',
 'IG_V_gene',
 'processed_transcript',
 'lincRNA',
 'miRNA',
 'TR_V_gene',
 'antisense',
 'snoRNA',
 'IG_C_gene',
 'nonsense_mediated_decay',
 'TR_C_gene',
 'rRNA',
 'polymorphic_pseudogene',
 'retained_intron',
 'TR_J_gene',
 'snRNA',
 'unprocessed_pseudogene',
 'sense_overlapping',
 'misc_RNA',
 'transcribed_processed_pseudogene',
 'processed_pseudogene',
 'sense_intronic',
 '3prime_overlapping_ncrna',
 'pseudogene',
 'IG_J_gene',
 'IG_D_gene',
 'unitary_pseudogene']

['MODERATE', 'LOW', 'MODIFIER', 'HIGH']

['Missense_Mutation',
 'Silent',
 "5'Flank",
 "3'UTR",
 'RNA',
 'In_Frame_Del',
 'Nonsense_Mutation',
 'Splice_Site',
 'Intron',
 "5'UTR",
 'In_Frame_Ins',
 'Frame_Shift_Del',
 'Nonstop_Mutation',
 "3'Flank",
 'Frame_Shift_Ins',
 'Translation_Start_Site']

['SNP', 'DEL', 'INS', 'TNP', 'ONP']

In [7]:
# Set mutations index
# Add an explicit positional 'row' column.  It stays a regular column (not the index)
# because the label merge further down joins on it.
mutations['row'] = np.arange(len(mutations))
# set_index() returns a new frame and leaves `mutations` unchanged, so this is only a
# preview of the data keyed by 'row'; head() keeps the rendered output small.
mutations.set_index('row').head(10)

Unnamed: 0_level_0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,BIOTYPE,IMPACT
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,TACC2,10,123810032,123810032,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
1,JAKMIP3,10,133967449,133967449,+,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,LOW
2,PANX3,11,124489539,124489539,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
3,SPI1,11,47380512,47380512,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
4,NAALAD2,11,89868837,89868837,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
5,FAT3,11,92570936,92570936,+,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,LOW
6,MTERFD3,12,107371855,107371855,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
7,BTBD11,12,108012011,108012011,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE
8,NOS1,12,117768962,117768962,+,5'Flank,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODIFIER
9,SLC2A14,12,7980269,7980269,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE


The actual cancer type for each of the gene mutations above is kept in a different file. The entries are located at corresponding rows, so we use that to merge the cancer type from the labels.txt file.

In [8]:
# Read the cancertypes.  These are the matching labels for the mutations file
# Read the cancer types: one label per mutation row, in the same order as the
# mutations file.
print("Loading labels dataframe ...")
labels = pd.read_csv("labels.txt", sep='\t')
labels.columns = ["cancer_type"]
# Positional key used to join the labels onto the mutations; kept as a regular column.
labels['row'] = np.arange(len(labels))
# set_index() returns a new frame and leaves `labels` unchanged, so this is only a
# preview of the data keyed by 'row'; head() keeps the rendered output small.
labels.set_index('row').head(10)
print('Labels count', labels.cancer_type.count())

Loading labels dataframe ...


Unnamed: 0_level_0,cancer_type
row,Unnamed: 1_level_1
0,GBM
1,GBM
2,GBM
3,GBM
4,GBM
5,GBM
6,GBM
7,GBM
8,GBM
9,GBM


Labels count 3600963


As we can see, there are **3600963** data points in both the gene dataframe and the cancer label dataframe. So now we merge the label information into the main mutations dataframe. The idea is that when we bring in the patient information, we will go from a _row for each gene_ to a _row for each patient_, with all the gene mutation data for that patient added as columns. With this, the cancer type (label) will also get included as a column. 

In [9]:
# Merge the labels with the mutations
# Labels are matched to mutations purely by row position, so the two frames must have
# the same length; with an inner join a mismatch would otherwise silently drop rows.
assert len(labels) == len(mutations), "labels.txt must contain one row per mutation"
# Drop a cancer_type column left by an earlier run of this cell so that re-running it
# does not produce cancer_type_x / cancer_type_y duplicates.
mutations = mutations.drop(['cancer_type'], axis=1, errors='ignore')
# validate='one_to_one' makes pandas raise if 'row' is duplicated on either side.
mutations = mutations.merge(labels, on='row', validate='one_to_one')
display(mutations.head())

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Tumor_Sample_Barcode,BIOTYPE,IMPACT,row,cancer_type
0,TACC2,10,123810032,123810032,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE,0,GBM
1,JAKMIP3,10,133967449,133967449,+,Silent,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,LOW,1,GBM
2,PANX3,11,124489539,124489539,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE,2,GBM
3,SPI1,11,47380512,47380512,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE,3,GBM
4,NAALAD2,11,89868837,89868837,+,Missense_Mutation,SNP,TCGA-02-0003-01A-01D-1490-08,protein_coding,MODERATE,4,GBM


## Loading Patient Data ##

Here we load the clinical data. This is data for patients for whom we collected the gene mutation data above. The patients are identified by $bcr\_patient\_barcode$. We will use this field to populate the gene mutation data from the dataframe above in the table we are about to read. The clinical data has patient information such as age, gender etc.

In [10]:
# Load the clinical data
print('Loading clinical dataframe ...')
# Read the patient-level clinical table, restricted to the fields used in this analysis.
# NOTE(review): the recorded run printed a warning while reading this file -- it looks
# like pandas' mixed-dtype warning; consider passing explicit dtypes.  TODO confirm.
clinical_columns = [
    'bcr_patient_barcode', 'gender',
    'vital_status', 'days_to_birth', 'days_to_death',
    'age_at_initial_pathologic_diagnosis', 'pathologic_stage',
    'height', 'weight', 'menopause_status', 'race', 'ethnicity',
]
clinical = pd.read_csv("pancancer_clinical.txt", sep='\t', usecols=clinical_columns)

print('Clinical count', clinical['bcr_patient_barcode'].count())
display(clinical.head())

Loading clinical dataframe ...
Clinical count 10956


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,bcr_patient_barcode,gender,vital_status,days_to_birth,days_to_death,age_at_initial_pathologic_diagnosis,race,ethnicity,pathologic_stage,height,weight,menopause_status
0,TCGA-OR-A5J1,MALE,Dead,-21496,1355.0,58,WHITE,[Not Evaluated],Stage II,,,
1,TCGA-OR-A5J2,FEMALE,Dead,-16090,1677,44,WHITE,HISPANIC OR LATINO,Stage IV,,,
2,TCGA-OR-A5J3,FEMALE,Alive,-8624,[Not Applicable],23,WHITE,HISPANIC OR LATINO,Stage III,,,
3,TCGA-OR-A5J4,FEMALE,Dead,-8451,423,23,WHITE,HISPANIC OR LATINO,Stage IV,,,
4,TCGA-OR-A5J5,MALE,Dead,-11171,365,30,WHITE,HISPANIC OR LATINO,Stage III,,,


## Creating Merged Data ##

Now that we have both gene and cancer data in one dataframe, and the patient clinical data in another dataframe, we will use the **bcr_patient_barcode** to merge these into a single table. With this, we can drop the Tumor_Sample_Barcode column, since it has served its purpose. Looking at the data, some patients in the clinical data have no gene mutation data, and some patients in the gene dataframe have no corresponding clinical data. Hence our merged dataframe will have fewer rows than the original mutations dataframe.

In [16]:
# Get the patient barcode.  This is what we will use to join the mutations to the clinical data
def parsePatientBarcode(tumor_sample_barcode):
    """Return the patient portion of a tumor sample barcode.

    The patient identifier is the first 12 characters of the sample barcode
    (e.g. ``TCGA-02-0003`` for ``TCGA-02-0003-01A-01D-1490-08``).  Shorter
    inputs are returned unchanged.
    """
    patient_barcode_length = 12
    return tumor_sample_barcode[:patient_barcode_length]

# Derive the patient key from each sample barcode, then discard the sample barcode
# column, which is no longer needed once the key exists.
mutations['bcr_patient_barcode'] = mutations.Tumor_Sample_Barcode.map(parsePatientBarcode)
mutations = mutations.drop('Tumor_Sample_Barcode', axis='columns')
mutations.head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,BIOTYPE,IMPACT,row,cancer_type,bcr_patient_barcode
0,TACC2,10,123810032,123810032,+,Missense_Mutation,SNP,protein_coding,MODERATE,0,GBM,TCGA-02-0003
1,JAKMIP3,10,133967449,133967449,+,Silent,SNP,protein_coding,LOW,1,GBM,TCGA-02-0003
2,PANX3,11,124489539,124489539,+,Missense_Mutation,SNP,protein_coding,MODERATE,2,GBM,TCGA-02-0003
3,SPI1,11,47380512,47380512,+,Missense_Mutation,SNP,protein_coding,MODERATE,3,GBM,TCGA-02-0003
4,NAALAD2,11,89868837,89868837,+,Missense_Mutation,SNP,protein_coding,MODERATE,4,GBM,TCGA-02-0003


In [17]:
# True if any clinical record lacks a patient barcode (the key used for the merge below).
clinical.bcr_patient_barcode.isnull().values.any()

False

In [18]:
# Count patients that appear in the mutation data but have no clinical record.
# Each barcode set is built exactly once and compared with a set difference, rather
# than re-deriving the clinical barcode set for every patient checked.
gene_barcode_set = set(mutations.bcr_patient_barcode.unique())
clinical_barcode_set = set(clinical.bcr_patient_barcode.unique())
missing_count = len(gene_barcode_set - clinical_barcode_set)
print("%d patients with gene data missing in clinical data" %missing_count)

216 patients with gene data missing in clinical data


In [19]:
# Join each mutation to its patient's clinical record on the shared barcode column.
# merge() defaults to an inner join, so mutations without a clinical record are dropped.
merged = mutations.merge(clinical, on='bcr_patient_barcode')
display(merged.head())
merged_count = merged.bcr_patient_barcode.count()
merged_count
print('Merged mutations count', merged_count)

Unnamed: 0,Hugo_Symbol,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,BIOTYPE,IMPACT,row,...,vital_status,days_to_birth,days_to_death,age_at_initial_pathologic_diagnosis,race,ethnicity,pathologic_stage,height,weight,menopause_status
0,TACC2,10,123810032,123810032,+,Missense_Mutation,SNP,protein_coding,MODERATE,0,...,Dead,-18341,144.0,50,WHITE,NOT HISPANIC OR LATINO,,,,
1,JAKMIP3,10,133967449,133967449,+,Silent,SNP,protein_coding,LOW,1,...,Dead,-18341,144.0,50,WHITE,NOT HISPANIC OR LATINO,,,,
2,PANX3,11,124489539,124489539,+,Missense_Mutation,SNP,protein_coding,MODERATE,2,...,Dead,-18341,144.0,50,WHITE,NOT HISPANIC OR LATINO,,,,
3,SPI1,11,47380512,47380512,+,Missense_Mutation,SNP,protein_coding,MODERATE,3,...,Dead,-18341,144.0,50,WHITE,NOT HISPANIC OR LATINO,,,,
4,NAALAD2,11,89868837,89868837,+,Missense_Mutation,SNP,protein_coding,MODERATE,4,...,Dead,-18341,144.0,50,WHITE,NOT HISPANIC OR LATINO,,,,


3570876

Merged mutations count 3570876


In [20]:
# Write out new merged dataframe as csv file
# Write the merged dataframe to a CSV file in the working directory.  The dataframe
# index is written too, as the first (unnamed) column.
merged_csv_path = "pancancer_mutations_merged.csv"
print("Writing new csv file with merged mutations, clinical, and cancer types ...")
merged.to_csv(merged_csv_path)
print("done.")

Writing new csv file with merged mutations, clinical, and cancer types ...
done.
