In [None]:
import pandas as pd
import urllib.request
import numpy as np
from IPython.display import display

In [None]:
# Read the merged pan-cancer mutation table into `mutations` and report how
# many rows carry a (non-null) patient barcode.
print('Loading merged data ...')
merged_csv = "pancancer_mutations_merged.csv"
mutations = pd.read_csv(merged_csv)
print("done.")
barcode_count = mutations['bcr_patient_barcode'].count()
print("Mutations count", barcode_count)

In [None]:
# Preview the first five rows of the loaded table as a quick sanity check.
mutations.head(n=5)

In [None]:
# Split the mutation rows on the BIOTYPE column and list the distinct gene
# symbols found on each side of the split.
is_protein_coding = mutations['BIOTYPE'] == 'protein_coding'

mutations_coding_genes = mutations[is_protein_coding]
# Every row whose BIOTYPE is not exactly 'protein_coding' ends up here.
mutations_non_coding_genes = mutations[~is_protein_coding]

coding_symbols = mutations_coding_genes['Hugo_Symbol']
non_coding_symbols = mutations_non_coding_genes['Hugo_Symbol']
coding_genes = list(coding_symbols.unique())
non_coding_genes = list(non_coding_symbols.unique())

print("Number of coding genes:", len(coding_genes))
print("Number of non-coding genes:", len(non_coding_genes))
                                  



In [None]:
# Get number of cases per cancer type.
# A patient barcode can appear on many rows of `mutations`, so rows must not be
# counted directly: count the distinct patient barcodes within each cancer type.
# (Grouping by (cancer_type, barcode) and taking the max of the group sizes
# would instead give the largest per-patient row count in each cancer type.)
mutations.groupby('cancer_type')['bcr_patient_barcode'].nunique()

In [None]:
# Per-gene row counts: group by gene symbol, count the non-null entries of
# every column, then keep only the count of the table's first column as a
# one-column DataFrame.
first_column = mutations.columns[0:1]
per_gene_counts = mutations.groupby('Hugo_Symbol').count()
unique_genes = per_gene_counts[first_column]

In [None]:
# Label the single count column 'freq' so later cells can refer to it by name.
unique_genes = unique_genes.set_axis(['freq'], axis=1)

In [None]:
# Show the genes ordered from most to least frequent (display only; the
# sorted frame is not stored).
unique_genes.sort_values(by='freq', ascending=False)

In [None]:
# Median of the per-gene frequency column.
unique_genes.median(axis=0)

In [None]:
# Two views of the gene-frequency distribution: all genes, then only the
# genes whose frequency satisfies 10 <= freq < 700.
freq_in_window = (unique_genes['freq'] >= 10) & (unique_genes['freq'] < 700)

unique_genes.hist(bins=200, figsize=(12, 4))
unique_genes[freq_in_window].hist(bins=100, figsize=(12, 4))



In [None]:
# Candidate genes: those with 10 <= freq < 700, most frequent first.
# NOTE(review): `freq` is a per-gene row count of `mutations`, not a count of
# distinct cases, even though the printed label says "case prevalence" - confirm
# this is the intended measure.
freq_in_window = (unique_genes['freq'] >= 10) & (unique_genes['freq'] < 700)
candidate_list = unique_genes[freq_in_window].sort_values(by='freq', ascending=False)
n_candidates = candidate_list['freq'].count()
print("Number of genes with case prevalence (10-700):", n_candidates)


In [None]:
# Build the per-gene frequency table, summarise it, and select the "targeted"
# genes that will become feature columns.
#
# `freq` is the number of non-null entries of the first column of `mutations`
# within each Hugo_Symbol group.

# A gene is targeted when FEATURE_FREQ_MIN < freq < FEATURE_FREQ_MAX (both exclusive).
FEATURE_FREQ_MIN = 400
FEATURE_FREQ_MAX = 700


def _print_freq_summary(title, freq_table):
    """Print the gene count and the min / max / median of ``freq_table['freq']``.

    Parameters
    ----------
    title : str
        Text printed in front of the gene count.
    freq_table : pandas.DataFrame
        Frame with one row per gene and a ``freq`` column.

    For an empty table only the gene count is printed: the median of an empty
    column is NaN and cannot be converted to ``int``.
    """
    print(title, len(freq_table))
    if freq_table.empty:
        return
    # Select the column by label; positional `series[0]` on a label-indexed
    # Series is deprecated in recent pandas releases.
    freq = freq_table['freq']
    print('  min gene frequency', freq.min())
    print('  max gene frequency', freq.max())
    # The statistic reported here is the median, so it is labelled as such.
    print('  median gene frequency', int(freq.median()))


# Get a list of the unique gene names with their frequencies, most frequent first.
grouped = mutations.groupby('Hugo_Symbol', sort=False).count()
unique_genes = grouped[mutations.columns[0:1]]
unique_genes.columns = ['freq']
unique_genes = unique_genes.sort_values('freq', ascending=False)

display(unique_genes.head(10))

_print_freq_summary('All genes\t', unique_genes)
unique_genes.hist(bins=100, figsize=(6, 4))

in_feature_window = (unique_genes['freq'] > FEATURE_FREQ_MIN) & (unique_genes['freq'] < FEATURE_FREQ_MAX)
feature_freqs = unique_genes[in_feature_window]
_print_freq_summary('\nTargeted genes\t', feature_freqs)
if not feature_freqs.empty:
    feature_freqs.hist(bins=100, figsize=(6, 4))

# Downstream cells read `feature_genes.Hugo_Symbol`: keep a one-column frame of
# the selected gene names (the column takes its name from the index).
feature_genes = pd.DataFrame(feature_freqs.index)

In [None]:
# Build one feature row per case (patient barcode):
#   case, cancer_type, gender, age_at_diag, then one 0/1 column per feature gene
# where 1 means the gene appears among that case's rows in `mutations`.
gene_names = list(feature_genes.Hugo_Symbol)
cols = ['case', 'cancer_type', 'gender', 'age_at_diag'] + gene_names

cases = []
grouped = mutations.groupby('bcr_patient_barcode')
for name, group in grouped:
    # Clinical attributes are taken from the first row of the case's group.
    clinical = [
        name,
        group.cancer_type.iloc[0],
        group.gender.iloc[0],
        group.age_at_initial_pathologic_diagnosis.iloc[0],
    ]
    # Set membership gives the same 0/1 flags as a per-case isin() call
    # without rebuilding a boolean Series for every case.
    mutated_genes = set(group.Hugo_Symbol)
    gene_flags = [int(gene in mutated_genes) for gene in gene_names]
    cases.append(clinical + gene_flags)

# Passing `columns=` to the constructor also works when there are no cases;
# assigning `.columns` afterwards raises a length mismatch on an empty frame.
cases_df = pd.DataFrame(cases, columns=cols)
print("Number of rows in full dataset", cases_df.case.count())


In [None]:
# Show the case feature matrix as this cell's output.
cases_df

In [None]:
# Persist the case feature matrix as CSV (the DataFrame index is written too,
# since to_csv is called with its defaults).
output_csv = "pancancer_case_features.csv"
print("Writing case feature matrix to csv ...")
cases_df.to_csv(output_csv)
print("done.")