In [None]:
import pandas as pd
import urllib.request
import numpy as np
import matplotlib as plt
from IPython.display import display

# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one. Later cells that contain more than one bare
# expression (e.g. `.shape` followed by `.head()`) depend on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the merged pan-cancer mutation file, reading only the four columns
# that the rest of the notebook uses.
print('Loading merged data ...')
mutations_raw = pd.read_csv("pancancer_mutations_merged.csv",
                        usecols=['cancer_type', 'bcr_patient_barcode', 'Hugo_Symbol', 'BIOTYPE'])
print("done.")
# Count rows on the frame that was just loaded. The filtered `mutations`
# frame is only created in a later cell, so referring to it here raises
# NameError when the notebook is run top to bottom on a fresh kernel.
print("Mutations count", mutations_raw['bcr_patient_barcode'].count())

In [None]:
# Preview the first rows of the raw mutation table as a quick check on the load.
mutations_raw.head()

In [None]:
# Partition the raw rows on BIOTYPE: protein-coding vs. everything else.
biotype = mutations_raw['BIOTYPE']
mutations = mutations_raw.loc[biotype == 'protein_coding']
mutations_non_coding_genes = mutations_raw.loc[biotype != 'protein_coding']

# Distinct gene symbols on each side of the split, in order of first appearance.
coding_genes = mutations['Hugo_Symbol'].drop_duplicates().tolist()
non_coding_genes = mutations_non_coding_genes['Hugo_Symbol'].drop_duplicates().tolist()

print("Number of coding genes:", len(coding_genes))
print("Number of non-coding genes:", len(non_coding_genes))
                                  



In [None]:
# Report which cancer types occur in the `mutations` frame, and how many.
cancer_types = mutations['cancer_type'].unique()
print("\nNumber of cancer types", len(cancer_types))
print(np.sort(cancer_types))

# Distinct patient barcodes within each cancer type. The printed total is the
# sum of these per-type counts.
group_patients_by_cancer = (
    mutations
    .groupby('cancer_type')['bcr_patient_barcode']
    .nunique()
)
print("\nNumber of patients", group_patients_by_cancer.sum())
group_patients_by_cancer.plot(kind='bar', figsize=(12, 4))

In [None]:
# Distinct gene symbols observed within each cancer type, shown as a bar chart.
group_genes_by_cancer = (
    mutations
    .groupby('cancer_type')['Hugo_Symbol']
    .nunique()
)
group_genes_by_cancer.plot(kind='bar', figsize=(12, 4))

# Summary statistics of the per-type gene counts, rounded to whole numbers.
summary_lines = (
    ("Mean number of genes represented for each cancer type:", group_genes_by_cancer.mean()),
    ("Min number of genes represented for each cancer type:", group_genes_by_cancer.min()),
    ("Max number of genes represented for each cancer type:", group_genes_by_cancer.max()),
)
for message, statistic in summary_lines:
    print(message, int(np.round(statistic)))

In [None]:
# Maximum number of genes to keep for each cancer type.
top_n_genes = 500

# Count the distinct patients that carry a mutation in each gene, per cancer
# type. These per-type rankings are merged below into one master gene list.
cancer_gene_count = mutations.groupby(['cancer_type', 'Hugo_Symbol'])['bcr_patient_barcode'].nunique().reset_index(name='count')
cancer_gene_count.columns = ['cancer_type', 'gene', 'patient_count']

# Wide matrix: one row per gene, one column per cancer type; pairs that never
# occur are filled with 0. The string 'sum' is passed instead of np.sum, which
# newer pandas releases warn about when given as a callable.
df = pd.DataFrame(cancer_gene_count, columns=['cancer_type', 'gene', 'patient_count'])
gene_cancer_matrix = pd.pivot_table(df, values='patient_count', index=['gene'],
                     columns=['cancer_type'], aggfunc='sum', fill_value=0)

# For each cancer type keep the (at most) top_n_genes genes with a non-zero
# patient count. A stable sort is requested so that genes tied at the cutoff
# are taken in the matrix's row order rather than in whatever order an
# unstable sort happens to leave them.
top_genes = []
for cancer_type in gene_cancer_matrix.columns:
    patient_counts = gene_cancer_matrix[cancer_type]
    top_rows = (
        patient_counts[patient_counts > 0]
        .sort_values(ascending=False, kind='mergesort')
        .head(top_n_genes)
    )
    top_genes.extend(
        [cancer_type, gene, patient_count]
        for gene, patient_count in top_rows.items()
    )

# Pivot the selected rows back into a gene x cancer-type matrix. A gene that
# made the cut for one cancer type shows 0 for the types where it did not.
top_df = pd.DataFrame(top_genes, columns=['cancer_type', 'gene', 'patient_count'])
top_gene_cancer_matrix = pd.pivot_table(top_df, values='patient_count', index=['gene'],
                     columns=['cancer_type'], aggfunc='sum', fill_value=0)
top_gene_cancer_matrix.head()
print(top_gene_cancer_matrix.shape)



    
    

In [None]:
# Rank genes by the number of distinct patients carrying a mutation in them,
# pooled over all cancer types (highest count first, gene name breaks ties).
gene_count = (
    mutations
    .groupby('Hugo_Symbol')['bcr_patient_barcode']
    .nunique()
    .reset_index(name='count')
)
gene_count.columns = ['gene', 'patient_count']
gene_count = gene_count.sort_values(by=['patient_count', 'gene'], ascending=[False, True])

patients_per_gene = gene_count['patient_count']
print('Genes by patient frequency')
print("  mean:", int(patients_per_gene.mean()))
print("  min: ", int(patients_per_gene.min()))
print("  max: ", int(patients_per_gene.max()))
gene_count.head(10)

# Distribution of patient counts per gene
patients_per_gene.hist(bins=200, figsize=(12, 4))


In [None]:
# The row labels (genes) of the top-gene matrix are used as the feature set.
feature_genes = top_gene_cancer_matrix.index


In [None]:
# Build one row per patient (case): the patient barcode, the cancer type, and
# a 0/1 flag for every feature gene telling whether that patient has at least
# one mutation in the gene.
cases = list()
grouped = mutations.groupby('bcr_patient_barcode')

cols = ['case_id', 'cancer_type'] + list(feature_genes)

for name, group in grouped:
    # The cancer type is read from the first row of the patient's group.
    case_cancer_type = group['cancer_type'].iloc[0]
    # Vectorized membership test: True where the feature gene is mutated in
    # this patient, cast to 0/1 in one step instead of element by element.
    gene_flags = feature_genes.isin(group['Hugo_Symbol'].unique()).astype(int)
    cases.append([name, case_cancer_type] + gene_flags.tolist())

# Passing the column names to the constructor also yields a correctly labelled
# (empty) frame when there are no cases at all.
cases_df = pd.DataFrame(cases, columns=cols)



In [None]:
# Non-null case ids in the assembled feature matrix
print("Number of rows in full dataset", cases_df['case_id'].count())

In [None]:
# Show the dimensions of the case/feature matrix, then a preview of its first rows.
cases_df.shape
cases_df.head()


In [None]:
# Write out transformed data to csv
# NOTE(review): no `index` argument is passed, so pandas' default applies and
# the frame's row index is written alongside the data columns; confirm that
# downstream readers of this file expect that extra leading column.
print("Writing case feature matrix to csv ...")
cases_df.to_csv("pancancer_case_features.csv")
print("done.")