# In [2]:
import pandas as pd
# Read the two tab-separated input tables.
lawrence = pd.read_csv('lawrence_ST2.txt', sep='\t')
oncotree = pd.read_csv('../oncotree/oncotree_2018-06-01.txt', sep='\t')

# Output layout. 'ontology' and 'code' have no counterpart in the input table,
# so they start out empty here.
formatted_columns = [
    'sample',
    'tumor_type_tcga',
    'n_coding_mutations',
    'non-silent per Mb',
    'ontology',
    'code',
    'datatype',
]
df = pd.DataFrame(index=lawrence.index, columns=formatted_columns)

# Output column -> input column, for values that are copied over unchanged.
copied_columns = {
    'sample': 'name',
    'tumor_type_tcga': 'tumor_type',
    'n_coding_mutations': 'n_coding_mutations',
    'datatype': 'datatype',
}
for target, source in copied_columns.items():
    df[target] = lawrence[source]

# Scale the rate by 10**6 (presumably the input is per base, giving a
# per-megabase figure as the column name suggests -- confirm against the input).
df['non-silent per Mb'] = lawrence['coding_mutation_rate'] * 10**6

# Tumor type label used in the input table -> (ontology name, ontology code).
map_dict = {
    'Acute myeloid leukemia': ('Acute Myeloid Leukemia', 'AML'),
    'Bladder': ('Bladder Urothelial Carcinoma', 'BLCA'),
    'Breast': ('Invasive Breast Carcinoma', 'BRCA'),
    # Carcinoid: small intestine neuroendocrine tumors. Francis, J.M. et al.
    # Somatic mutation of CDKN1B in small intestine neuroendocrine tumors.
    # Nat Genet (2013).
    'Carcinoid': ('Small Bowel Well-Differentiated Neuroendocrine Tumor', 'SBWDNET'),
    'Cervical': ('Cervical Squamous Cell Carcinoma', 'CESC'),
    'Chronic lymphocytic leukemia': (
        'Chronic Lymphocytic Leukemia/Small Lymphocytic Lymphoma',
        'CLLSLL',
    ),
    'Colorectal': ('Colorectal Adenocarcinoma', 'COADREAD'),
    'Diffuse large B-cell lymphoma': ('Diffuse Large B-Cell Lymphoma, NOS', 'DLBCLNOS'),
    'Esophageal adenocarcinoma': ('Esophageal Adenocarcinoma', 'ESCA'),
    'Ewing Sarcoma': ('Ewing Sarcoma', 'ES'),
    'Glioblastoma multiforme': ('Glioblastoma Multiforme', 'GBM'),
    'Head and neck': ('Head and Neck Squamous Cell Carcinoma', 'HNSC'),
    'Kidney clear cell': ('Renal Clear Cell Carcinoma', 'CCRCC'),
    'Kidney papillary cell': ('Papillary Renal Cell Carcinoma', 'PRCC'),
    'Low-grade glioma': ('Low-Grade Glioma, NOS', 'LGGNOS'),
    'Lung adenocarcinoma': ('Lung Adenocarcinoma', 'LUAD'),
    'Lung squamous cell carcinoma': ('Lung Squamous Cell Carcinoma', 'LUSC'),
    'Medulloblastoma': ('Medulloblastoma', 'MBL'),
    'Melanoma': ('Cutaneous Melanoma', 'SKCM'),
    'Multiple myeloma': ('Plasma Cell Myeloma', 'PCM'),
    'Neuroblastoma': ('Neuroblastoma', 'NBL'),
    'Ovarian': ('Serous Ovarian Cancer', 'SOC'),
    'Pancreas': ('Pancreatic Adenocarcinoma', 'PAAD'),
    'Prostate': ('Prostate Adenocarcinoma', 'PRAD'),
    # Rhabdoid tumor: TCGA had 20 brain, 3 kidney, and 9 soft tissue
    # (https://www.jci.org/articles/view/64400). NOTE(review): presumably
    # mapped to the brain entity because that is the majority site -- confirm.
    'Rhabdoid tumor': ('Atypical Teratoid/Rhabdoid Tumor', 'ATRT'),
    'Stomach': ('Stomach Adenocarcinoma', 'STAD'),
    'Thyroid': ('Papillary Thyroid Cancer', 'THPA'),
}

# Fill 'ontology' and 'code' from map_dict, keyed on the tumor type label.
# Validate first so that every label without a mapping (missing values
# included) is reported in one KeyError, rather than failing on the first
# unmapped label after part of the frame has already been written.
unmapped = [label for label in df['tumor_type_tcga'].unique() if label not in map_dict]
if unmapped:
    raise KeyError('No entry in map_dict for tumor type(s): {}'.format(unmapped))

# Series.map performs one vectorized lookup per output column, replacing a
# boolean-mask scan of the whole frame for each distinct tumor type.
df['ontology'] = df['tumor_type_tcga'].map(
    {label: pair[0] for label, pair in map_dict.items()}
)
df['code'] = df['tumor_type_tcga'].map(
    {label: pair[1] for label, pair in map_dict.items()}
)

# Write the mapped table as tab-separated text, leaving out the index column.
outname = 'lawrence_mapped_ontology.txt'
df.to_csv(path_or_buf=outname, sep='\t', index=False)