# Notebook to format gene and clinical data for the TCGA LUAD Cohort

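Here we choose the GDC version of the data for consistency with CPTAC, although the PanCancer version might be better. (Note: the loading cell below currently reads the `luad_tcga_pan_can_atlas_2018` files — confirm which version is intended.)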

The goal of this notebook is to format the data as a simple table with one row per patient (`patient_id`) and one column per variable (gene mutation status + clinical data).

Things to do:
 - [ ] Select which cohorts to download
 - [ ] Download all the data from cBioPortal
 - [ ] Download the gene data from cBioPortal by querying the list in 'gene_query.txt'
 - [ ] Format the data

In [1]:
from pathlib import Path

import pandas as pd

In [30]:
# Load the patient-level clinical table of one LUAD study folder.
#
# NOTE(review): the root below is an absolute, machine-specific path; point it
# at your own copy of the raw data before running this notebook elsewhere.
TCGA_LUAD_DIR = Path("/home/val/workspaces/histotab/data/raw/TCGA_LUAD")

# Study folder to read. Folders that have been tried so far:
#   "luad_tcga_firehose_legacy", "luad_tcga_gdc", "luad_tcga_pan_can_atlas_2018"
COHORT = "luad_tcga_pan_can_atlas_2018"

# `comment="#"` makes pandas ignore lines that start with "#" (presumably the
# metadata header rows of the export — confirm). Be aware that it also cuts off
# the remainder of any data line that contains a "#".
df_clinical = pd.read_csv(
    TCGA_LUAD_DIR / COHORT / "data_clinical_patient.txt",
    sep="\t",
    comment="#",
    skip_blank_lines=True,
)

In [31]:
# Name of the clinical column that holds the morphology/histology code.
# NOTE(review): the variable name is a misspelling of "morphology"; it is kept
# as-is because other cells reference it under this exact name.
morhpology_col = "ICD_O_3_HISTOLOGY"
# Alternative column name, left disabled (presumably for a different study
# export — TODO confirm):
# morhpology_col = "MORPHOLOGY"

In [32]:
# Distinct morphology codes present in the clinical table (NaN included).
df_clinical[morhpology_col].unique()

array(['8140/3', '8255/3', '8550/3', '8480/3', '8260/3', '8310/3',
       '8252/3', '8253/3', '8230/3', '8507/3', '8250/3', '8490/3', nan],
      dtype=object)

In [33]:
# Lookup table: ICD-O-3 morphology code -> histologic subtype label.
# The keys are the codes listed by the `.unique()` call on the morphology
# column; insertion order is kept exactly as originally written.
histology_map = dict(
    (
        ("8140/3", "Adenocarcinoma, NOS"),
        ("8255/3", "Adenocarcinoma with mixed subtypes"),
        ("8260/3", "Papillary adenocarcinoma, NOS"),
        ("8550/3", "Acinar cell carcinoma"),
        ("8480/3", "Mucinous adenocarcinoma"),
        ("8310/3", "Clear cell adenocarcinoma, NOS"),
        ("8252/3", "Bronchiolo-alveolar carcinoma, non-mucinous"),
        ("8253/3", "Invasive mucinous adenocarcinoma"),
        ("8230/3", "Solid carcinoma, NOS"),
        ("8507/3", "Invasive micropapillary carcinoma"),
        ("8250/3", "Lepidic adenocarcinoma"),
        ("8490/3", "Signet ring cell carcinoma"),
    )
)

In [34]:
# Inspect the patients whose morphology code is 8250/3.
df_clinical.loc[
    df_clinical[morhpology_col] == "8250/3"
]

Unnamed: 0,PATIENT_ID,SUBTYPE,CANCER_TYPE_ACRONYM,OTHER_PATIENT_ID,AGE,SEX,AJCC_PATHOLOGIC_TUMOR_STAGE,AJCC_STAGING_EDITION,DAYS_LAST_FOLLOWUP,DAYS_TO_BIRTH,...,IN_PANCANPATHWAYS_FREEZE,OS_STATUS,OS_MONTHS,DSS_STATUS,DSS_MONTHS,DFS_STATUS,DFS_MONTHS,PFS_STATUS,PFS_MONTHS,GENETIC_ANCESTRY_LABEL
373,TCGA-86-7953,LUAD,LUAD,54775a66-08cc-4f38-98f2-e7b2b5cec994,69.0,Female,STAGE IA,7TH,997.0,-25315.0,...,Yes,0:LIVING,32.777723,0:ALIVE OR DEAD TUMOR FREE,32.777723,0:DiseaseFree,32.777723,0:CENSORED,32.777723,EUR
374,TCGA-86-7954,LUAD,LUAD,079ae0b3-b64b-4b8e-ab7d-225b8046568c,68.0,Female,STAGE IB,7TH,605.0,-25062.0,...,Yes,0:LIVING,19.890193,0:ALIVE OR DEAD TUMOR FREE,19.890193,0:DiseaseFree,19.890193,0:CENSORED,19.890193,EUR
379,TCGA-86-8073,LUAD,LUAD,cd902d08-215e-4bd0-88e4-4fd01ab43cbf,58.0,Male,STAGE IB,7TH,740.0,-21214.0,...,Yes,0:LIVING,24.328501,0:ALIVE OR DEAD TUMOR FREE,24.328501,0:DiseaseFree,24.328501,0:CENSORED,24.328501,EUR


In [35]:
# Translate each morphology code into its human-readable subtype label.
#
# `Series.map` with a dict yields NaN for every value that is not a key, so an
# unmapped code would be indistinguishable from a missing morphology value.
# Check up front that a NaN label can only come from a missing code.
observed_codes = set(df_clinical[morhpology_col].dropna())
unmapped_codes = observed_codes - set(histology_map)
assert not unmapped_codes, (
    f"Morphology codes missing from histology_map: {sorted(unmapped_codes, key=str)}"
)

df_clinical["HISTOLOGIC_SUBTYPE"] = df_clinical[morhpology_col].map(histology_map)

In [36]:
# Distinct subtype labels produced by the mapping (NaN included).
pd.unique(df_clinical["HISTOLOGIC_SUBTYPE"])

array(['Adenocarcinoma, NOS', 'Adenocarcinoma with mixed subtypes',
       'Acinar cell carcinoma', 'Mucinous adenocarcinoma',
       'Papillary adenocarcinoma, NOS', 'Clear cell adenocarcinoma, NOS',
       'Bronchiolo-alveolar carcinoma, non-mucinous',
       'Invasive mucinous adenocarcinoma', 'Solid carcinoma, NOS',
       'Invasive micropapillary carcinoma', 'Lepidic adenocarcinoma',
       'Signet ring cell carcinoma', nan], dtype=object)

In [37]:
# Number of patients per histologic subtype, most frequent first.
# `value_counts()` leaves NaN out by default, so patients without a subtype
# label are not represented in this table.
subtype_counts = (
    df_clinical["HISTOLOGIC_SUBTYPE"]
    .value_counts()
    .sort_values(ascending=False)
)

In [38]:
# Show the subtype frequency table (bare last expression -> rich display).
subtype_counts

HISTOLOGIC_SUBTYPE
Adenocarcinoma, NOS                            310
Adenocarcinoma with mixed subtypes             107
Acinar cell carcinoma                           24
Papillary adenocarcinoma, NOS                   21
Bronchiolo-alveolar carcinoma, non-mucinous     19
Mucinous adenocarcinoma                         14
Invasive mucinous adenocarcinoma                 5
Solid carcinoma, NOS                             5
Lepidic adenocarcinoma                           3
Invasive micropapillary carcinoma                3
Clear cell adenocarcinoma, NOS                   2
Signet ring cell carcinoma                       1
Name: count, dtype: int64