# EXPLORE DATA

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.stats import chi2_contingency

In [2]:
# Locate the project's data files relative to the notebook's working directory.
# The project root is taken to be the parent of the current working directory.
current_dir = os.path.abspath(os.getcwd())
PROJECT_DIR = Path(current_dir).parent
DATA_DIR = os.path.join(PROJECT_DIR, "data")

# Both LUAD tables sit side by side under <DATA_DIR>/LUAD; the paths stay plain strings.
EXPOSURE_PTH, CLINICAL_PTH = (
    os.path.join(DATA_DIR, "LUAD", table_file)
    for table_file in ("exposure.tsv", "clinical.tsv")
)

# Create DataFrames

In [3]:
# Read the tab-separated exposure table (read_table uses a tab separator by default).
# NOTE(review): the preview shows `'--` filling many cells; it is read here as a
# literal string, not as a missing value -- confirm whether it should map to NaN.
EXPOSURE_DF = pd.read_table(EXPOSURE_PTH)
EXPOSURE_DF.head()

Unnamed: 0,project.project_id,cases.case_id,cases.submitter_id,exposures.age_at_last_exposure,exposures.age_at_onset,exposures.alcohol_days_per_week,exposures.alcohol_drinks_per_day,exposures.alcohol_frequency,exposures.alcohol_history,exposures.alcohol_intensity,...,exposures.smoking_frequency,exposures.submitter_id,exposures.time_between_waking_and_first_smoke,exposures.tobacco_smoking_onset_year,exposures.tobacco_smoking_quit_year,exposures.tobacco_smoking_status,exposures.type_of_smoke_exposure,exposures.type_of_tobacco_used,exposures.use_per_day,exposures.years_smoked
0,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,TCGA-62-A471,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,2006,Current Reformed Smoker for < or = 15 yrs,'--,'--,'--,'--
1,TCGA-LUAD,009be09b-f9f6-43b7-8f45-4a648f8123ce,TCGA-67-3773,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,Current Reformed Smoker for > 15 yrs,'--,'--,'--,'--
2,TCGA-LUAD,01e9888d-b5b9-48f1-8ba6-8a89af108a04,TCGA-NJ-A7XG,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,Current Reformed Smoker for > 15 yrs,'--,'--,'--,'--
3,TCGA-LUAD,0232d299-4cdf-4fd7-9a5e-8d13c208b40c,TCGA-91-6848,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,2007,Current Reformed Smoker for < or = 15 yrs,'--,'--,'--,'--
4,TCGA-LUAD,028e99e9-5b9a-4954-bb6e-6d4709a3cea8,TCGA-55-6986,'--,'--,'--,'--,'--,'--,'--,...,'--,'--,'--,'--,'--,Lifelong Non-Smoker,'--,'--,'--,'--


In [4]:
# Read the tab-separated clinical table (read_table uses a tab separator by default).
# NOTE(review): the preview shows the same cases.case_id on several rows with
# different treatments.treatment_id values, so rows do not appear to be one-per-case
# -- confirm before computing per-patient statistics.
CLINICAL_DF = pd.read_table(CLINICAL_PTH)
CLINICAL_DF.head()

Unnamed: 0,project.project_id,cases.case_id,cases.consent_type,cases.days_to_consent,cases.days_to_lost_to_followup,cases.disease_type,cases.index_date,cases.lost_to_followup,cases.primary_site,cases.submitter_id,...,treatments.treatment_duration,treatments.treatment_effect,treatments.treatment_effect_indicator,treatments.treatment_frequency,treatments.treatment_id,treatments.treatment_intent_type,treatments.treatment_or_therapy,treatments.treatment_outcome,treatments.treatment_outcome_duration,treatments.treatment_type
0,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,6ed4e391-74f1-4a56-8fbe-b7a10081d85b,'--,yes,Complete Response,'--,Chemotherapy
1,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,7f648a1d-7e98-496e-afff-66cf765d49e0,'--,yes,Complete Response,'--,Chemotherapy
2,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,a5872c92-954a-5007-a67c-79c357655640,'--,yes,Complete Response,'--,Chemotherapy
3,TCGA-LUAD,0075437e-ba1a-46be-86d6-9773209a2b5e,Informed Consent,0,'--,Adenomas and Adenocarcinomas,Diagnosis,No,Bronchus and lung,TCGA-62-A471,...,'--,'--,'--,'--,d26f5579-57be-4e36-af37-304b01c797ad,Adjuvant,no,'--,'--,"Radiation Therapy, NOS"
4,TCGA-LUAD,009be09b-f9f6-43b7-8f45-4a648f8123ce,Informed Consent,50,'--,Adenomas and Adenocarcinomas,Diagnosis,'--,Bronchus and lung,TCGA-67-3773,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,'--


In [5]:
# Tally how often each treatment outcome label occurs (most frequent first).
CLINICAL_DF.loc[:, "treatments.treatment_outcome"].value_counts()

treatments.treatment_outcome
'--                    2074
Complete Response       152
Progressive Disease      98
Treatment Ongoing        42
Stable Disease           40
Unknown                  35
Partial Response         25
Name: count, dtype: int64

In [6]:
# Print every clinical column name on its own line to survey the available fields.
print(*CLINICAL_DF.columns, sep="\n")

project.project_id
cases.case_id
cases.consent_type
cases.days_to_consent
cases.days_to_lost_to_followup
cases.disease_type
cases.index_date
cases.lost_to_followup
cases.primary_site
cases.submitter_id
demographic.age_at_index
demographic.age_is_obfuscated
demographic.cause_of_death
demographic.cause_of_death_source
demographic.country_of_birth
demographic.country_of_residence_at_enrollment
demographic.days_to_birth
demographic.days_to_death
demographic.demographic_id
demographic.education_level
demographic.ethnicity
demographic.gender
demographic.marital_status
demographic.occupation_duration_years
demographic.population_group
demographic.premature_at_birth
demographic.race
demographic.submitter_id
demographic.vital_status
demographic.weeks_gestation_at_birth
demographic.year_of_birth
demographic.year_of_death
diagnoses.adrenal_hormone
diagnoses.age_at_diagnosis
diagnoses.ajcc_clinical_m
diagnoses.ajcc_clinical_n
diagnoses.ajcc_clinical_stage
diagnoses.ajcc_clinical_t
diagnoses.ajcc_p

In [7]:
# Walk the clinical column labels and emit one per line.
# NOTE(review): this cell repeats the column listing printed by the cell directly
# before it; consider deleting one of the two.
for column in CLINICAL_DF.columns.tolist():
    print(column)

project.project_id
cases.case_id
cases.consent_type
cases.days_to_consent
cases.days_to_lost_to_followup
cases.disease_type
cases.index_date
cases.lost_to_followup
cases.primary_site
cases.submitter_id
demographic.age_at_index
demographic.age_is_obfuscated
demographic.cause_of_death
demographic.cause_of_death_source
demographic.country_of_birth
demographic.country_of_residence_at_enrollment
demographic.days_to_birth
demographic.days_to_death
demographic.demographic_id
demographic.education_level
demographic.ethnicity
demographic.gender
demographic.marital_status
demographic.occupation_duration_years
demographic.population_group
demographic.premature_at_birth
demographic.race
demographic.submitter_id
demographic.vital_status
demographic.weeks_gestation_at_birth
demographic.year_of_birth
demographic.year_of_death
diagnoses.adrenal_hormone
diagnoses.age_at_diagnosis
diagnoses.ajcc_clinical_m
diagnoses.ajcc_clinical_n
diagnoses.ajcc_clinical_stage
diagnoses.ajcc_clinical_t
diagnoses.ajcc_p

In [13]:
# Cross-tabulate pathologic stage (rows) against treatment outcome (columns).
# pandas is already imported as `pd` in the notebook's import cell, so it is not
# re-imported here.
COL1 = 'treatments.treatment_outcome'
COL2 = 'diagnoses.ajcc_pathologic_stage'
df = CLINICAL_DF  # alias only; no copy is made

# NOTE(review): the `'--` placeholder is a regular string value in both columns, so it
# appears as its own row/column category and contributes to every percentage below.
# NOTE(review): counts are per row of CLINICAL_DF; if a case spans several rows
# (e.g. one per treatment), these are row-level shares, not patient-level -- confirm.

# Row percentages: within each stage, the share of rows per outcome (each row sums to 100).
contingency_pct = pd.crosstab(df[COL2], df[COL1], normalize='index') * 100

# Column percentages: within each outcome, the share of rows per stage (each column sums to 100).
contingency_pct_col = pd.crosstab(df[COL2], df[COL1], normalize='columns') * 100


In [16]:
# Display the column-normalised table: for each treatment outcome (column), the
# percentage of rows falling in each pathologic stage (row).
contingency_pct_col

treatments.treatment_outcome,'--,Complete Response,Partial Response,Progressive Disease,Stable Disease,Treatment Ongoing,Unknown
diagnoses.ajcc_pathologic_stage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
'--,53.375121,1.315789,0.0,0.0,0.0,4.761905,0.0
Stage I,0.867888,0.0,0.0,1.020408,0.0,0.0,0.0
Stage IA,12.92189,10.526316,16.0,9.183673,0.0,2.380952,5.714286
Stage IB,12.92189,17.105263,20.0,10.204082,12.5,21.428571,14.285714
Stage II,0.24108,0.0,0.0,0.0,0.0,0.0,0.0
Stage IIA,4.580521,19.078947,8.0,17.346939,0.0,16.666667,20.0
Stage IIB,6.123433,23.026316,4.0,22.44898,17.5,30.952381,17.142857
Stage III,0.096432,0.0,0.0,0.0,0.0,0.0,0.0
Stage IIIA,6.075217,22.368421,48.0,22.44898,42.5,9.52381,17.142857
Stage IIIB,0.675024,2.631579,0.0,1.020408,12.5,0.0,14.285714
