In [2]:
import pandas as pd
import numpy as np
import pyreadr
import json

In [3]:
#load expression, genes and samples data
# Single place to edit when the data lives elsewhere; every path below derives from it.
DATA_DIR = "C:/Users/joann/Desktop/M2/Deep_Learning/data"

expression = pd.read_csv(f"{DATA_DIR}/gbm_expression.csv")
genes = pd.read_csv(f"{DATA_DIR}/gbm_genes.csv")
# read_r returns a mapping of R object name -> DataFrame (not a DataFrame itself);
# the frame is pulled out of it in a later cell.
samples = pyreadr.read_r(f"{DATA_DIR}/gbm_samples.rds")


In [4]:
# `samples` starts out as the mapping returned by pyreadr.read_r and is rebound
# to the DataFrame it contains. Unwrap only while it is still that mapping:
# on a re-run `samples` is already a DataFrame, where .keys() yields the column
# labels and samples[key] would silently collapse the table to its first column.
if isinstance(samples, dict):
    print(samples.keys())

    #extract the dataframe from the dictionary
    key = next(iter(samples.keys()))
    samples = samples[key]

print(type(samples))
print(samples.head())


odict_keys([None])
<class 'pandas.core.frame.DataFrame'>
                                                   barcode       patient  \
rownames                                                                   
TCGA-06-6390-01A-11R-A96S-41  TCGA-06-6390-01A-11R-A96S-41  TCGA-06-6390   
TCGA-06-5411-01A-01R-1849-01  TCGA-06-5411-01A-01R-1849-01  TCGA-06-5411   
TCGA-06-5411-01A-01R-A96S-41  TCGA-06-5411-01A-01R-A96S-41  TCGA-06-5411   
TCGA-12-3648-01A-01R-A96T-41  TCGA-12-3648-01A-01R-A96T-41  TCGA-12-3648   
TCGA-06-A7TK-01A-21R-A96S-41  TCGA-06-A7TK-01A-21R-A96S-41  TCGA-06-A7TK   

                                        sample shortLetterCode  \
rownames                                                         
TCGA-06-6390-01A-11R-A96S-41  TCGA-06-6390-01A              TP   
TCGA-06-5411-01A-01R-1849-01  TCGA-06-5411-01A              TP   
TCGA-06-5411-01A-01R-A96S-41  TCGA-06-5411-01A              TP   
TCGA-12-3648-01A-01R-A96T-41  TCGA-12-3648-01A              TP   
TCGA-06-A7TK-0

In [5]:
# Overview of the samples metadata: dimensions first, then every column name
n_rows, n_cols = samples.shape
print((n_rows, n_cols))
# Last expression of the cell, so the list of column names is shown as its output
samples.columns.tolist()


(391, 106)


['barcode',
 'patient',
 'sample',
 'shortLetterCode',
 'definition',
 'sample_submitter_id',
 'intermediate_dimension',
 'tumor_descriptor',
 'sample_id',
 'pathology_report_uuid',
 'submitter_id',
 'shortest_dimension',
 'sample_type',
 'specimen_type',
 'longest_dimension',
 'days_to_collection',
 'state',
 'initial_weight',
 'tissue_type',
 'preservation_method',
 'morphology',
 'days_to_diagnosis',
 'treatments',
 'tissue_or_organ_of_origin',
 'age_at_diagnosis',
 'primary_diagnosis',
 'classification_of_tumor',
 'tumor_of_origin',
 'diagnosis_id',
 'site_of_resection_or_biopsy',
 'prior_treatment',
 'diagnosis_is_primary_disease',
 'synchronous_malignancy',
 'days_to_last_follow_up',
 'prior_malignancy',
 'year_of_diagnosis',
 'method_of_diagnosis',
 'icd_10_code',
 'race',
 'gender',
 'ethnicity',
 'vital_status',
 'age_at_index',
 'days_to_birth',
 'demographic_id',
 'age_is_obfuscated',
 'days_to_death',
 'country_of_residence_at_enrollment',
 'bcr_patient_barcode',
 'primary_

In [6]:
# Number of missing values in each metadata column
missing_mask = samples.isna()
samples_na = missing_mask.sum()
print(samples_na)


barcode                                                  0
patient                                                  0
sample                                                   0
shortLetterCode                                          0
definition                                               0
                                                      ... 
paper_Supervised.DNA.Methylation.Cluster               129
paper_Random.Forest.Sturm.Cluster                      298
paper_RPPA.cluster                                     231
paper_Telomere.length.estimate.in.blood.normal..Kb.    325
paper_Telomere.length.estimate.in.tumor..Kb.           325
Length: 106, dtype: int64


In [7]:
# Lower-case substrings used to sort metadata columns into rough thematic groups.
# Matching is plain substring containment, so one column can land in several groups.
patterns = {
    "survival_or_time": ["days", "death", "follow", "surviv", "vital", "progress", "recurr"],
    "demographics": ["age", "gender", "sex", "race", "ethnic"],
    "tumor_clinical": ["grade", "stage", "diagnos", "tumor", "histolog", "subtype", "idh", "mgmt"],
    "treatment": ["therapy", "radiat", "chemo", "temoz", "treatment", "drug"],
    "technical_batch": ["center", "plate", "batch", "analyte", "aliquot", "portion", "tss"],
}

for group, keys in patterns.items():
    hits = []
    for column in samples.columns:
        lowered = column.lower()
        # keep the column as soon as any keyword of this group occurs in its name
        if any(fragment in lowered for fragment in keys):
            hits.append(column)
    # show at most the first 30 matches per group
    print(group, "->", hits[:30])


survival_or_time -> ['days_to_collection', 'days_to_diagnosis', 'days_to_last_follow_up', 'vital_status', 'days_to_birth', 'days_to_death', 'paper_Survival..months.', 'paper_Vital.status..1.dead.']
demographics -> ['age_at_diagnosis', 'race', 'gender', 'ethnicity', 'age_at_index', 'age_is_obfuscated', 'paper_Age..years.at.diagnosis.', 'paper_Gender']
tumor_clinical -> ['tumor_descriptor', 'days_to_diagnosis', 'age_at_diagnosis', 'primary_diagnosis', 'classification_of_tumor', 'tumor_of_origin', 'diagnosis_id', 'diagnosis_is_primary_disease', 'year_of_diagnosis', 'method_of_diagnosis', 'paper_Histology', 'paper_Grade', 'paper_Age..years.at.diagnosis.', 'paper_IDH.status', 'paper_IDH.codel.subtype', 'paper_MGMT.promoter.status', 'paper_Original.Subtype', 'paper_Transcriptome.Subtype', 'paper_IDH.specific.RNA.Expression.Cluster', 'paper_IDH.specific.DNA.Methylation.Cluster', 'paper_Telomere.length.estimate.in.tumor..Kb.']
treatment -> ['treatments', 'prior_treatment']
technical_batch -> [

In [8]:
# Dimensions of the expression matrix and of the gene annotation table, one per line
print(expression.shape, genes.shape, sep="\n")

(60660, 392)
(60660, 10)


In [10]:
#compare sample IDs between the expression columns and the metadata (diagnose dimension mismatch)
expr_ids = expression.columns.astype(str)
meta_ids = samples["sample_id"].astype(str)

# set differences in both directions: IDs present on one side only
missing_in_meta = sorted(set(expr_ids) - set(meta_ids))
missing_in_expr = sorted(set(meta_ids) - set(expr_ids))

print("Missing in metadata (expr-only):", len(missing_in_meta))
print(missing_in_meta[:20])

print("Missing in expression (meta-only):", len(missing_in_expr))
print(missing_in_expr[:20])

#there are actually 391 samples: the extra expression column is gene_id, which was being counted as a sample
#move gene_id into the index so that only sample columns remain.
#Guarded so the cell can be re-run: once gene_id is the index it is no longer a column,
#and calling set_index("gene_id") again would raise a KeyError.
if "gene_id" in expression.columns:
    expression = expression.set_index("gene_id")
print(expression.shape)

Missing in metadata (expr-only): 1
['gene_id']
Missing in expression (meta-only): 0
[]
(60660, 391)
