In [2]:
import numpy as np
import pandas as pd

## Investigating the available clinical data

In this script, we will explore the clinical information available from the TCGA glioblastoma (GBM) transcriptome profiling (391 files). 
We aim to understand which clinical metadata is publically available to guide our analysis. 

In [3]:
# loading data
clinical = pd.read_csv("../data/clinical_gbm.csv")
clinical.head()

Unnamed: 0,id,case_id,submitter_id,diagnoses,demographic.race,demographic.gender,demographic.ethnicity
0,025a7401-a65d-4ea0-8b4e-0ba775b0322a,025a7401-a65d-4ea0-8b4e-0ba775b0322a,TCGA-12-0819,"[{'age_at_diagnosis': 18588, 'primary_diagnosi...",black or african american,female,not hispanic or latino
1,e3711a9b-6d4c-44df-bbab-0a675046a5df,e3711a9b-6d4c-44df-bbab-0a675046a5df,TCGA-06-0208,"[{'age_at_diagnosis': 19257, 'primary_diagnosi...",white,female,not hispanic or latino
2,cc1459be-de8f-482e-9efe-65937db9dc45,cc1459be-de8f-482e-9efe-65937db9dc45,TCGA-12-1601,"[{'days_to_last_follow_up': None, 'age_at_diag...",not reported,not reported,not reported
3,d75996d6-9f02-4478-a4a1-dfa7ab41de77,d75996d6-9f02-4478-a4a1-dfa7ab41de77,TCGA-06-0131,"[{'days_to_last_follow_up': None, 'age_at_diag...",not reported,not reported,not reported
4,883dc176-925a-44f7-9ec8-e0cce33c2a54,883dc176-925a-44f7-9ec8-e0cce33c2a54,TCGA-02-0099,"[{'days_to_last_follow_up': 106.0, 'age_at_dia...",white,male,not hispanic or latino


In [5]:
# column submitter_id
print(clinical["submitter_id"].unique())
print(len(clinical["submitter_id"]))

['TCGA-12-0819' 'TCGA-06-0208' 'TCGA-12-1601' 'TCGA-06-0131'
 'TCGA-02-0099' 'TCGA-02-2470' 'TCGA-02-0024' 'TCGA-06-0154'
 'TCGA-27-1838' 'TCGA-06-0210' 'TCGA-06-0128' 'TCGA-06-0133'
 'TCGA-14-1451' 'TCGA-06-0124' 'TCGA-14-3476' 'TCGA-02-0028'
 'TCGA-06-0145' 'TCGA-06-0130' 'TCGA-06-0129' 'TCGA-06-0127'
 'TCGA-16-1045' 'TCGA-02-0114' 'TCGA-12-0615' 'TCGA-27-2526'
 'TCGA-06-0211' 'TCGA-15-1449' 'TCGA-06-0126' 'TCGA-06-6391'
 'TCGA-06-0189' 'TCGA-06-0147' 'TCGA-06-0125' 'TCGA-06-0132'
 'TCGA-14-1454' 'TCGA-02-0107' 'TCGA-32-2495' 'TCGA-06-0148'
 'TCGA-06-0139' 'TCGA-32-2498' 'TCGA-02-0058' 'TCGA-06-0119'
 'TCGA-12-0653' 'TCGA-06-0209' 'TCGA-28-2512' 'TCGA-02-0010'
 'TCGA-12-0662' 'TCGA-06-0159' 'TCGA-19-5953' 'TCGA-06-0176'
 'TCGA-02-0089' 'TCGA-16-1047' 'TCGA-02-0021' 'TCGA-06-0744'
 'TCGA-02-0023' 'TCGA-4W-AA9S' 'TCGA-06-2557' 'TCGA-12-0769'
 'TCGA-06-0237' 'TCGA-02-0001' 'TCGA-06-0121' 'TCGA-26-1442'
 'TCGA-14-0865' 'TCGA-19-5956' 'TCGA-06-6388' 'TCGA-19-2619'
 'TCGA-19-5958' 'TCGA-06

In [8]:
# column diagnoses 
print(clinical["diagnoses"][0])
print(clinical["diagnoses"][1])
print(clinical["diagnoses"][2])

[{'age_at_diagnosis': 18588, 'primary_diagnosis': 'Glioblastoma', 'morphology': '9440/3'}, {'days_to_last_follow_up': 754.0, 'age_at_diagnosis': 18160, 'primary_diagnosis': 'Glioblastoma', 'morphology': '9440/3', 'tumor_grade': None}]
[{'age_at_diagnosis': 19257, 'primary_diagnosis': 'Glioblastoma', 'morphology': '9440/3'}, {'days_to_last_follow_up': 256.0, 'age_at_diagnosis': 19108, 'primary_diagnosis': 'Glioblastoma', 'morphology': '9440/3', 'tumor_grade': None}]
[{'days_to_last_follow_up': None, 'age_at_diagnosis': None, 'primary_diagnosis': 'Glioblastoma', 'morphology': '9440/3', 'tumor_grade': 'Not Reported'}]


In [10]:
# column demographic race 
print(clinical["demographic.race"].unique())
print(clinical["demographic.race"].value_counts())

['black or african american' 'white' 'not reported' 'asian' nan]
demographic.race
white                        507
black or african american     51
not reported                  29
asian                         13
Name: count, dtype: int64


In [11]:
# column demographic gender
print(clinical["demographic.gender"].unique())
print(clinical["demographic.gender"].value_counts())

['female' 'not reported' 'male' nan]
demographic.gender
male            366
female          230
not reported      4
Name: count, dtype: int64


In [12]:
# column demographic ethnicity 
print(clinical["demographic.ethnicity"].unique())
print(clinical["demographic.ethnicity"].value_counts())

['not hispanic or latino' 'not reported' nan 'hispanic or latino']
demographic.ethnicity
not hispanic or latino    490
not reported               97
hispanic or latino         13
Name: count, dtype: int64
