In [1]:
import pandas as pd
import scipy.cluster.hierarchy
from scipy import stats
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu, shapiro, chisquare
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
import requests

### Dados Clínicos

In [2]:
# Load 'blca_msk_tcga_2020_clinical_data.tsv' and shape it in a single chain:
#   1. read the tab-separated file, taking the first row as the header;
#   2. replace the spaces in every column label with underscores;
#   3. drop the columns considered unnecessary/redundant;
#   4. use 'Sample_ID' as the row index.
data = (
    pd.read_csv('blca_msk_tcga_2020_clinical_data.tsv', sep='\t', header=0)
    .rename(columns=lambda label: label.replace(' ', '_'))
    .drop(columns=['Patient_ID', 'Study_ID', 'Cancer_Type', 'Cancer_Type_Detailed', 'Oncotree_Code'])
    .set_index('Sample_ID')
)
data

Unnamed: 0_level_0,Diagnosis_Age,AJCC_Neoplasm_Disease_Stage,Analysis_Cohort,Aneuploidy_Score,Buffa_Hypoxia_Score,Cohort,Disease_Free_(Months),Disease_Free_Status,Distant_Metastasis,Ethnicity_Category,...,Race_Category,Radiation_Therapy,Ragnum_Hypoxia_Score,Number_of_Samples_Per_Patient,Sample_Type,Sex,Somatic_Status,TMB_(nonsynonymous),Tumor_Stage,Winter_Hypoxia_Score
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DS-bla-112,60.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Male,Matched,4.646910,T3,
DS-bla-113,82.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Female,Matched,18.587641,T3,
DS-bla-125,81.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Male,Matched,5.808638,T3,
DS-bla-126,82.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Male,Matched,8.132093,T3,
DS-tur-003-P1,57.0,,Discovery,,,Progression Series,,,,,...,,,,1,,Male,Matched,0.333333,T4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZF-AA56-01,79.0,STAGE III,Discovery,12.0,35.0,TCGA,,,Mx,Not Hispanic Or Latino,...,White,No,26.0,1,Primary,Female,Matched,3.433333,T4,44.0
TCGA-ZF-AA58-01,61.0,STAGE IV,Discovery,10.0,43.0,TCGA,,,Mx,Not Hispanic Or Latino,...,White,No,18.0,1,Primary,Female,Matched,6.866667,T3,48.0
TCGA-ZF-AA5H-01,60.0,STAGE IV,Discovery,20.0,45.0,TCGA,,,,Not Hispanic Or Latino,...,White,No,20.0,1,Primary,Female,Matched,1.866667,T3,56.0
TCGA-ZF-AA5N-01,62.0,STAGE IV,Discovery,3.0,39.0,TCGA,,,Yes,Not Hispanic Or Latino,...,White,No,10.0,1,Primary,Female,Matched,1.700000,T2,40.0


In [3]:
# Split the columns by dtype: object columns are handled as categorical,
# float64/int64 columns as numerical.
categorical_columns = data.select_dtypes(include='object').columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# One imputer per column family:
#   - categorical NaN values become the constant label "Unknown";
#   - numerical NaN values become the column median.
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
numerical_imputer = SimpleImputer(strategy='median')

# Fit each imputer on its own columns and write the filled values back into `data`.
data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])
data[numerical_columns] = numerical_imputer.fit_transform(data[numerical_columns])

# Show the updated DataFrame
print(data)

                 Diagnosis_Age AJCC_Neoplasm_Disease_Stage Analysis_Cohort  \
Sample_ID                                                                    
DS-bla-112                60.0                     Unknown       Discovery   
DS-bla-113                82.0                     Unknown       Discovery   
DS-bla-125                81.0                     Unknown       Discovery   
DS-bla-126                82.0                     Unknown       Discovery   
DS-tur-003-P1             57.0                     Unknown       Discovery   
...                        ...                         ...             ...   
TCGA-ZF-AA56-01           79.0                   STAGE III       Discovery   
TCGA-ZF-AA58-01           61.0                    STAGE IV       Discovery   
TCGA-ZF-AA5H-01           60.0                    STAGE IV       Discovery   
TCGA-ZF-AA5N-01           62.0                    STAGE IV       Discovery   
TCGA-ZF-AA5P-01           65.0                    STAGE IV      

In [4]:
# Object-dtype columns are the categorical ones
categorical_columns = data.select_dtypes(include='object').columns

# For every categorical column, print a heading, the frequency of each
# distinct value, and a blank separator.
for col in categorical_columns:
    counts = data[col].value_counts()
    print(f"Contagem de valores para a coluna: {col}")
    print(counts)
    print("\n")

Contagem de valores para a coluna: AJCC_Neoplasm_Disease_Stage
AJCC_Neoplasm_Disease_Stage
Unknown      177
STAGE III    103
STAGE IV     100
STAGE II      96
Name: count, dtype: int64


Contagem de valores para a coluna: Analysis_Cohort
Analysis_Cohort
Discovery     384
Validation     92
Name: count, dtype: int64


Contagem de valores para a coluna: Cohort
Cohort
TCGA                      300
Chemo NaÃ¯ve 12245 DMP     92
XRT MSK                    42
ERCC2 Platinum             34
Eur Urol 2013 BCa           4
Progression Series          2
Oscar Lin                   2
Name: count, dtype: int64


Contagem de valores para a coluna: Disease_Free_Status
Disease_Free_Status
Unknown                  329
0:DiseaseFree            121
1:Recurred/Progressed     26
Name: count, dtype: int64


Contagem de valores para a coluna: Distant_Metastasis
Distant_Metastasis
Unknown    280
Mx         147
Yes         49
Name: count, dtype: int64


Contagem de valores para a coluna: Ethnicity_Category
Ethni

In [6]:
data.isna().sum() # Count the missing values per column to check whether any column still has gaps

Diagnosis_Age                             0
AJCC_Neoplasm_Disease_Stage               0
Analysis_Cohort                           0
Aneuploidy_Score                          0
Buffa_Hypoxia_Score                       0
Cohort                                    0
Disease_Free_(Months)                     0
Disease_Free_Status                       0
Distant_Metastasis                        0
Ethnicity_Category                        0
Fraction_Genome_Altered                   0
Neoplasm_Histologic_Grade                 0
Lymph_Node_Status                         0
MSI_MANTIS_Score                          0
MSIsensor_Score                           0
Mutation_Count                            0
Overall_Survival_(Months)                 0
Overall_Survival_Status                   0
AJCC_Metastasis_Stage_Code                0
AJCC_Neoplasm_Disease_Lymph_Node_Stage    0
AJCC_Tumor_Stage_Code                     0
Progress_Free_Survival_(Months)           0
Progression_Free_Status         

In [7]:
data.info() # Check the column dtypes and other general information about the DataFrame

<class 'pandas.core.frame.DataFrame'>
Index: 476 entries, DS-bla-112 to TCGA-ZF-AA5P-01
Data columns (total 37 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Diagnosis_Age                           476 non-null    float64
 1   AJCC_Neoplasm_Disease_Stage             476 non-null    object 
 2   Analysis_Cohort                         476 non-null    object 
 3   Aneuploidy_Score                        476 non-null    float64
 4   Buffa_Hypoxia_Score                     476 non-null    float64
 5   Cohort                                  476 non-null    object 
 6   Disease_Free_(Months)                   476 non-null    float64
 7   Disease_Free_Status                     476 non-null    object 
 8   Distant_Metastasis                      476 non-null    object 
 9   Ethnicity_Category                      476 non-null    object 
 10  Fraction_Genome_Altered                 476 no

In [8]:
data.describe() # Check the descriptive statistics of the numerical columns

Unnamed: 0,Diagnosis_Age,Aneuploidy_Score,Buffa_Hypoxia_Score,Disease_Free_(Months),Fraction_Genome_Altered,MSI_MANTIS_Score,MSIsensor_Score,Mutation_Count,Overall_Survival_(Months),Progress_Free_Survival_(Months),Ragnum_Hypoxia_Score,Number_of_Samples_Per_Patient,TMB_(nonsynonymous),Winter_Hypoxia_Score
count,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,67.382439,13.722689,13.642857,23.99953,0.262737,0.324647,0.248992,181.403361,27.87603,21.355608,11.52521,1.0,9.070931,19.537815
std,10.724791,6.258871,14.259496,18.331331,0.188967,0.014189,1.137953,281.520236,26.391757,22.629639,8.272809,0.0,12.263071,19.555175
min,31.210959,0.0,-39.0,0.0,0.0,0.2236,0.0,1.0,0.0,0.0,-20.0,1.0,0.0,-38.0
25%,60.0,11.0,9.0,20.054575,0.096675,0.321775,0.0,22.0,12.098498,12.024526,10.0,1.0,2.8,11.5
50%,67.0,14.0,13.0,20.054575,0.24295,0.32545,0.05,107.0,18.575139,15.336818,13.0,1.0,5.872318,18.0
75%,76.0,16.0,19.0,20.054575,0.3944,0.329,0.16,227.0,34.83311,18.813492,16.0,1.0,10.765917,24.0
max,90.0,34.0,49.0,163.296841,0.9455,0.4143,22.7,3548.0,166.025578,163.296841,28.0,1.0,134.207791,76.0


### API cBioPortal

In [9]:
# Base URL of the cBioPortal API
BASE_URL = "https://www.cbioportal.org/api"


def get_clinical_metadata(study_id, timeout=30):
    """Fetch the clinical-attribute metadata of a specific cBioPortal study.

    Parameters
    ----------
    study_id : str
        Study identifier; it is interpolated into the
        ``{BASE_URL}/studies/{study_id}/clinical-attributes`` endpoint.
    timeout : float or tuple, optional
        Value forwarded to ``requests.get(..., timeout=...)`` so that an
        unresponsive server cannot block the notebook forever. Defaults to
        30 seconds.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        Raised by ``response.raise_for_status()`` for 4xx/5xx responses.
    """
    endpoint = f"{BASE_URL}/studies/{study_id}/clinical-attributes"
    response = requests.get(endpoint, timeout=timeout)
    # Raise on error statuses first; every other response is decoded, so the
    # function can no longer fall through and silently return None when the
    # status is a non-error code other than 200.
    response.raise_for_status()
    return response.json()

In [10]:
# Study whose clinical attributes are requested
study_id = 'blca_msk_tcga_2020'

# Download the clinical metadata of that study and report how many attributes came back
clinical_metadata = get_clinical_metadata(study_id)
print("Número de atributos clínicos:", len(clinical_metadata))

# Turn the downloaded records into a pandas DataFrame
metadata_df = pd.DataFrame(clinical_metadata)

# Preview the first metadata records
print("Metadados clínicos:")
print(metadata_df.head())

# List the identifiers of all clinical attributes
print("Lista de atributos clínicos:")
print(metadata_df['clinicalAttributeId'].to_list())

Número de atributos clínicos: 40
Metadados clínicos:
                   displayName  \
0                Diagnosis Age   
1  AJCC Neoplasm Disease Stage   
2              Analysis Cohort   
3             Aneuploidy Score   
4          Buffa Hypoxia Score   

                                         description datatype  \
0  Age at which a condition or disease was first ...   NUMBER   
1  The extent of a cancer, especially whether the...   STRING   
2                                    Analysis Cohort   STRING   
3                                   Aneuploidy Score   NUMBER   
4  Hypoxia scores based on the Buffa mRNA abundan...   NUMBER   

   patientAttribute priority          clinicalAttributeId             studyId  
0              True        1                          AGE  blca_msk_tcga_2020  
1              True        1  AJCC_PATHOLOGIC_TUMOR_STAGE  blca_msk_tcga_2020  
2             False        1              ANALYSIS_COHORT  blca_msk_tcga_2020  
3             False        1   

In [11]:
# Print every clinical attribute ID (the dataset columns) next to its description.
# The padding width is the length of the longest attribute ID. It is computed
# once, before the loop, instead of re-scanning the whole column on every row;
# `default=0` keeps the cell working when the metadata table is empty.
id_width = max((len(attribute_id) for attribute_id in metadata_df['clinicalAttributeId']), default=0)

# Walk the two needed columns in parallel rather than building a Series per row with iterrows().
for clinical_attribute_id, description in zip(metadata_df['clinicalAttributeId'], metadata_df['description']):
    print(f"{clinical_attribute_id.ljust(id_width)} \t {description}")

AGE                             	 Age at which a condition or disease was first diagnosed.
AJCC_PATHOLOGIC_TUMOR_STAGE     	 The extent of a cancer, especially whether the disease has spread from the original site to other parts of the body based on AJCC staging criteria.
ANALYSIS_COHORT                 	 Analysis Cohort
ANEUPLOIDY_SCORE                	 Aneuploidy Score
BUFFA_HYPOXIA_SCORE             	 Hypoxia scores based on the Buffa mRNA abundance signatures
CANCER_TYPE                     	 Cancer Type
CANCER_TYPE_DETAILED            	 Cancer Type Detailed
COHORT                          	 Cohort
DFS_MONTHS                      	 Disease free (months) since initial treatment.
DFS_STATUS                      	 Disease free status since initial treatment.
DISTANT_METS                    	 Distant Metastasis
ETHNICITY                       	 The text for reporting information about ethnicity.
FRACTION_GENOME_ALTERED         	 Fraction Genome Altered
GRADE                           	

## mRNA Seq

In [12]:
# Load data_mrna_seq_v2_rsem.txt and shape it in a single chain:
#   1. read the tab-separated file;
#   2. discard the rows whose 'Hugo_Symbol' is NaN;
#   3. drop the 'Entrez_Gene_Id' column;
#   4. use 'Hugo_Symbol' as the row index.
data_mrnaseq = (
    pd.read_csv('blca_msk_tcga_2020/data_mrna_seq_v2_rsem.txt', sep='\t')
    .dropna(subset=['Hugo_Symbol'])
    .drop(columns=['Entrez_Gene_Id'])
    .set_index('Hugo_Symbol')
)
data_mrnaseq

Unnamed: 0_level_0,TCGA-2F-A9KP-01,TCGA-2F-A9KQ-01,TCGA-2F-A9KR-01,TCGA-2F-A9KT-01,TCGA-2F-A9KW-01,TCGA-4Z-AA7O-01,TCGA-4Z-AA7Q-01,TCGA-4Z-AA7S-01,TCGA-4Z-AA7Y-01,TCGA-4Z-AA80-01,...,TCGA-ZF-AA4X-01,TCGA-ZF-AA51-01,TCGA-ZF-AA52-01,TCGA-ZF-AA53-01,TCGA-ZF-AA54-01,TCGA-ZF-AA56-01,TCGA-ZF-AA58-01,TCGA-ZF-AA5H-01,TCGA-ZF-AA5N-01,TCGA-ZF-AA5P-01
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UBE2Q2P2,12.5759,10.7742,42.5810,17.6427,2.3091,8.665,11.9205,13.7922,17.5126,14.1017,...,10.3812,5.2751,14.5465,6.8871,8.0213,7.8474,2.0911,1.1387,0.0000,4.0856
HMGB1P1,137.8860,104.6780,146.4530,142.9620,176.8770,179.199,96.9139,131.3010,175.6810,167.6800,...,58.4972,119.0880,91.4959,163.8500,83.2794,185.6730,210.1810,186.9380,155.3010,114.1130
RNU12-2P,0.8323,0.0000,2.3743,0.0000,0.0000,0.000,0.0000,0.4576,0.0000,0.0000,...,2.0010,0.0000,0.0000,0.0000,0.5061,0.0000,0.0000,0.0000,1.2270,0.5559
SSX9P,0.0000,0.0000,0.0000,0.0000,0.0000,0.000,7.2848,0.0000,0.0000,7.7482,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
EZHIP,0.0000,0.0000,1.8995,5.0876,0.5236,0.000,19.2053,0.0000,0.0000,2.4213,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.4216,0.6470,0.0000,4.4469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,17.0620,1.8544,1.8995,6.7835,51.3143,0.000,15.2318,3.6608,11.0142,5.8111,...,23.6118,11.3037,1.7668,36.5014,94.1296,2.2056,143.3390,0.3235,15.9509,13.3407
ZYG11B,1526.4300,1571.1600,748.8600,878.4620,1493.8700,870.490,615.8940,1285.4100,967.8750,876.0290,...,1311.4600,1003.7700,624.2640,431.1300,523.2790,697.8390,711.2140,671.9240,358.2820,887.7150
ZYX,3105.2900,3275.3800,4465.6200,3449.4100,2778.3000,4226.770,5065.5600,3836.5600,3156.9500,2389.8300,...,2098.6500,2724.9400,6375.7400,11268.6000,7956.4800,5666.0800,9911.0500,9212.5000,6306.7500,14836.0000
ZZEF1,1467.7500,750.5800,910.7880,850.1980,2811.2900,712.885,938.4110,842.4510,1086.7400,448.4260,...,1735.2700,666.1640,724.3820,337.4660,1099.7000,752.9770,745.3630,911.3190,850.3070,1030.0200


In [13]:
# Transpose the expression matrix so that samples become the rows and genes the columns.
# The index is still named 'Hugo_Symbol' only before this step has run, so the guard
# makes the cell idempotent: running it a second time no longer flips the frame back.
if data_mrnaseq.index.name == 'Hugo_Symbol':
    # Blank out the 'Hugo_Symbol' axis name, then transpose.
    data_mrnaseq = data_mrnaseq.rename_axis('').transpose()
data_mrnaseq

Unnamed: 0,UBE2Q2P2,HMGB1P1,RNU12-2P,SSX9P,EZHIP,EFCAB8,SRP14P1,TRIM75P,SPATA31B1P,REXO1L6P,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
TCGA-2F-A9KP-01,12.5759,137.886,0.8323,0.0,0.0000,4.5776,2.0807,0.4161,0.0,0.0,...,718.119,969.621,37.0370,316.688,1420.720,17.0620,1526.430,3105.29,1467.750,860.591
TCGA-2F-A9KQ-01,10.7742,104.678,0.0000,0.0,0.0000,0.4636,2.3180,0.0000,0.0,0.0,...,532.536,795.086,60.2689,378.303,873.435,1.8544,1571.160,3275.38,750.580,621.233
TCGA-2F-A9KR-01,42.5810,146.453,2.3743,0.0,1.8995,1.4246,3.7989,0.0000,0.0,0.0,...,616.848,568.412,45.5869,470.115,1924.150,1.8995,748.860,4465.62,910.788,935.481
TCGA-2F-A9KT-01,17.6427,142.962,0.0000,0.0,5.0876,2.2612,6.2182,0.0000,0.0,0.0,...,726.625,1087.050,17.5240,281.515,938.383,6.7835,878.462,3449.41,850.198,630.865
TCGA-2F-A9KW-01,2.3091,176.877,0.0000,0.0,0.5236,1.5708,5.7598,0.0000,0.0,0.0,...,421.714,2055.190,25.6571,244.005,1160.330,51.3143,1493.870,2778.30,2811.290,746.151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZF-AA56-01,7.8474,185.673,0.0000,0.0,0.0000,0.0000,3.0878,0.0000,0.0,0.0,...,551.566,1201.150,22.4967,220.997,745.038,2.2056,697.839,5666.08,752.977,619.321
TCGA-ZF-AA58-01,2.0911,210.181,0.0000,0.0,0.4216,1.2648,1.2648,0.0000,0.0,0.0,...,532.618,1059.440,37.5211,254.637,1037.100,143.3390,711.214,9911.05,745.363,548.904
TCGA-ZF-AA5H-01,1.1387,186.938,0.0000,0.0,0.6470,14.8813,2.9116,0.0000,0.0,0.0,...,520.105,1248.410,19.0869,186.663,498.200,0.3235,671.924,9212.50,911.319,1053.660
TCGA-ZF-AA5N-01,0.0000,155.301,1.2270,0.0,0.0000,2.4540,1.2270,0.0000,0.0,0.0,...,496.638,641.718,13.4969,121.472,534.969,15.9509,358.282,6306.75,850.307,380.368
