In [1]:
import pandas as pd
import scipy.cluster.hierarchy
from scipy import stats
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
from scipy.stats import ttest_ind, mannwhitneyu, shapiro, chisquare
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
import requests

### Dados Clínicos

In [2]:
# Load the clinical table 'blca_msk_tcga_2020_clinical_data.tsv' (tab-separated, header on the first row)
# and tidy it in a single chain:
#   1. replace spaces in the column names with underscores,
#   2. drop the identifier/cancer-type columns listed below (treated as unnecessary/redundant here),
#   3. use 'Sample_ID' as the row index.
# NOTE(review): the recorded value counts further down show the Cohort label 'Chemo NaÃ¯ve 12245 DMP',
# which looks like mis-decoded UTF-8 — confirm the file's encoding and pass `encoding=` if needed.
data = (
    pd.read_csv('blca_msk_tcga_2020_clinical_data.tsv', sep='\t', header=0)
    .rename(columns=lambda name: name.replace(' ', '_'))
    .drop(columns=['Patient_ID', 'Study_ID', 'Cancer_Type', 'Cancer_Type_Detailed', 'Oncotree_Code'])
    .set_index('Sample_ID')
)

data

Unnamed: 0_level_0,Diagnosis_Age,AJCC_Neoplasm_Disease_Stage,Analysis_Cohort,Aneuploidy_Score,Buffa_Hypoxia_Score,Cohort,Disease_Free_(Months),Disease_Free_Status,Distant_Metastasis,Ethnicity_Category,...,Race_Category,Radiation_Therapy,Ragnum_Hypoxia_Score,Number_of_Samples_Per_Patient,Sample_Type,Sex,Somatic_Status,TMB_(nonsynonymous),Tumor_Stage,Winter_Hypoxia_Score
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DS-bla-112,60.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Male,Matched,4.646910,T3,
DS-bla-113,82.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Female,Matched,18.587641,T3,
DS-bla-125,81.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Male,Matched,5.808638,T3,
DS-bla-126,82.0,,Discovery,,,Eur Urol 2013 BCa,,,,,...,,,,1,,Male,Matched,8.132093,T3,
DS-tur-003-P1,57.0,,Discovery,,,Progression Series,,,,,...,,,,1,,Male,Matched,0.333333,T4,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZF-AA56-01,79.0,STAGE III,Discovery,12.0,35.0,TCGA,,,Mx,Not Hispanic Or Latino,...,White,No,26.0,1,Primary,Female,Matched,3.433333,T4,44.0
TCGA-ZF-AA58-01,61.0,STAGE IV,Discovery,10.0,43.0,TCGA,,,Mx,Not Hispanic Or Latino,...,White,No,18.0,1,Primary,Female,Matched,6.866667,T3,48.0
TCGA-ZF-AA5H-01,60.0,STAGE IV,Discovery,20.0,45.0,TCGA,,,,Not Hispanic Or Latino,...,White,No,20.0,1,Primary,Female,Matched,1.866667,T3,56.0
TCGA-ZF-AA5N-01,62.0,STAGE IV,Discovery,3.0,39.0,TCGA,,,Yes,Not Hispanic Or Latino,...,White,No,10.0,1,Primary,Female,Matched,1.700000,T2,40.0


In [3]:
# Split the columns by dtype up front, before anything is modified
categorical_columns = data.select_dtypes(include=['object']).columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

# One imputer per column family:
#   - categorical columns: missing values become the literal "Unknown"
#   - numerical columns:   missing values become the column median
# NOTE(review): the median fill also applies to time-to-event columns; in the describe() output below
# Disease_Free_(Months) has identical 25%/50%/75% quartiles (20.054575) — confirm this is intended
# before using those columns in any downstream analysis.
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
numerical_imputer = SimpleImputer(strategy='median')

# Fit and apply each imputer to its own columns (categorical first, then numerical), writing back into `data`
for columns, imputer in (
    (categorical_columns, categorical_imputer),
    (numerical_columns, numerical_imputer),
):
    data[columns] = imputer.fit_transform(data[columns])

data


Unnamed: 0_level_0,Diagnosis_Age,AJCC_Neoplasm_Disease_Stage,Analysis_Cohort,Aneuploidy_Score,Buffa_Hypoxia_Score,Cohort,Disease_Free_(Months),Disease_Free_Status,Distant_Metastasis,Ethnicity_Category,...,Race_Category,Radiation_Therapy,Ragnum_Hypoxia_Score,Number_of_Samples_Per_Patient,Sample_Type,Sex,Somatic_Status,TMB_(nonsynonymous),Tumor_Stage,Winter_Hypoxia_Score
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DS-bla-112,60.0,Unknown,Discovery,14.0,13.0,Eur Urol 2013 BCa,20.054575,Unknown,Unknown,Unknown,...,Unknown,Unknown,13.0,1.0,Unknown,Male,Matched,4.646910,T3,18.0
DS-bla-113,82.0,Unknown,Discovery,14.0,13.0,Eur Urol 2013 BCa,20.054575,Unknown,Unknown,Unknown,...,Unknown,Unknown,13.0,1.0,Unknown,Female,Matched,18.587641,T3,18.0
DS-bla-125,81.0,Unknown,Discovery,14.0,13.0,Eur Urol 2013 BCa,20.054575,Unknown,Unknown,Unknown,...,Unknown,Unknown,13.0,1.0,Unknown,Male,Matched,5.808638,T3,18.0
DS-bla-126,82.0,Unknown,Discovery,14.0,13.0,Eur Urol 2013 BCa,20.054575,Unknown,Unknown,Unknown,...,Unknown,Unknown,13.0,1.0,Unknown,Male,Matched,8.132093,T3,18.0
DS-tur-003-P1,57.0,Unknown,Discovery,14.0,13.0,Progression Series,20.054575,Unknown,Unknown,Unknown,...,Unknown,Unknown,13.0,1.0,Unknown,Male,Matched,0.333333,T4,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-ZF-AA56-01,79.0,STAGE III,Discovery,12.0,35.0,TCGA,20.054575,Unknown,Mx,Not Hispanic Or Latino,...,White,No,26.0,1.0,Primary,Female,Matched,3.433333,T4,44.0
TCGA-ZF-AA58-01,61.0,STAGE IV,Discovery,10.0,43.0,TCGA,20.054575,Unknown,Mx,Not Hispanic Or Latino,...,White,No,18.0,1.0,Primary,Female,Matched,6.866667,T3,48.0
TCGA-ZF-AA5H-01,60.0,STAGE IV,Discovery,20.0,45.0,TCGA,20.054575,Unknown,Unknown,Not Hispanic Or Latino,...,White,No,20.0,1.0,Primary,Female,Matched,1.866667,T3,56.0
TCGA-ZF-AA5N-01,62.0,STAGE IV,Discovery,3.0,39.0,TCGA,20.054575,Unknown,Yes,Not Hispanic Or Latino,...,White,No,10.0,1.0,Primary,Female,Matched,1.700000,T2,40.0


In [4]:
# Columns holding categorical (object-dtype) values
categorical_columns = data.select_dtypes(include=['object']).columns

# For every categorical column, print a header line, the frequency of each value, and a blank separator
for col in categorical_columns:
    counts = data[col].value_counts()
    print(f"Value counts for column: {col}", counts, "\n", sep="\n")

Value counts for column: AJCC_Neoplasm_Disease_Stage
AJCC_Neoplasm_Disease_Stage
Unknown      177
STAGE III    103
STAGE IV     100
STAGE II      96
Name: count, dtype: int64


Value counts for column: Analysis_Cohort
Analysis_Cohort
Discovery     384
Validation     92
Name: count, dtype: int64


Value counts for column: Cohort
Cohort
TCGA                      300
Chemo NaÃ¯ve 12245 DMP     92
XRT MSK                    42
ERCC2 Platinum             34
Eur Urol 2013 BCa           4
Progression Series          2
Oscar Lin                   2
Name: count, dtype: int64


Value counts for column: Disease_Free_Status
Disease_Free_Status
Unknown                  329
0:DiseaseFree            121
1:Recurred/Progressed     26
Name: count, dtype: int64


Value counts for column: Distant_Metastasis
Distant_Metastasis
Unknown    280
Mx         147
Yes         49
Name: count, dtype: int64


Value counts for column: Ethnicity_Category
Ethnicity_Category
Not Hispanic Or Latino                    274


In [5]:
# Count the missing values in each column; the columns imputed in the earlier step are expected to report 0.
data.isna().sum() # Per-column NaN count

Diagnosis_Age                             0
AJCC_Neoplasm_Disease_Stage               0
Analysis_Cohort                           0
Aneuploidy_Score                          0
Buffa_Hypoxia_Score                       0
Cohort                                    0
Disease_Free_(Months)                     0
Disease_Free_Status                       0
Distant_Metastasis                        0
Ethnicity_Category                        0
Fraction_Genome_Altered                   0
Neoplasm_Histologic_Grade                 0
Lymph_Node_Status                         0
MSI_MANTIS_Score                          0
MSIsensor_Score                           0
Mutation_Count                            0
Overall_Survival_(Months)                 0
Overall_Survival_Status                   0
AJCC_Metastasis_Stage_Code                0
AJCC_Neoplasm_Disease_Lymph_Node_Stage    0
AJCC_Tumor_Stage_Code                     0
Progress_Free_Survival_(Months)           0
Progression_Free_Status         

In [6]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
data.info() # Check column types

<class 'pandas.core.frame.DataFrame'>
Index: 476 entries, DS-bla-112 to TCGA-ZF-AA5P-01
Data columns (total 37 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Diagnosis_Age                           476 non-null    float64
 1   AJCC_Neoplasm_Disease_Stage             476 non-null    object 
 2   Analysis_Cohort                         476 non-null    object 
 3   Aneuploidy_Score                        476 non-null    float64
 4   Buffa_Hypoxia_Score                     476 non-null    float64
 5   Cohort                                  476 non-null    object 
 6   Disease_Free_(Months)                   476 non-null    float64
 7   Disease_Free_Status                     476 non-null    object 
 8   Distant_Metastasis                      476 non-null    object 
 9   Ethnicity_Category                      476 non-null    object 
 10  Fraction_Genome_Altered                 476 no

In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): these figures are computed on the frame after the median imputation above,
# so they include the filled-in values and not only the observed ones.
data.describe() # Check statistics

Unnamed: 0,Diagnosis_Age,Aneuploidy_Score,Buffa_Hypoxia_Score,Disease_Free_(Months),Fraction_Genome_Altered,MSI_MANTIS_Score,MSIsensor_Score,Mutation_Count,Overall_Survival_(Months),Progress_Free_Survival_(Months),Ragnum_Hypoxia_Score,Number_of_Samples_Per_Patient,TMB_(nonsynonymous),Winter_Hypoxia_Score
count,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0,476.0
mean,67.382439,13.722689,13.642857,23.99953,0.262737,0.324647,0.248992,181.403361,27.87603,21.355608,11.52521,1.0,9.070931,19.537815
std,10.724791,6.258871,14.259496,18.331331,0.188967,0.014189,1.137953,281.520236,26.391757,22.629639,8.272809,0.0,12.263071,19.555175
min,31.210959,0.0,-39.0,0.0,0.0,0.2236,0.0,1.0,0.0,0.0,-20.0,1.0,0.0,-38.0
25%,60.0,11.0,9.0,20.054575,0.096675,0.321775,0.0,22.0,12.098498,12.024526,10.0,1.0,2.8,11.5
50%,67.0,14.0,13.0,20.054575,0.24295,0.32545,0.05,107.0,18.575139,15.336818,13.0,1.0,5.872318,18.0
75%,76.0,16.0,19.0,20.054575,0.3944,0.329,0.16,227.0,34.83311,18.813492,16.0,1.0,10.765917,24.0
max,90.0,34.0,49.0,163.296841,0.9455,0.4143,22.7,3548.0,166.025578,163.296841,28.0,1.0,134.207791,76.0


### API cBioPortal

In [8]:
# Base URL of the cBioPortal REST API
BASE_URL = "https://www.cbioportal.org/api"


def get_clinical_metadata(study_id, timeout=30):
    """Fetch the clinical-attribute metadata of one cBioPortal study.

    Parameters
    ----------
    study_id : str
        Study identifier; it is interpolated into the request path
        ``{BASE_URL}/studies/{study_id}/clinical-attributes``.
    timeout : float or tuple, optional
        Passed to ``requests.get`` as its ``timeout`` argument so an
        unresponsive server cannot block the notebook indefinitely.
        Defaults to 30.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        Raised by ``response.raise_for_status()`` when the server answers
        with an error status.
    """
    endpoint = f"{BASE_URL}/studies/{study_id}/clinical-attributes"
    response = requests.get(endpoint, timeout=timeout)
    # Check the status unconditionally: the previous `== 200 ... else raise_for_status()` form
    # returned None without any error for a successful status other than 200.
    response.raise_for_status()
    return response.json()

In [9]:
# Study to query on cBioPortal
study_id = 'blca_msk_tcga_2020'

# Download the study's clinical-attribute metadata and report how many attributes came back
clinical_metadata = get_clinical_metadata(study_id)
print("Número de atributos clínicos:", len(clinical_metadata))

# Tabulate the metadata records as a pandas DataFrame
metadata_df = pd.DataFrame(clinical_metadata)

# Preview the first records, then list every clinical attribute identifier
print("Metadados clínicos:", metadata_df.head(), sep="\n")
print("Lista de atributos clínicos:", metadata_df['clinicalAttributeId'].to_list(), sep="\n")

Número de atributos clínicos: 40
Metadados clínicos:
                   displayName  \
0                Diagnosis Age   
1  AJCC Neoplasm Disease Stage   
2              Analysis Cohort   
3             Aneuploidy Score   
4          Buffa Hypoxia Score   

                                         description datatype  \
0  Age at which a condition or disease was first ...   NUMBER   
1  The extent of a cancer, especially whether the...   STRING   
2                                    Analysis Cohort   STRING   
3                                   Aneuploidy Score   NUMBER   
4  Hypoxia scores based on the Buffa mRNA abundan...   NUMBER   

   patientAttribute priority          clinicalAttributeId             studyId  
0              True        1                          AGE  blca_msk_tcga_2020  
1              True        1  AJCC_PATHOLOGIC_TUMOR_STAGE  blca_msk_tcga_2020  
2             False        1              ANALYSIS_COHORT  blca_msk_tcga_2020  
3             False        1   

In [10]:
# Load the mRNA expression table data_mrna_seq_v2_rsem.txt (tab-separated).
# In the recorded output the first two columns are Hugo_Symbol and Entrez_Gene_Id;
# the remaining columns are named after sample IDs.
data_mrnaseq = pd.read_csv('blca_msk_tcga_2020/data_mrna_seq_v2_rsem.txt', sep='\t')
data_mrnaseq

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,TCGA-2F-A9KP-01,TCGA-2F-A9KQ-01,TCGA-2F-A9KR-01,TCGA-2F-A9KT-01,TCGA-2F-A9KW-01,TCGA-4Z-AA7O-01,TCGA-4Z-AA7Q-01,TCGA-4Z-AA7S-01,...,TCGA-ZF-AA4X-01,TCGA-ZF-AA51-01,TCGA-ZF-AA52-01,TCGA-ZF-AA53-01,TCGA-ZF-AA54-01,TCGA-ZF-AA56-01,TCGA-ZF-AA58-01,TCGA-ZF-AA5H-01,TCGA-ZF-AA5N-01,TCGA-ZF-AA5P-01
0,,100130426,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.4411,0.4216,0.0000,1.2270,0.0000
1,,100133144,16.1382,13.3333,15.3523,14.0136,7.1159,16.6052,0.0000,23.2738,...,24.0360,0.0000,4.2992,0.0000,4.6306,5.8271,1.7032,3.3904,0.0000,7.0317
2,UBE2Q2P2,100134869,12.5759,10.7742,42.5810,17.6427,2.3091,8.6650,11.9205,13.7922,...,10.3812,5.2751,14.5465,6.8871,8.0213,7.8474,2.0911,1.1387,0.0000,4.0856
3,HMGB1P1,10357,137.8860,104.6780,146.4530,142.9620,176.8770,179.1990,96.9139,131.3010,...,58.4972,119.0880,91.4959,163.8500,83.2794,185.6730,210.1810,186.9380,155.3010,114.1130
4,,10431,882.2310,954.1030,487.2100,954.7770,1586.0300,768.7450,662.2520,892.3300,...,841.2210,859.0810,753.8280,1646.6900,1702.4300,575.2100,1595.7000,1100.5700,1123.9300,778.2100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20526,ZYG11A,440590,17.0620,1.8544,1.8995,6.7835,51.3143,0.0000,15.2318,3.6608,...,23.6118,11.3037,1.7668,36.5014,94.1296,2.2056,143.3390,0.3235,15.9509,13.3407
20527,ZYG11B,79699,1526.4300,1571.1600,748.8600,878.4620,1493.8700,870.4900,615.8940,1285.4100,...,1311.4600,1003.7700,624.2640,431.1300,523.2790,697.8390,711.2140,671.9240,358.2820,887.7150
20528,ZYX,7791,3105.2900,3275.3800,4465.6200,3449.4100,2778.3000,4226.7700,5065.5600,3836.5600,...,2098.6500,2724.9400,6375.7400,11268.6000,7956.4800,5666.0800,9911.0500,9212.5000,6306.7500,14836.0000
20529,ZZEF1,23140,1467.7500,750.5800,910.7880,850.1980,2811.2900,712.8850,938.4110,842.4510,...,1735.2700,666.1640,724.3820,337.4660,1099.7000,752.9770,745.3630,911.3190,850.3070,1030.0200
