O notebook abaixo se refere a análise dos clusters gerado pelo K-means. Os datasets utilizados foram gerados pelos outros notebooks responsáveis pela extração dos dados e geração dos modelos. 

In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer
from missforest import MissForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap
from sklearn.manifold import trustworthiness
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture
import scipy.stats as stats


Os dados "worst_exams_24h" e "sofa_worst_24h" podem ser gerados a partir do notebook extract_exams_24h. Os dados "admissions" e "patients" são bases originais da MIMIC-IV. Por fim, "final_clusters" pode ser gerado pelo notebook final_model.

In [40]:
worst_exams_24h = pd.read_csv("../data/worst/worst_exams_24h_.csv")
sofa_worst_24h= pd.read_csv("../data/worst/sofa_worst_24h.csv")
admissions = pd.read_csv('/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/hosp/admissions.csv')
patients = pd.read_csv('/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/hosp/patients.csv')
final_clusters = pd.read_csv("final_clusters.csv")



#### Definindo pacientes

In [41]:
sepsis_patients = sofa_worst_24h[['subject_id','hadm_id']]

#### Análise da mortalidade

In [42]:
death_patients  = pd.merge(sepsis_patients,admissions,on=['subject_id','hadm_id'],how = 'left')

In [43]:
death_patients['died'] = death_patients['discharge_location'].apply(lambda x: 1 if x == "DIED" else 0)

In [44]:
death_patients = death_patients[['subject_id','hadm_id','died','deathtime']]

In [45]:
death_patients['Cluster'] = final_clusters['Cluster']

In [46]:
cluster_deaths = death_patients.groupby('Cluster')['died'].sum().reset_index()

# Renomeando a coluna para melhor entendimento
cluster_deaths.rename(columns={'died': 'Total Deaths'}, inplace=True)

# Exibindo o resultado
cluster_deaths

Unnamed: 0,Cluster,Total Deaths
0,0,1404
1,1,694
2,2,1346
3,3,661
4,4,765


In [47]:
# Contando o número total de pacientes em cada cluster
cluster_counts = death_patients.groupby('Cluster').size().reset_index(name='Total Patients')

# Calculando a taxa de mortalidade
cluster_deaths['Total Patients'] = cluster_counts['Total Patients']
cluster_deaths['Mortality Rate (%)'] = (cluster_deaths['Total Deaths'] / cluster_deaths['Total Patients']) * 100

# Exibindo o resultado
cluster_deaths


Unnamed: 0,Cluster,Total Deaths,Total Patients,Mortality Rate (%)
0,0,1404,5725,24.524017
1,1,694,10401,6.672435
2,2,1346,11008,12.227471
3,3,661,5681,11.635275
4,4,765,3766,20.31333


In [48]:
contingency_table = pd.crosstab(death_patients['Cluster'], death_patients['died'])

# Realizando o teste qui-quadrado
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

# Exibindo os resultados
print(f"Estatística Qui-Quadrado: {chi2}")
print(f"p-valor: {p_value}")

# Avaliando o resultado
alpha = 0.05
if p_value < alpha:
    print("Rejeitamos a hipótese nula. Há uma diferença significativa nas taxas de mortalidade entre os clusters.")
else:
    print("Falhamos em rejeitar a hipótese nula. Não há diferença significativa nas taxas de mortalidade entre os clusters.")

Estatística Qui-Quadrado: 1205.9399965534992
p-valor: 8.212442864730991e-260
Rejeitamos a hipótese nula. Há uma diferença significativa nas taxas de mortalidade entre os clusters.


#### Tempo na uti

In [49]:
icustays = pd.read_csv('/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/icu/icustays.csv')

In [50]:
icustays['intime'] = pd.to_datetime(icustays['intime'])
icustays['outtime'] = pd.to_datetime(icustays['outtime'])

# Calculando o tempo de UTI
icustays['uti_time'] = (icustays['outtime'] - icustays['intime']).dt.total_seconds() / 3600


In [51]:
uti_time = icustays[['subject_id','hadm_id','uti_time']]


In [52]:
uti_time = uti_time.groupby(['subject_id', 'hadm_id'], as_index=False)['uti_time'].sum()

In [53]:
uti_time  = pd.merge(sepsis_patients,uti_time,on=['subject_id','hadm_id'],how = 'left')

In [54]:
uti_time = uti_time.fillna(0)

In [55]:
uti_time['Cluster'] = final_clusters['Cluster']

In [56]:
def calculate_quantiles(group):
    q25 = group.quantile(0.25)
    q75 = group.quantile(0.75)
    return pd.Series({'Q25': q25, 'Q75': q75})

# Calcular Q25 e Q75
quantiles_df = uti_time.groupby('Cluster')['uti_time'].apply(calculate_quantiles).reset_index()

# Organizar as colunas
quantiles_df = quantiles_df.rename(columns={'uti_time': 'Q25', 'Q75': 'Q75'})
quantiles_df

Unnamed: 0,Cluster,level_1,Q25
0,0,Q25,38.872222
1,0,Q75,164.649722
2,1,Q25,17.235278
3,1,Q75,113.798056
4,2,Q25,21.153889
5,2,Q75,113.368958
6,3,Q25,33.611667
7,3,Q75,158.710278
8,4,Q25,47.673194
9,4,Q75,220.677639


In [57]:
print(cluster_stats.to_markdown())

NameError: name 'cluster_stats' is not defined

In [None]:
anova_result = stats.f_oneway(*(uti_time[uti_time['Cluster'] == cluster]['uti_time'] for cluster in uti_time['Cluster'].unique()))

# Exibindo o p-valor
print(f"ANOVA p-valor: {anova_result.pvalue}")

ANOVA p-valor: 4.481993416013831e-198


#### Análise do diagnóstico

In [None]:
diagnostics = pd.read_csv("../data/diagnostics.csv")

In [None]:
diagnostics = diagnostics[['subject_id', 'hadm_id','sepsis']]

In [None]:
diagnostics = diagnostics.groupby(['subject_id', 'hadm_id'], as_index=False)['sepsis'].max()


In [None]:
sepsis_diagnoses  = pd.merge(sepsis_patients,diagnostics,on=['subject_id','hadm_id'],how = 'left')

In [None]:
sepsis_diagnoses['Cluster'] = final_clusters['Cluster']

In [None]:
sepsis_diagnoses['sepsis'] = sepsis_diagnoses['sepsis'].apply(lambda x: 1 if x == True else 0)

In [None]:
# Contando o número total de pacientes em cada cluster
cluster_counts = sepsis_diagnoses.groupby('Cluster').size().reset_index(name='Total Patients')

# Contando o número de pacientes com sepsis em cada cluster
sepsis_counts = sepsis_diagnoses[sepsis_diagnoses['sepsis'] == True].groupby('Cluster').size().reset_index(name='Patients with Sepsis')

# Mesclando os dois DataFrames para obter uma tabela com total de pacientes e pacientes com sepsis
cluster_stats = pd.merge(cluster_counts, sepsis_counts, on='Cluster', how='left')

# Preenchendo valores NaN na coluna 'Patients with Sepsis' com 0 (caso não haja pacientes com sepsis em algum cluster)
cluster_stats['Patients with Sepsis'] = cluster_stats['Patients with Sepsis'].fillna(0).astype(int)

# Calculando a porcentagem de pacientes com sepsis para cada cluster
cluster_stats['Percentage with Sepsis (%)'] = (cluster_stats['Patients with Sepsis'] / cluster_stats['Total Patients']) * 100
cluster_stats

Unnamed: 0,Cluster,Total Patients,Patients with Sepsis,Percentage with Sepsis (%)
0,0,5725,3192,55.755459
1,1,10401,3694,35.515816
2,2,11008,5386,48.928052
3,3,5681,1491,26.245379
4,4,3766,990,26.287839


In [None]:
print(cluster_stats.to_markdown())

|    |   Cluster |   Total Patients |   Patients with Sepsis |   Percentage with Sepsis (%) |
|---:|----------:|-----------------:|-----------------------:|-----------------------------:|
|  0 |         0 |             5725 |                   3192 |                      55.7555 |
|  1 |         1 |            10401 |                   3694 |                      35.5158 |
|  2 |         2 |            11008 |                   5386 |                      48.9281 |
|  3 |         3 |             5681 |                   1491 |                      26.2454 |
|  4 |         4 |             3766 |                    990 |                      26.2878 |


#### Análise variáveis sofa

In [None]:
sofa_worst_24h['Cluster'] = final_clusters['Cluster']

In [None]:
# Agrupando por 'Cluster'
grouped = sofa_worst_24h.groupby('Cluster')

# Calculando a média e a mediana para cada coluna numérica
mean_df = grouped.mean().reset_index()
median_df = grouped.median().reset_index()

# Renomeando colunas para identificar origem
mean_df.columns = [f'{col}_mean' if col != 'Cluster' else col for col in mean_df.columns]
median_df.columns = [f'{col}_median' if col != 'Cluster' else col for col in median_df.columns]

# Mesclando os DataFrames de média e mediana
combined_df = pd.merge(mean_df, median_df, on='Cluster')
combined_df

Unnamed: 0,Cluster,Unnamed: 0_mean,subject_id_mean,hadm_id_mean,bilirubin_max_mean,creatinine_max_mean,liver_24hours_mean,cns_24hours_mean,renal_24hours_mean,sofa_24hours_mean,...,liver_24hours_median,cns_24hours_median,renal_24hours_median,sofa_24hours_median,pao2fio2ratio_novent_median,pao2fio2ratio_vent_median,platelet_min_median,respiration_24hours_median,coagulation_24hours_median,cardiovascular_24hours_median
0,0,18368.661135,15024530.0,25019280.0,5.776581,2.289339,1.505239,0.745285,1.572681,8.363307,...,2.0,0.0,1.0,8.0,187.5,195.5,116.0,0.0,0.0,0.0
1,1,18256.072589,14993960.0,24975910.0,0.978023,1.190158,0.130371,0.543273,0.523075,3.952381,...,0.0,0.0,0.0,3.0,194.0,222.5,176.0,0.0,0.0,0.0
2,2,18261.182685,14995320.0,24998760.0,1.190046,2.199058,0.244501,0.601089,1.315152,4.839036,...,0.0,0.0,1.0,4.0,174.0,209.5,194.0,0.0,0.0,0.0
3,3,18337.908643,15016000.0,25046190.0,1.121248,1.422494,0.185667,0.439516,0.739736,5.84824,...,0.0,0.0,0.0,5.0,195.0,188.0,147.0,0.0,0.0,0.0
4,4,18276.084971,14999670.0,25005440.0,1.453152,1.536344,0.253319,3.265003,0.895911,7.906267,...,0.0,3.0,0.0,7.0,197.5,210.0,169.0,0.0,0.0,0.0


In [None]:
grouped = sofa_worst_24h.groupby('Cluster')

# Iterar sobre as colunas numéricas (excluindo 'Cluster')
for col in sofa_worst_24h.columns:
    if col != 'Cluster':
        # Calcular mediana
        median = grouped[col].median()
        
        # Calcular quartis Q25 e Q75
        q25 = grouped[col].quantile(0.25)
        q75 = grouped[col].quantile(0.75)
        
        # Imprimir resultados
        print(f"Coluna: {col}")
        print("Mediana por Cluster:")
        print(median)
        print("Quartis Q25 e Q75 por Cluster:")
        for cluster in median.index:
            print(f"Cluster {cluster}:{q25[cluster]:.2f} - {q75[cluster]:.2f}")
        print("\n")

Coluna: Unnamed: 0
Mediana por Cluster:
Cluster
0    18434.0
1    18218.0
2    18190.5
3    18547.0
4    18086.5
Name: Unnamed: 0, dtype: float64
Quartis Q25 e Q75 por Cluster:
Cluster 0:9345.00 - 27325.00
Cluster 1:9162.00 - 27393.00
Cluster 2:9021.75 - 27460.25
Cluster 3:9242.00 - 27437.00
Cluster 4:9096.25 - 27580.75


Coluna: subject_id
Mediana por Cluster:
Cluster
0    15048951.0
1    14991624.0
2    14982919.0
3    15083733.0
4    14952781.0
Name: subject_id, dtype: float64
Quartis Q25 e Q75 por Cluster:
Cluster 0:12563013.00 - 17469778.00
Cluster 1:12504186.00 - 17488816.00
Cluster 2:12468016.00 - 17509933.50
Cluster 3:12531971.00 - 17503124.00
Cluster 4:12489753.50 - 17542631.00


Coluna: hadm_id
Mediana por Cluster:
Cluster
0    25056243.0
1    24958095.0
2    24970383.5
3    25067017.0
4    25056978.0
Name: hadm_id, dtype: float64
Quartis Q25 e Q75 por Cluster:
Cluster 0:22537178.00 - 27525395.00
Cluster 1:22485177.00 - 27484316.00
Cluster 2:22554160.00 - 27475490.50
Cluster 