O notebook abaixo se refere a análise dos clusters gerado pelo K-means. Os datasets utilizados foram gerados pelos outros notebooks responsáveis pela extração dos dados e geração dos modelos. 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer
#from missforest import MissForest
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import umap
from sklearn.manifold import trustworthiness
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture
import scipy.stats as stats


Os dados "worst_exams_24h" e "sofa_worst_24h" podem ser gerados a partir do notebook extract_exams_24h. Os dados "admissions" e "patients" são bases originais da MIMIC-IV. Por fim, "final_clusters" pode ser gerado pelo notebook final_model.

In [40]:
worst_exams_24h = pd.read_csv("../data/worst/worst_exams_24h_.csv")
sofa_worst_24h= pd.read_csv("../data/worst/sofa_worst_24h.csv")
admissions = pd.read_csv('/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/hosp/admissions.csv')
patients = pd.read_csv('/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/hosp/patients.csv')
final_clusters = pd.read_csv("final_clusters.csv")



#### Definindo pacientes

In [3]:
df_gmm = pd.read_csv('analiseGMM.csv')
df_gmm_intersecao = pd.read_csv('analiseGMM_intersecao.csv')

#### Análise da mortalidade

In [33]:
death_patients  = df_gmm_intersecao

In [34]:
death_patients['died'] = death_patients['unitdischargestatus'].apply(lambda x: 0 if x == "Alive" else 1)

In [35]:
death_patients = death_patients[['patientunitstayid','died','unitdischargeoffset','Cluster']]

In [36]:
cluster_deaths = death_patients.groupby('Cluster')['died'].sum().reset_index()

# Renomeando a coluna para melhor entendimento
cluster_deaths.rename(columns={'died': 'Total Deaths'}, inplace=True)

# Exibindo o resultado
cluster_deaths

Unnamed: 0,Cluster,Total Deaths
0,0,1270
1,1,538
2,2,1580


In [37]:
# Contando o número total de pacientes em cada cluster
cluster_counts = death_patients.groupby('Cluster').size().reset_index(name='Total Patients')

# Calculando a taxa de mortalidade
cluster_deaths['Total Patients'] = cluster_counts['Total Patients']
cluster_deaths['Mortality Rate (%)'] = (cluster_deaths['Total Deaths'] / cluster_deaths['Total Patients']) * 100

# Exibindo o resultado
cluster_deaths


Unnamed: 0,Cluster,Total Deaths,Total Patients,Mortality Rate (%)
0,0,1270,13816,9.192241
1,1,538,2354,22.854715
2,2,1580,13838,11.417835


In [38]:
contingency_table = pd.crosstab(death_patients['Cluster'], death_patients['died'])

# Realizando o teste qui-quadrado
chi2, p_value, dof, expected = stats.chi2_contingency(contingency_table)

# Exibindo os resultados
print(f"Estatística Qui-Quadrado: {chi2}")
print(f"p-valor: {p_value}")

# Avaliando o resultado
alpha = 0.05
if p_value < alpha:
    print("Rejeitamos a hipótese nula. Há uma diferença significativa nas taxas de mortalidade entre os clusters.")
else:
    print("Falhamos em rejeitar a hipótese nula. Não há diferença significativa nas taxas de mortalidade entre os clusters.")

Estatística Qui-Quadrado: 375.26916254833924
p-valor: 3.2459119769761574e-82
Rejeitamos a hipótese nula. Há uma diferença significativa nas taxas de mortalidade entre os clusters.


#### Tempo na uti

In [39]:
uti_time = df_gmm_intersecao[['patientunitstayid','unitdischargeoffset','Cluster']]

In [50]:
icustays['intime'] = pd.to_datetime(icustays['intime'])
icustays['outtime'] = pd.to_datetime(icustays['outtime'])

# Calculando o tempo de UTI
icustays['uti_time'] = (icustays['outtime'] - icustays['intime']).dt.total_seconds() / 3600


In [51]:
uti_time = icustays[['subject_id','hadm_id','uti_time']]


In [19]:
uti_time = uti_time.groupby(['patientunitstayid'], as_index=False)['unitdischargeoffset'].sum()

In [53]:
uti_time  = pd.merge(sepsis_patients,uti_time,on=['subject_id','hadm_id'],how = 'left')

In [26]:
uti_time = uti_time.fillna(0)

In [55]:
uti_time['Cluster'] = final_clusters['Cluster']

In [40]:
def calculate_quantiles(group):
    q25 = group.quantile(0.25)
    q75 = group.quantile(0.75)
    return pd.Series({'Q25': q25, 'Q75': q75})

# Calcular Q25 e Q75
quantiles_df = uti_time.groupby('Cluster')['unitdischargeoffset'].apply(calculate_quantiles).reset_index()

# Organizar as colunas
quantiles_df = quantiles_df.rename(columns={'unitdischargeoffset': 'Q25', 'Q75': 'Q75'})
quantiles_df

Unnamed: 0,Cluster,level_1,Q25
0,0,Q25,1619.0
1,0,Q75,6142.25
2,1,Q25,2461.0
3,1,Q75,11557.75
4,2,Q25,1537.25
5,2,Q75,5633.75


In [27]:
anova_result = stats.f_oneway(*(uti_time[uti_time['Cluster'] == cluster]['unitdischargeoffset'] for cluster in uti_time['unitdischargeoffset'].unique()))

# Exibindo o p-valor
print(f"ANOVA p-valor: {anova_result.pvalue}")

ANOVA p-valor: nan


  anova_result = stats.f_oneway(*(uti_time[uti_time['Cluster'] == cluster]['unitdischargeoffset'] for cluster in uti_time['unitdischargeoffset'].unique()))


#### Análise do diagnóstico

In [None]:
diagnostics = pd.read_csv("../data/diagnostics.csv")

In [None]:
diagnostics = diagnostics[['subject_id', 'hadm_id','sepsis']]

In [None]:
diagnostics = diagnostics.groupby(['subject_id', 'hadm_id'], as_index=False)['sepsis'].max()


In [None]:
sepsis_diagnoses  = pd.merge(sepsis_patients,diagnostics,on=['subject_id','hadm_id'],how = 'left')

In [None]:
sepsis_diagnoses['Cluster'] = final_clusters['Cluster']

In [None]:
sepsis_diagnoses['sepsis'] = sepsis_diagnoses['sepsis'].apply(lambda x: 1 if x == True else 0)

In [None]:
# Contando o número total de pacientes em cada cluster
cluster_counts = sepsis_diagnoses.groupby('Cluster').size().reset_index(name='Total Patients')

# Contando o número de pacientes com sepsis em cada cluster
sepsis_counts = sepsis_diagnoses[sepsis_diagnoses['sepsis'] == True].groupby('Cluster').size().reset_index(name='Patients with Sepsis')

# Mesclando os dois DataFrames para obter uma tabela com total de pacientes e pacientes com sepsis
cluster_stats = pd.merge(cluster_counts, sepsis_counts, on='Cluster', how='left')

# Preenchendo valores NaN na coluna 'Patients with Sepsis' com 0 (caso não haja pacientes com sepsis em algum cluster)
cluster_stats['Patients with Sepsis'] = cluster_stats['Patients with Sepsis'].fillna(0).astype(int)

# Calculando a porcentagem de pacientes com sepsis para cada cluster
cluster_stats['Percentage with Sepsis (%)'] = (cluster_stats['Patients with Sepsis'] / cluster_stats['Total Patients']) * 100
cluster_stats

Unnamed: 0,Cluster,Total Patients,Patients with Sepsis,Percentage with Sepsis (%)
0,0,5725,3192,55.755459
1,1,10401,3694,35.515816
2,2,11008,5386,48.928052
3,3,5681,1491,26.245379
4,4,3766,990,26.287839


In [None]:
print(cluster_stats.to_markdown())

|    |   Cluster |   Total Patients |   Patients with Sepsis |   Percentage with Sepsis (%) |
|---:|----------:|-----------------:|-----------------------:|-----------------------------:|
|  0 |         0 |             5725 |                   3192 |                      55.7555 |
|  1 |         1 |            10401 |                   3694 |                      35.5158 |
|  2 |         2 |            11008 |                   5386 |                      48.9281 |
|  3 |         3 |             5681 |                   1491 |                      26.2454 |
|  4 |         4 |             3766 |                    990 |                      26.2878 |


#### Análise variáveis sofa

In [41]:
sofa_worst_24h = df_gmm_intersecao[['sofascore','Cluster']]

In [42]:
# Agrupando por 'Cluster'
grouped = sofa_worst_24h.groupby('Cluster')

# Calculando a média e a mediana para cada coluna numérica
mean_df = grouped.mean().reset_index()
median_df = grouped.median().reset_index()

# Renomeando colunas para identificar origem
mean_df.columns = [f'{col}_mean' if col != 'Cluster' else col for col in mean_df.columns]
median_df.columns = [f'{col}_median' if col != 'Cluster' else col for col in median_df.columns]

# Mesclando os DataFrames de média e mediana
combined_df = pd.merge(mean_df, median_df, on='Cluster')
combined_df

Unnamed: 0,Cluster,sofascore_mean,sofascore_median
0,0,4.982388,5.0
1,1,5.131037,5.000524
2,2,4.976128,5.0


In [43]:
grouped = sofa_worst_24h.groupby('Cluster')

# Iterar sobre as colunas numéricas (excluindo 'Cluster')
for col in sofa_worst_24h.columns:
    if col != 'Cluster':
        # Calcular mediana
        median = grouped[col].median()
        
        # Calcular quartis Q25 e Q75
        q25 = grouped[col].quantile(0.25)
        q75 = grouped[col].quantile(0.75)
        
        # Imprimir resultados
        print(f"Coluna: {col}")
        print("Mediana por Cluster:")
        print(median)
        print("Quartis Q25 e Q75 por Cluster:")
        for cluster in median.index:
            print(f"Cluster {cluster}:{q25[cluster]:.2f} - {q75[cluster]:.2f}")
        print("\n")

Coluna: sofascore
Mediana por Cluster:
Cluster
0    5.000000
1    5.000524
2    5.000000
Name: sofascore, dtype: float64
Quartis Q25 e Q75 por Cluster:
Cluster 0:4.50 - 5.38
Cluster 1:4.68 - 5.53
Cluster 2:4.56 - 5.38


