In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.under_sampling import RandomUnderSampler


In [2]:
df = pd.read_parquet('/home/bianka/PISI3_2022.2/data/dataset_renomeado.parquet')


In [3]:

df = df[['ventilado_apache','tipo_estadia_uti', 'd1_frequencia_cardiaca_maxima',
         'd1_frequencia_cardiaca_minima', 'h1_frequencia_respiratoria_maxima',
         'd1_spO2_minimo','d1_spO2_maximo','d1_temperatura_minima','d1_temperatura_maxima',
         'h1_frequencia_respiratoria_minima',
          'morte_hospital']]
df = df.dropna()


one_hot_encoded = pd.get_dummies(df['tipo_estadia_uti'])
df = pd.concat([df, one_hot_encoded], axis=1)


In [4]:
# Use o método 'value_counts' para contar os valores únicos na coluna 'morte_hospital'
contagem_morte_hospital = df['morte_hospital'].value_counts()

# Exiba os resultados
print("Volume de dados em relação ao valor 0 em morte_hospital:", contagem_morte_hospital[0])
print("Volume de dados em relação ao valor 1 em morte_hospital:", contagem_morte_hospital[1])

Volume de dados em relação ao valor 0 em morte_hospital: 77509
Volume de dados em relação ao valor 1 em morte_hospital: 7215


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84724 entries, 0 to 91712
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ventilado_apache                   84724 non-null  float64
 1   tipo_estadia_uti                   84724 non-null  object 
 2   d1_frequencia_cardiaca_maxima      84724 non-null  float64
 3   d1_frequencia_cardiaca_minima      84724 non-null  float64
 4   h1_frequencia_respiratoria_maxima  84724 non-null  float64
 5   d1_spO2_minimo                     84724 non-null  float64
 6   d1_spO2_maximo                     84724 non-null  float64
 7   d1_temperatura_minima              84724 non-null  float64
 8   d1_temperatura_maxima              84724 non-null  float64
 9   h1_frequencia_respiratoria_minima  84724 non-null  float64
 10  morte_hospital                     84724 non-null  int64  
 11  admit                              84724 non-null  uin

In [6]:
def categorizar_fc_maxima(fc):
    if fc < 60:
        return 'primeiras_24_horas_frequencia_cardiaca_maxima_muito_baixa'
    elif 60 <= fc < 70:
        return 'primeiras_24_horas_frequencia_cardiaca_maxima_baixa'
    elif 70 <= fc < 100:
        return 'primeiras_24_horas_frequencia_cardiaca_maxima_normal'
    elif 100 <= fc < 120:
        return 'primeiras_24_horas_frequencia_cardiaca_maxima_elevada'
    else:
        return 'primeiras_24_horas_frequencia_cardiaca_maxima_muito_elevada'

df['categoria_fc_maxima'] = df['d1_frequencia_cardiaca_maxima'].apply(categorizar_fc_maxima)

one_hot_encoded = pd.get_dummies(df['categoria_fc_maxima'])

df = pd.concat([df, one_hot_encoded], axis=1)

df = df.drop('categoria_fc_maxima', axis=1)

In [7]:
def categorizar_fr(fr):
    if fr < 12:
        return 'primeira_hora_frequencia_respiratoria_maxima_muito_baixa'
    elif 12 <= fr < 16:
        return 'primeira_hora_frequencia_respiratoria_maxima_baixa'
    elif 16 <= fr < 20:
        return 'primeira_hora_frequencia_respiratoria_maxima_normal'
    elif 20 <= fr < 24:
        return 'primeira_hora_frequencia_respiratoria_maxima_elevada'
    else:
        return 'primeira_hora_frequencia_respiratoria_maxima_muito elevada'

df['categoria_frequencia_respiratoria_maxima'] = df['h1_frequencia_respiratoria_maxima'].apply(categorizar_fr)

one_hot_encoded = pd.get_dummies(df['categoria_frequencia_respiratoria_maxima'])

df = pd.concat([df, one_hot_encoded], axis=1)

df = df.drop('categoria_frequencia_respiratoria_maxima', axis=1)



In [8]:
def categorizar_fr(fr):
    if fr < 12:
        return 'primeira_hora_frequencia_respiratoria_minima_muito_baixa'
    elif 12 <= fr < 16:
        return 'primeira_hora_frequencia_respiratoria_minima_baixa'
    elif 16 <= fr < 20:
        return 'primeira_hora_frequencia_respiratoria_minima_normal'
    elif 20 <= fr < 24:
        return 'primeira_hora_frequencia_respiratoria_minima_elevada'
    else:
        return 'primeira_hora_frequencia_respiratoria_minima_muito elevada'

df['categoria_frequencia_respiratoria_minima'] = df['h1_frequencia_respiratoria_minima'].apply(categorizar_fr)

one_hot_encoded = pd.get_dummies(df['categoria_frequencia_respiratoria_minima'])

df = pd.concat([df, one_hot_encoded], axis=1)

df = df.drop('categoria_frequencia_respiratoria_minima', axis=1)


In [9]:
df = df[~((df['d1_spO2_minimo'] <= 20))]

def categorize_saturacao(sat):
    if sat < 85:
        return 'saturacao_primieras_vinte_quatro_horas_minima_muito_baixa'
    elif sat < 90:
        return 'saturacao_primieras_vinte_quatro_horas_minima_baixa'
    elif sat < 95:
        return 'saturacao_primieras_vinte_quatro_horas_minima_moderada'
    elif sat < 100:
        return 'saturacao_primieras_vinte_quatro_horas_minima_boa'


df['categoria_d1_spO2_minimo'] = df['d1_spO2_minimo'].apply(categorizar_fr)

one_hot_encoded = pd.get_dummies(df['categoria_d1_spO2_minimo'])

df = pd.concat([df, one_hot_encoded], axis=1)

df = df.drop('categoria_d1_spO2_minimo', axis=1)

In [10]:
# apaga linhas com 
df = df[~((df['d1_spO2_maximo'] <= 20))]

def categorize_saturacao(sat):
    if sat < 85:
        return 'saturacao_primieras_vinte_quatro_horas_maxima_muito_baixa'
    elif sat < 90:
        return 'saturacao_primieras_vinte_quatro_horas_maxima_baixa'
    elif sat < 95:
        return 'saturacao_primieras_vinte_quatro_horas_maxima_moderada'
    elif sat < 100:
        return 'saturacao_primieras_vinte_quatro_horas_maxima_boa'


df['categoria_d1_spO2_maxima'] = df['d1_spO2_maximo'].apply(categorizar_fr)

one_hot_encoded = pd.get_dummies(df['categoria_d1_spO2_maxima'])

df = pd.concat([df, one_hot_encoded], axis=1)

df = df.drop('categoria_d1_spO2_maxima', axis=1)

In [11]:
def categorize_temperatura(temp):
    if temp < 35:
        return 'temperatura_minima_primieras_vinte_quatro_horas_hipotermia'
    if temp < 37:
        return 'temperatura_minima_primieras_vinte_quatro_horas_normal'
    if temp >= 37:
        return 'temperatura_minima_primieras_vinte_quatro_horas_febre'

df['categoria_d1_temperatura_minima'] = df['d1_temperatura_minima'].apply(categorize_temperatura)

one_hot_encoded = pd.get_dummies(df['categoria_d1_temperatura_minima'])

df = pd.concat([df, one_hot_encoded], axis=1)

df = df.drop('categoria_d1_temperatura_minima', axis=1)


In [12]:

def categorize_temperatura(temp_max):
    if temp_max < 35:
        return 'temperatura_minima_primieras_vinte_quatro_horas_hipotermia'
    if temp_max < 37:
        return 'temperatura_minima_primieras_vinte_quatro_horas_normal'
    if temp_max >= 37:
        return 'temperatura_minima_primieras_vinte_quatro_horas_febre'

df['categoria_d1_temperatura_minima'] = df['d1_temperatura_maxima'].apply(categorize_temperatura)

one_hot_encoded = pd.get_dummies(df['categoria_d1_temperatura_minima'])

df = pd.concat([df, one_hot_encoded], axis=1)

df = df.drop('categoria_d1_temperatura_minima', axis=1)


In [13]:
df = df.drop(['d1_frequencia_cardiaca_maxima',
       'd1_frequencia_cardiaca_minima', 'h1_frequencia_respiratoria_maxima',
       'd1_spO2_minimo','d1_spO2_maximo','d1_temperatura_minima','tipo_estadia_uti','h1_frequencia_respiratoria_minima'], axis=1)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84314 entries, 0 to 91712
Data columns (total 29 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   ventilado_apache                                             84314 non-null  float64
 1   d1_temperatura_maxima                                        84314 non-null  float64
 2   morte_hospital                                               84314 non-null  int64  
 3   admit                                                        84314 non-null  uint8  
 4   readmit                                                      84314 non-null  uint8  
 5   transfer                                                     84314 non-null  uint8  
 6   primeiras_24_horas_frequencia_cardiaca_maxima_baixa          84314 non-null  uint8  
 7   primeiras_24_horas_frequencia_cardiaca_maxima_elevada        84314 non-null 

In [15]:
y = df['morte_hospital']
X = df.drop(['morte_hospital'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)  
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

X_train_resampled, X_test, y_train_resampled, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Accuracy: 0.909891478384629
Confusion Matrix:
[[30292   621]
 [ 2418   395]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     30913
           1       0.39      0.14      0.21      2813

    accuracy                           0.91     33726
   macro avg       0.66      0.56      0.58     33726
weighted avg       0.88      0.91      0.89     33726



In [16]:
from imblearn.over_sampling import SMOTE

y = df['morte_hospital']
X = df.drop(['morte_hospital'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

smote = SMOTE(sampling_strategy='auto', random_state=42)  # Você pode ajustar o valor de 'sampling_strategy' conforme necessário
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Antes do SMOTE:")
print(pd.Series(y_train).value_counts())


print("Após o SMOTE:")
print(pd.Series(y_train_resampled).value_counts())


Antes do SMOTE:
0    51752
1     4738
Name: morte_hospital, dtype: int64
Após o SMOTE:
0    51752
1    51752
Name: morte_hospital, dtype: int64


In [17]:
y = df['morte_hospital']
X = df.drop(['morte_hospital'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

smote = SMOTE(sampling_strategy='auto', random_state=42)  
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train_resampled, y_train_resampled)

y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Accuracy: 0.8384629069560576
Confusion Matrix:
[[27276  3637]
 [ 1811  1002]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.88      0.91     30913
           1       0.22      0.36      0.27      2813

    accuracy                           0.84     33726
   macro avg       0.58      0.62      0.59     33726
weighted avg       0.88      0.84      0.86     33726



## Aplicando Random Over Sampling (ROS)

In [18]:
from imblearn.over_sampling import RandomOverSampler

y = df['morte_hospital']
X = df.drop(['morte_hospital'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

print("Antes do SMOTE:")
print(pd.Series(y_train).value_counts())


print("Após o SMOTE:")
print(pd.Series(y_train_resampled).value_counts())


Antes do SMOTE:
0    51752
1     4738
Name: morte_hospital, dtype: int64
Após o SMOTE:
0    51752
1    51752
Name: morte_hospital, dtype: int64


In [19]:
from imblearn.over_sampling import RandomOverSampler

y = df['morte_hospital']
X = df.drop(['morte_hospital'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train_resampled, y_train_resampled)

y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)



Accuracy: 0.7576054082903398
Confusion Matrix:
[[24137  6776]
 [ 1399  1414]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.78      0.86     30913
           1       0.17      0.50      0.26      2813

    accuracy                           0.76     33726
   macro avg       0.56      0.64      0.56     33726
weighted avg       0.88      0.76      0.81     33726



Adaptive Synthetic Sampling (ADASYN)


In [20]:
from imblearn.over_sampling import ADASYN

y = df['morte_hospital']
X = df.drop(['morte_hospital'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

adasyn = ADASYN(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train_resampled, y_train_resampled)

y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.7576054082903398
Confusion Matrix:
[[24137  6776]
 [ 1399  1414]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.78      0.86     30913
           1       0.17      0.50      0.26      2813

    accuracy                           0.76     33726
   macro avg       0.56      0.64      0.56     33726
weighted avg       0.88      0.76      0.81     33726



NearMiss

In [21]:
from imblearn.under_sampling import NearMiss

y = df['morte_hospital']
X = df.drop(['morte_hospital'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

nm = NearMiss()
X_train_resampled, y_train_resampled = nm.fit_resample(X, y)

random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

random_forest.fit(X_train_resampled, y_train_resampled)

y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Accuracy: 0.3763268694775544
Confusion Matrix:
[[10496 20417]
 [  617  2196]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.34      0.50     30913
           1       0.10      0.78      0.17      2813

    accuracy                           0.38     33726
   macro avg       0.52      0.56      0.34     33726
weighted avg       0.87      0.38      0.47     33726



In [22]:

from imblearn.combine import SMOTEENN

smoteenn = SMOTEENN(random_state=42)

X_resampled, y_resampled = smoteenn.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.33, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Matrix de Confusão:")
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Relatório de Classificação :")
print(class_report)


Accuracy: 0.9932113717491076
Matrix de Confusão:
[[20646    31]
 [  182 10517]]
Relatório de Classificação :
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     20677
           1       1.00      0.98      0.99     10699

    accuracy                           0.99     31376
   macro avg       0.99      0.99      0.99     31376
weighted avg       0.99      0.99      0.99     31376



In [23]:

from imblearn.combine import SMOTEENN


smoteenn = SMOTEENN(random_state=42)

X_resampled, y_resampled = smoteenn.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.33, random_state=42)


print(pd.Series(y).value_counts())

print(pd.Series(y_train_resampled).value_counts())

print(pd.Series(y_train).value_counts())


0    77242
1     7072
Name: morte_hospital, dtype: int64
0    7072
1    7072
Name: morte_hospital, dtype: int64
0    41881
1    21821
Name: morte_hospital, dtype: int64


In [None]:
[[4303    2150]
 [832   9617]]