In [282]:
from typing import List
from typing import Tuple

import pandas as pd
import numpy as np

# Preprocessing package
from sklearn.model_selection import train_test_split

# Standardization
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

# Non-Linear transformation
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

# Normalization
from sklearn.preprocessing import Normalizer

# Standardization
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import Binarizer

# Functions

In [283]:
def preprocessing_numerical_variables(
    df_train: pd.DataFrame, variables: List[str], method: str
) -> Tuple[pd.DataFrame, object]:
    """
    Performs preprocessing on numerical variables in a training DataFrame.

    Args:
        df_train (pd.DataFrame): Training DataFrame containing the variables to be preprocessed.
        variables (List[str]): List of names of the numerical variables to be preprocessed.
        method (str): Preprocessing method to be used. Currently, only "StandardScaler" is supported.

    Returns:
        Tuple[pd.DataFrame, object]: Tuple containing the DataFrame with the preprocessed numerical variables and the preprocessing method object.

    Raises:
        ValueError: If the provided preprocessing method is not supported.
    """
    df = df_train.copy()

    

    if method == "StandardScaler":
        preprocessing_method = StandardScaler()
    else:
        raise ValueError("Unsupported preprocessing method.")

    X_train = df.loc[:, variables].values
    X_train_processed = preprocessing_method.fit_transform(X_train)

    df.loc[:, variables] = X_train_processed

    return df, preprocessing_method

In [284]:
# load data
csv_link = "https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv"
df = pd.read_csv(csv_link)
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [285]:
# size dataframe
df.shape

(299, 13)

In [286]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [287]:
print("No Death patients: ", df.loc[df["DEATH_EVENT"] == 0].shape[0])
print("Death patients: ", df.loc[df["DEATH_EVENT"] == 1].shape[0])

No Death patients:  203
Death patients:  96


Dataframe desbalanceado

In [288]:
variables = ["anaemia", "high_blood_pressure", "diabetes", "sex", "smoking"]

summary_list = []

for var in variables:
    # Calculando para toda a amostra
    full_sample_count = df[var].value_counts()
    full_sample_percent = df[var].value_counts(normalize=True) * 100

    # Calculando para pacientes mortos
    dead_sample_count = df[df["DEATH_EVENT"] == 1][var].value_counts()
    dead_sample_percent = (
        df[df["DEATH_EVENT"] == 1][var].value_counts(normalize=True) * 100
    )

    # Calculando para pacientes que sobreviveram
    survived_sample_count = df[df["DEATH_EVENT"] == 0][var].value_counts()
    survived_sample_percent = (
        df[df["DEATH_EVENT"] == 0][var].value_counts(normalize=True) * 100
    )

    for val in [0, 1]:
        temp_df = pd.DataFrame(
            {
                "Variable": var,
                "Bool": val,
                "Full Sample #": full_sample_count.get(val, 0),
                "Full Sample %": full_sample_percent.get(val, 0),
                "Dead Patients #": dead_sample_count.get(val, 0),
                "Dead Patients %": dead_sample_percent.get(val, 0),
                "Survived Patients #": survived_sample_count.get(val, 0),
                "Survived Patients %": survived_sample_percent.get(val, 0),
            },
            index=[0],
        )
        summary_list.append(temp_df)

# Fazendo print da tabela resumida
summary_df = pd.concat(summary_list, ignore_index=True)
summary_df

Unnamed: 0,Variable,Bool,Full Sample #,Full Sample %,Dead Patients #,Dead Patients %,Survived Patients #,Survived Patients %
0,anaemia,0,170,56.856187,50,52.083333,120,59.1133
1,anaemia,1,129,43.143813,46,47.916667,83,40.8867
2,high_blood_pressure,0,194,64.882943,57,59.375,137,67.487685
3,high_blood_pressure,1,105,35.117057,39,40.625,66,32.512315
4,diabetes,0,174,58.19398,56,58.333333,118,58.128079
5,diabetes,1,125,41.80602,40,41.666667,85,41.871921
6,sex,0,105,35.117057,34,35.416667,71,34.975369
7,sex,1,194,64.882943,62,64.583333,132,65.024631
8,smoking,0,203,67.892977,66,68.75,137,67.487685
9,smoking,1,96,32.107023,30,31.25,66,32.512315


# Preprocessing data

In [289]:
columns = df.columns
target = "DEATH_EVENT"
features = columns.drop(target)

# Especifique as colunas que são as variáveis independentes (X) e a variável dependente (y)
X = df.loc[:, features].copy()
y = df.loc[:, target].copy()  # substitua 'target' pelo nome da sua coluna alvo

# Divida os dados em conjunto de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# X_train: conjunto de treinamento de características (variáveis independentes)
# X_test: conjunto de teste de características (variáveis independentes)
# y_train: conjunto de treinamento do alvo (variável dependente)
# y_test: conjunto de teste do alvo (variável dependente)

# Verifique os tamanhos dos conjuntos de treinamento e teste
print("Tamanho do conjunto de treinamento:", len(X_train))
print("Tamanho do conjunto de teste:", len(X_test))

Tamanho do conjunto de treinamento: 209
Tamanho do conjunto de teste: 90


In [290]:
min_max_variables = ["age", "ejection_fraction", "time"]

standard_variables = [
    "creatinine_phosphokinase",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
]

# Aplicar a transformação logarítmica -> Assimetria Positiva ou Negativa
# X_train["log_serum_creatinine"] = np.log1p(X_train["serum_creatinine"])
# X_train["log_creatinine_phosphokinase"] = np.log1p(X_train["creatinine_phosphokinase"])

min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

X_train[min_max_variables] = min_max_scaler.fit_transform(
    X_train[min_max_variables].values
)
X_test[min_max_variables] = min_max_scaler.transform(X_test[min_max_variables].values)

X_train[standard_variables] = standard_scaler.fit_transform(
    X_train[standard_variables].values
)
X_test[standard_variables] = standard_scaler.transform(
    X_test[standard_variables].values
)

In [291]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Definir o espaço de busca para os hiperparâmetros
param_grid = {
    "n_estimators": [100, 200, 300],  # número de árvores na floresta
    "max_depth": [None, 5, 10],  # profundidade máxima de cada árvore
    "min_samples_split": [
        2,
        5,
        10,
    ],  # número mínimo de amostras necessárias para dividir um nó interno
    "min_samples_leaf": [
        1,
        2,
        4,
    ],  # número mínimo de amostras necessárias em um nó folha
}

# Instanciar o classificador Random Forest
rf = RandomForestClassifier()

# Realizar a busca em grade exaustiva com validação cruzada
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Obter os melhores hiperparâmetros encontrados
best_params = grid_search.best_params_

# Criar o classificador Random Forest com os melhores hiperparâmetros
best_rf = RandomForestClassifier(**best_params)

# Ajustar o classificador aos dados de treinamento
best_rf.fit(X_train, y_train)

# Obter o grau de importância dos atributos com o classificador otimizado
importance = best_rf.feature_importances_

# Criar um DataFrame com os atributos e suas importâncias
feature_importance = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": importance}
)
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

# Exibir o grau de importância dos atributos
feature_importance

Unnamed: 0,Feature,Importance
11,time,0.469917
7,serum_creatinine,0.174505
4,ejection_fraction,0.093704
0,age,0.080262
8,serum_sodium,0.060961
2,creatinine_phosphokinase,0.04919
6,platelets,0.034887
9,sex,0.010085
5,high_blood_pressure,0.007632
3,diabetes,0.006617


In [292]:
best_params

{'max_depth': None,
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'n_estimators': 100}