# Modelo de AVC (stroke)

# Variável resposta (target)

- stroke                - binária

# Variáveis "já prontas"

- heart_disease         - binária
- hypertension          - binária

# Variáveis tratadas

- ever_married          - binária
- Residence_type        - binária
- gender                - binária
- age                   - discreta
- work_type             - múltiplas categorias
- avg_glucose_level     - contínua
- bmi                   - contínua
- smoking_status        - múltiplas categorias

# Variáveis removidas

- id

In [None]:
import pandas as pd
import kagglehub
import os

# Download latest version
stroke = "fedesoriano/stroke-prediction-dataset"
hearth = "fedesoriano/heart-failure-prediction"

def carregar_df_kaggle(caminho):
    # carregando o conjunto de dados
    caminho = kagglehub.dataset_download(caminho)
    arquivo = os.listdir(caminho)[0]
    caminho_final = os.path.join(caminho, arquivo)
    df = pd.read_csv(caminho_final)

    # verificando descritiva
    print(df.describe())
    print(df.sample(5))
    return df

# dados de avc (stroke)
df_stroke = carregar_df_kaggle(stroke)

                 id          age  hypertension  heart_disease  \
count   5110.000000  5110.000000   5110.000000    5110.000000   
mean   36517.829354    43.226614      0.097456       0.054012   
std    21161.721625    22.612647      0.296607       0.226063   
min       67.000000     0.080000      0.000000       0.000000   
25%    17741.250000    25.000000      0.000000       0.000000   
50%    36932.000000    45.000000      0.000000       0.000000   
75%    54682.000000    61.000000      0.000000       0.000000   
max    72940.000000    82.000000      1.000000       1.000000   

       avg_glucose_level          bmi       stroke  
count        5110.000000  4909.000000  5110.000000  
mean          106.147677    28.893237     0.048728  
std            45.283560     7.854067     0.215320  
min            55.120000    10.300000     0.000000  
25%            77.245000    23.500000     0.000000  
50%            91.885000    28.100000     0.000000  
75%           114.090000    33.100000     0

## Utilização do sweetviz para EDA e tratamento dos dados de stroke

In [None]:
#!pip install sweetviz

In [None]:
#!pip install numpy==1.23.5

In [None]:
import sweetviz as sv

eda = sv.analyze(source = df_stroke, target_feat = 'stroke')
eda.show_notebook()

                                             |          | [  0%]   00:00 -> (? left)

Inicialmente foi realizada uma verificação de NAs (dados faltantes), essas observações foram dropadas.

In [None]:
print(df_stroke.isnull().sum())

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [None]:
nas = df_stroke.isnull().sum().sum()
total_observacoes = df_stroke.shape[0]
print(f'{round(nas/total_observacoes * 100, 2)}% das observações contém NA.')

3.93% das observações contém NA.


In [None]:
df_stroke = df_stroke.dropna()

- A variável ID foi dropada do conjunto de dados por sua irrelevância na predição de AVC.

- Na variável gênero, "Other" foi removido para manter parcimônia, já que só contem 1 observação.

In [None]:
df_stroke = df_stroke.drop(columns=["id"])
df_stroke = df_stroke[df_stroke['gender'] != 'Other']
df_stroke['gender'] = df_stroke['gender'].map({'Male': 1, 'Female': 0})

In [None]:
df_stroke['ever_married'] = df_stroke['ever_married'].map({'Yes': 1, 'No': 0})
df_stroke['Residence_type'] = df_stroke['Residence_type'].map({'Urban': 1, 'Rural': 0})

Aqui faremos a dummificazao de "work_type', juntamente com a retirada de "Never_worked", que representa uma quantidade ínfima de observações.

In [None]:
df_stroke = df_stroke[df_stroke['work_type'] != 'Never_worked']
df_stroke = pd.get_dummies(df_stroke, columns=['work_type'], drop_first=True)

Aqui também faremos a dumificação de "smoking_status", mantendo as observações de "Unknown" para evitar perda de informação.

In [None]:
df_stroke = pd.get_dummies(df_stroke, columns=['smoking_status'], drop_first=True)

O tratamento padrão para as variáveis contínuas (e até a discreta idade em anos completos) será a padronização, para manter essas variáveis em escala semelhante, com média 0 e desvio padrão 1.

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

colunas_novas = ['age_scaled', 'bmi_scaled', 'avg_glucose_level_scaled']
colunas_antigas = ['age', 'bmi', 'avg_glucose_level']

df_stroke[colunas_novas] = scaler.fit_transform(df_stroke[colunas_antigas])

Aqui serão feitas as transformações finais, dropando as colunas antigas e convertendo Trues e Falses em 1's e 0's

In [None]:
df_stroke = df_stroke.drop(columns=colunas_antigas)

In [None]:
colunas_booleanas = df_stroke.select_dtypes(include='bool').columns
df_stroke[colunas_booleanas] = df_stroke[colunas_booleanas].astype(int)

## Modelagem dos dados

In [None]:
!pip install lightgbm --quiet

# PyCaret

In [None]:
!pip install pycaret -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.8/106.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.8/21.8 MB[0m [31m74.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from pycaret.classification import *

In [None]:
import pandas as pd
df_stroke = pd.read_excel("df_stroke.xlsx")

In [None]:
setup(data = df_stroke, target = 'stroke')

Unnamed: 0,Description,Value
0,Session id,1130
1,Target,stroke
2,Target type,Binary
3,Original data shape,"(4886, 15)"
4,Transformed data shape,"(4886, 15)"
5,Transformed train set shape,"(3420, 15)"
6,Transformed test set shape,"(1466, 15)"
7,Numeric features,14
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7dfef1514b90>

In [None]:
modelos = compare_models( )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9573,0.8221,0.0,0.0,0.0,0.0,0.0,0.997
svm,SVM - Linear Kernel,0.9573,0.7048,0.0,0.0,0.0,0.0,0.0,0.039
ridge,Ridge Classifier,0.9573,0.814,0.0,0.0,0.0,0.0,0.0,0.031
dummy,Dummy Classifier,0.9573,0.5,0.0,0.0,0.0,0.0,0.0,0.027
rf,Random Forest Classifier,0.957,0.7551,0.0,0.0,0.0,-0.0005,-0.0011,0.338
ada,Ada Boost Classifier,0.9558,0.7684,0.0,0.0,0.0,-0.0026,-0.0044,0.181
knn,K Neighbors Classifier,0.955,0.5802,0.0067,0.05,0.0118,0.0068,0.0109,0.08
gbc,Gradient Boosting Classifier,0.9547,0.8054,0.0071,0.1,0.0133,0.0075,0.0167,0.499
et,Extra Trees Classifier,0.9526,0.7104,0.0071,0.0333,0.0118,0.0031,0.0027,0.273
lda,Linear Discriminant Analysis,0.952,0.8141,0.049,0.1758,0.0741,0.0609,0.0736,0.031


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [None]:
!pip install imbalanced-learn --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.6.1 which is incompatible.
sktime 0.26.0 requires scikit-learn<1.5.0,>=0.24, but you have scikit-learn 1.6.1 which is incompatible.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.11.4 which is incompatible.[0m[31m
[0m

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
from scipy.stats import randint
import numpy as np


X = df_stroke.drop('stroke', axis=1)
y = df_stroke['stroke']

smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)


X_trn, X_tst, y_trn, y_tst = train_test_split(
    X_res, y_res, test_size=0.15, stratify=y_res
)


modelo = RandomForestClassifier()

param_distribuicoes = {
    "n_estimators": randint(100, 600),
    "max_depth": randint(5, 60),
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 5),
    "bootstrap": [True, False]
}

scoring = {
    'f1': 'f1',
    'recall': 'recall',
    'accuracy': 'accuracy'
}

kfold = KFold(n_splits=5, shuffle=True)

random_search = RandomizedSearchCV(
    estimator       = modelo,
    param_distributions = param_distribuicoes,
    n_iter          = 100, #2
    scoring         = scoring,
    refit           = 'f1',
    cv              = kfold,
    verbose         = 1,
    n_jobs          = -1
)


random_search.fit(X_trn, y_trn)

print(f"Melhores hiperparâmetros encontrados: \n{random_search.best_params_}")

y_pred = random_search.best_estimator_.predict(X_tst)

acc   = accuracy_score(y_tst, y_pred)
rec   = recall_score(y_tst, y_pred)
f1    = f1_score(y_tst, y_pred)

print(f"Acuracia: {acc * 100:.2f}%")
print(f"Recall:   {rec * 100:.2f}%")
print(f"F1: {f1 * 100:.2f}%")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Melhores hiperparâmetros encontrados: 
{'bootstrap': False, 'max_depth': 26, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 317}
Acuracia: 95.30%
Recall:   97.58%
F1: 95.40%


## Armazenamento do df, scaler e modelo

In [None]:
import joblib
joblib.dump(scaler, 'scaler_variaveis.pkl')
df_stroke.to_excel("df_stroke.xlsx", index=False)
joblib.dump(random_search.best_estimator_, 'modelo_otimizado.pkl')
joblib.dump(random_search.best_estimator_, 'modelo_otimizado_comprimido.pkl', compress=3)

['modelo_otimizado_comprimido.pkl']