In [36]:
import numpy as np
import pandas as pd
import os
import warnings
from sklearn.model_selection import train_test_split
from ClassificadorAlfa import ClassificadorAlfa
warnings.filterwarnings('ignore')

In [37]:
df = pd.read_csv(os.path.join('data', 'healthcare-dataset-stroke-data.csv'))

# Hipotese 1

In [38]:
# Separando target e features
try:
    df.drop(['id', 'bmi', 'age', 'avg_glucose_level'], axis=1, inplace=True)
except:
    pass

X = df.drop('stroke', axis=1)
y = df['stroke']

In [39]:
# Obtendo variáveis categoricas
object_features = [feature for feature in X.columns if X[feature].dtype == 'O']
int_features = [feature for feature in X.columns if X[feature].dtype == 'int64']
categorical_features = object_features + int_features

In [40]:
# Obtendo variáveis dummy
X = X[categorical_features]

X = pd.get_dummies(X)
X.hypertension = X.hypertension.astype('bool')
X.heart_disease = X.heart_disease.astype('bool')

X_names = X.columns

y = y.replace(0,-1)

X = X.to_numpy()
y = y.to_numpy()

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
X_train = X_train.astype('float64')
y_train = y_train.astype('float64')
X_test = X_test.astype('float64')
y_test = y_test.astype('float64')

In [42]:
# Inicializando os parâmetros do modelo
a = np.random.randn(X_train.shape[1], 1)
b = 1.0

X_train = X_train.T
y_train = y_train.T

parametros = [a, b, X_train, y_train]
learning_rate = 0.0001
num_iteracoes = 50000

In [43]:
classificador = ClassificadorAlfa(learning_rate, num_iteracoes, parametros)

In [44]:
# Treinando o modelo
a, b = classificador.treinar()
a, b

(array([[ 0.59612979],
        [-0.49292033],
        [ 0.00366748],
        [ 0.05842838],
        [-0.45523878],
        [ 0.16514722],
        [-0.00106887],
        [-0.37409518],
        [ 0.96053848],
        [-0.63548678],
        [-0.68157617],
        [-0.9380222 ],
        [ 0.14410846],
        [ 0.15816368],
        [-0.1303328 ],
        [ 0.0600806 ],
        [-0.15685783],
        [-0.08046018]]),
 -0.41852987475703685)

In [48]:
# Fazendo previsões
ypred = a.T @ X_test.T + b
ypred

array([[-1.14572585, -0.87926316, -0.97909826, ..., -0.8714263 ,
        -1.11920083, -0.74801492]])

In [46]:
acuracia = ClassificadorAlfa.acuracia(y_test, ypred)
print(f'A acurácia do modelo foi de {acuracia*100:.2f}%')

A acurácia do modelo foi de 94.05%


In [49]:
# Selecionando as features mais importantes
features = X_names
importances = pd.DataFrame(data=np.abs(a), index=features, columns=['importance']).sort_values(by='importance', ascending=False)
importances

Unnamed: 0,importance
work_type_Never_worked,0.960538
work_type_children,0.938022
work_type_Self-employed,0.681576
work_type_Private,0.635487
hypertension,0.59613
heart_disease,0.49292
gender_Other,0.455239
work_type_Govt_job,0.374095
ever_married_No,0.165147
Residence_type_Urban,0.158164
