In [1]:
import numpy as np
import pandas as pd
import os
import warnings
from sklearn.model_selection import train_test_split
from ClassificadorAlfa import ClassificadorAlfa
# Suppress every warning (all categories, all modules) for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the stroke dataset from the local data directory.
dataset_path = os.path.join('data', 'healthcare-dataset-stroke-data.csv')
df = pd.read_csv(dataset_path)

# Hipótese 1

In [3]:
# Split the target from the features.
# Remove the columns that are not used by this hypothesis. errors='ignore'
# skips columns that are already gone, so the cell stays re-runnable without a
# bare `except` that would also swallow unrelated failures (and without leaving
# the frame untouched when only some of the columns are missing).
df = df.drop(columns=['id', 'bmi', 'age', 'avg_glucose_level'], errors='ignore')

# Features are every remaining column except the target; the target is 'stroke'.
X = df.drop('stroke', axis=1)
y = df['stroke']

In [4]:
# Collect the categorical feature names: object-dtype columns first,
# followed by int64 columns, each group in the frame's column order.
object_features = [name for name, kind in X.dtypes.items() if kind == 'O']
int_features = [name for name, kind in X.dtypes.items() if kind == 'int64']
categorical_features = object_features + int_features

In [5]:
# Build the dummy-encoded feature matrix from the categorical columns only.
X = pd.get_dummies(X[categorical_features])

# The two integer flag columns are cast to booleans.
X = X.astype({'hypertension': 'bool', 'heart_disease': 'bool'})

# Keep the encoded column labels before the frame is turned into an array.
X_names = X.columns

# Recode label 0 as -1, then convert target and features to NumPy arrays.
y = y.replace(0, -1).to_numpy()
X = X.to_numpy()

In [6]:
# Hold out half of the rows for testing. A fixed random_state makes the split,
# and every metric derived from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=42
)

# Cast all four arrays to float64 so the linear-algebra steps below operate on
# a single numeric dtype.
X_train = X_train.astype('float64')
y_train = y_train.astype('float64')
X_test = X_test.astype('float64')
y_test = y_test.astype('float64')

In [7]:
# Initialise the model parameters.
# A seeded local generator makes the starting weights reproducible without
# touching NumPy's global random state.
rng = np.random.default_rng(42)
a = rng.standard_normal((X_train.shape[1], 1))  # column vector, one entry per column of X_train
b = 1.0

# NOTE: rebinding X_train to its transpose makes this cell non-idempotent;
# re-run the split cell before re-running this one, otherwise the shape of `a`
# no longer matches.
X_train = X_train.T
y_train = y_train.T

# Positional parameter list handed to the classifier, plus its hyperparameters.
parametros = [a, b, X_train, y_train]
learning_rate = 0.0001
num_iteracoes = 50000

In [8]:
# Build the classifier from the learning rate, the iteration count and the
# [a, b, X_train, y_train] parameter list assembled in the previous cell.
classificador = ClassificadorAlfa(learning_rate, num_iteracoes, parametros)

In [9]:
# Train the model. treinar() returns a pair that is unpacked into a and b
# (presumably the fitted weights and bias — confirm against ClassificadorAlfa).
# NOTE(review): the saved output shows this cell ended in KeyboardInterrupt,
# so the cells below were not executed in the saved run.
a, b = classificador.treinar()
a, b

KeyboardInterrupt: 

In [None]:
# Make predictions: multiply the transposed weights by the transposed test
# features and add b.
# NOTE(review): this yields raw linear scores, not -1/+1 labels, and displays
# the whole result — confirm the downstream accuracy helper expects scores.
ypred = a.T @ X_test.T + b
ypred

In [None]:
# Score the predictions against the held-out labels using the class-level
# acuracia helper, then report the result as a percentage.
# NOTE(review): ypred is passed as computed above (raw scores); presumably
# acuracia converts them to class labels — confirm against ClassificadorAlfa.
acuracia = ClassificadorAlfa.acuracia(y_test, ypred)
print(f'A acurácia do modelo foi de {acuracia*100:.2f}%')

In [None]:
# Rank the features by their learned weight, largest value first.
features = X_names
importances = pd.DataFrame(a, index=features, columns=['importance'])
importances = importances.sort_values('importance', ascending=False)
importances