In [19]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn import preprocessing

In [2]:
df = pd.read_csv('../data/cleaned_heart.csv')

In [5]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140.0,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160.0,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130.0,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138.0,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150.0,195.0,0,Normal,122,N,0.0,Up,0


In [17]:
from sklearn.model_selection import train_test_split
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75)

In [36]:
categorical_preprocessing = Pipeline([('ohe', OneHotEncoder())])
numerical_preprocessing = Pipeline([('imputer', KNNImputer(n_neighbors=5))])
preprocess = ColumnTransformer([
    ('categorical_preprocessing', categorical_preprocessing, ['ChestPainType','RestingECG','ST_Slope','Sex','FastingBS','ExerciseAngina']),
    ('numerical_preprocessing', numerical_preprocessing, ['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']),
    ('scaler', StandardScaler(), ['Age','RestingBP','Cholesterol','MaxHR','Oldpeak'])
])

In [39]:
pipeline_baseline = Pipeline([
    ('preprocess', preprocess),
    ('lr', LogisticRegression(max_iter=500))
])

In [40]:
pipeline_baseline.fit(X_train, y_train)

In [41]:
y_predicted = pipeline_baseline.predict(X_test)

In [45]:
#metrics from baseline
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
r2 = r2_score(y_test, y_predicted)
acc = accuracy_score(y_test, y_predicted)
recal = recall_score(y_test, y_predicted)
precision = precision_score(y_test, y_predicted)
print('r2 score:', r2)
print('accuracy score:', acc)
print('recall score:', recal)
print('precision score:', precision)

r2 score: 0.38148712737127377
accuracy score: 0.8461538461538461
recall score: 0.9024390243902439
precision score: 0.826302729528536


In [47]:
import pickle
pickle.dump(pipeline_baseline, open( "../models/baseline_pipeline", "wb" ) )