In [1]:
from pathlib import Path
import os

path = Path(f'../input/')

In [2]:
import numpy as np
import pandas as pd

np.set_printoptions(linewidth=120)
pd.set_option('display.width', 120)

df = pd.read_csv(path/'heart_disease_uci.csv')
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,Female,VA Long Beach,asymptomatic,127.0,333.0,True,st-t abnormality,154.0,False,0.0,,,,1
916,917,62,Male,VA Long Beach,typical angina,,139.0,False,st-t abnormality,,,,,,,0
917,918,55,Male,VA Long Beach,asymptomatic,122.0,223.0,True,st-t abnormality,100.0,False,0.0,,,fixed defect,2
918,919,58,Male,VA Long Beach,asymptomatic,,385.0,True,lv hypertrophy,,,,,,,0


In [3]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [4]:
df.describe(include=['object'])

Unnamed: 0,sex,dataset,cp,fbs,restecg,exang,slope,thal
count,920,920,920,830,918,865,611,434
unique,2,4,4,2,3,2,3,3
top,Male,Cleveland,asymptomatic,False,normal,False,flat,normal
freq,726,304,496,692,551,528,345,196


In [5]:
target = 'num'

In [6]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.impute import SimpleImputer #, KNNImputer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [7]:
data_transformer = ColumnTransformer(transformers=[
    ("n0", StandardScaler(), ['age']),
    ("n1", make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()), ['trestbps', 'chol', 'thalch', 'oldpeak']),
    ("n2", make_pipeline(SimpleImputer(strategy='constant', fill_value=0.0), StandardScaler()), ['ca']),
    ('c1', make_pipeline(SimpleImputer(strategy='constant', fill_value='unknown'), OneHotEncoder(drop='if_binary')), ['slope', 'thal']),
    ('c2', make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder()), ['fbs', 'exang', 'restecg']),
    ('c3', OneHotEncoder(drop='if_binary'), ['sex', 'dataset', 'cp'])
])

In [8]:
classifier_pipline = Pipeline(
    steps=[("transformer", data_transformer), 
           ("classifier",  LogisticRegression())])

In [9]:
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(df, test_size=0.2, stratify=df[target])

In [10]:
print(data_train.shape, data_test.shape)

(736, 16) (184, 16)


In [11]:
classifier_pipline.fit(data_train, data_train[target])

preds = classifier_pipline.predict(data_test)

print(classification_report(data_test[target], preds))

              precision    recall  f1-score   support

           0       0.78      0.82      0.80        82
           1       0.47      0.55      0.50        53
           2       0.05      0.05      0.05        22
           3       0.29      0.19      0.23        21
           4       0.50      0.17      0.25         6

    accuracy                           0.55       184
   macro avg       0.42      0.35      0.37       184
weighted avg       0.54      0.55      0.54       184



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
import xgboost

estimators = [
    ("Random_Forest", RandomForestClassifier()),
    ("Xgboost", xgboost.XGBClassifier()),
    ("SVM", LinearSVC()),
    
]

# построим классификатор используя стекинг и наш список классификаторов
stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(n_jobs=-1, verbose=True),
    n_jobs=-1,
    verbose=True,
)

In [14]:
classifier_pipeline = Pipeline(
    steps=[("transformer", data_transformer),
           ("classifier", stacking_classifier)]
)

classifier_pipeline

In [18]:
classifier_pipeline.fit(data_train, data_train[target])

preds = classifier_pipeline.predict(data_test)

print(classification_report(data_test[target], preds))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           80     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.18455D+03    |proj g|=  3.10144D+02

At iterate   50    f=  6.64948D+02    |proj g|=  2.64004D-01

At iterate  100    f=  6.64913D+02    |proj g|=  2.20413D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   80    100    108      1     0     0   2.204D-02   6.649D+02
  F =   664.91322935261383     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
              precision    recall  f1-score   support

           0   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
import pickle

model_dir = Path('../models')

if not model_dir.exists():
    model_dir.mkdir()


with open(model_dir/"classifier_pipeline.pkl", "wb") as f:
    pickle.dump(classifier_pipeline, f)

In [23]:
with open(model_dir/"classifier_pipeline.pkl", 'rb') as f:
    pipeline_from_saved = pickle.load(f)

preds = pipeline_from_saved.predict(data_test)

print(classification_report(data_test[target], preds)) 

              precision    recall  f1-score   support

           0       0.76      0.79      0.77        82
           1       0.43      0.51      0.47        53
           2       0.33      0.32      0.33        22
           3       0.43      0.29      0.34        21
           4       0.00      0.00      0.00         6

    accuracy                           0.57       184
   macro avg       0.39      0.38      0.38       184
weighted avg       0.55      0.57      0.56       184



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
