In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# models
from sklearn.linear_model import LogisticRegression, Perceptron, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Load the raw cardio dataset (semicolon-separated CSV), drop the row
# identifier and exact duplicate rows, and engineer a BMI feature
# (weight divided by the square of height/100).
# Every step returns a new frame, so nothing is mutated in place.
data = (
    pd.read_csv("cardio_train.csv", sep=";")
    .drop(columns="id")
    .drop_duplicates()
    .assign(bmi=lambda df: df["weight"] / (df["height"] / 100) ** 2)
)

# Remove rows whose blood-pressure readings exceed the upper cut-offs.
out_filter = (data["ap_hi"] > 250) | (data["ap_lo"] > 200)
data = data.loc[~out_filter]

# Remove rows with negative blood-pressure readings.
out_filter2 = (data["ap_hi"] < 0) | (data["ap_lo"] < 0)

# Convert age from days to whole years (truncating toward zero).
# `assign` builds a fresh frame, so the column is never written into a
# boolean-filtered slice -- the pattern pandas flags with
# SettingWithCopyWarning.
data = data.loc[~out_filter2].assign(age=lambda df: (df["age"] / 365).astype(int))

# Separate the target column from the feature matrix.
target_name = 'cardio'
data_target = data[target_name]
data = data.drop(columns=[target_name])

# Hold out 20% of the rows as the test set (seeded for reproducibility).
train, test, target, target_test = train_test_split(data, data_target, test_size=0.2, random_state=0)

# Carve a further 20% of the training rows out as a validation set.
Xtrain, Xval, Ztrain, Zval = train_test_split(train, target, test_size=0.2, random_state=0)


In [None]:
# Logistic Regression
#
# Fit on the full training split. max_iter is raised to 2000
# (presumably so the solver converges on the unscaled features -- TODO confirm).
logreg = LogisticRegression(max_iter=2000).fit(train, target)

# Display the fitted estimator as the cell output.
logreg



In [None]:
import joblib

In [None]:
# Persist the fitted logistic-regression model to the working directory;
# joblib.dump returns the list of file names it wrote.
joblib.dump(logreg,"logreg_model.pkl")

['logreg_model.pkl']

In [None]:
# Preview the first three training rows to check the feature columns
# (including the engineered bmi column).
train.head(3)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi
31684,58,2,160,76.0,130,80,1,1,0,1,1,29.6875
5634,40,2,184,70.0,120,80,1,1,0,0,1,20.675803
58675,64,1,158,76.0,120,80,1,1,0,0,1,30.443839


In [None]:
# Select the second training row; the double brackets keep the result a
# one-row DataFrame, which is the 2-D shape the predict calls below are given.
train.iloc[[1]]

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi
5634,40,2,184,70.0,120,80,1,1,0,0,1,20.675803


In [None]:
# NOTE(review): scratch cell -- these are the feature values of training row
# 5634 displayed above, pasted as a bare tuple expression. It is not assigned
# to anything, so this cell can likely be deleted.
40,2,184,70.0,120,80,1,1,0,0,1,20.675803

In [None]:
# Sanity check: predict the class of the first training row with the
# in-memory logistic-regression model.
logreg.predict(train.iloc[[0]])

array([1])

In [None]:
# Reload the persisted logistic-regression model from disk.
# joblib files are pickle-based: only load files from a trusted source.
model = joblib.load("logreg_model.pkl")

In [None]:
# Predict with the reloaded model on the second training row.
# NOTE(review): the in-memory model above was queried on row 0, so the two
# outputs are for different rows and do not verify the save/load round trip.
model.predict(train.iloc[[1]])

array([0])

In [None]:
# Support Vector Machines
#
# Fit an SVC with default hyperparameters on the training split, then
# report its training accuracy as a percentage rounded to two decimals.
svc = SVC().fit(train, target)
acc_svc = round(100 * svc.score(train, target), 2)
acc_svc

72.23

In [None]:
# Accuracy of the SVC on the held-out test split (percent, two decimals).
acc_test_svc = round(100 * svc.score(test, target_test), 2)
acc_test_svc

72.39

In [None]:
# Persist the fitted SVC to the working directory.
joblib.dump(svc,"svc_model.pkl")

['svc_model.pkl']

In [None]:
# Sanity check: predict the class of the first training row with the
# in-memory SVC.
svc.predict(train.iloc[[0]])

array([1])

In [None]:
# Reload the persisted SVC; this rebinds `model`, replacing the previously
# loaded estimator. Only load joblib/pickle files from a trusted source.
model = joblib.load("svc_model.pkl")

In [None]:
# Predict with the reloaded SVC on the third training row.
# NOTE(review): the in-memory SVC above was queried on row 0, so the two
# outputs are for different rows and do not verify the save/load round trip.
model.predict(train.iloc[[2]])

array([1])

In [None]:
# k-Nearest Neighbors algorithm
#
# Choose n_neighbors from {2, 3} with a 10-fold cross-validated grid search,
# then report the training accuracy (percent) and the selected parameters.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={'n_neighbors': [2, 3]},
    cv=10,
)
knn.fit(train, target)
acc_knn = round(100 * knn.score(train, target), 2)
print(acc_knn, knn.best_params_)

81.3 {'n_neighbors': 3}


In [None]:
# Accuracy of the tuned k-NN search on the held-out test split
# (percent, two decimals).
acc_test_knn = round(100 * knn.score(test, target_test), 2)
acc_test_knn

67.21

In [None]:
# Persist the fitted k-NN grid search object (not just the best estimator)
# to the working directory.
joblib.dump(knn,"knn_model.pkl")

['knn_model.pkl']

In [None]:
# Reload the persisted k-NN search object and predict the class of the third
# training row. Only load joblib/pickle files from a trusted source.
model = joblib.load("knn_model.pkl")
model.predict(train.iloc[[2]])

array([1])

In [None]:
# Decision Tree Classifier
#
# Fit an unconstrained decision tree on the training split and report its
# training accuracy (percent, two decimals).
# random_state is fixed (0, matching the data splits) because scikit-learn
# permutes the candidate features at each split, so an unseeded tree -- and
# every score derived from it -- can differ from run to run.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(train, target)
acc_decision_tree = round(decision_tree.score(train, target) * 100, 2)
acc_decision_tree

97.99

In [None]:
# Accuracy of the decision tree on the held-out test split
# (percent, two decimals).
acc_test_decision_tree = round(100 * decision_tree.score(test, target_test), 2)
acc_test_decision_tree

63.97

In [None]:
# Persist the fitted decision tree to the working directory.
joblib.dump(decision_tree,"decision_tree_model.pkl")

['decision_tree_model.pkl']

In [None]:
# Reload the persisted decision tree and predict the class of the eighth
# test row. Only load joblib/pickle files from a trusted source.
model = joblib.load("decision_tree_model.pkl")
model.predict(test.iloc[[7]])

array([1])

In [None]:
# Same test row through the in-memory tree, for comparison with the
# prediction of the reloaded model.
decision_tree.predict(test.iloc[[7]])

array([1])

In [None]:
# Random Forest
#
# Choose n_estimators from {100, 300} with a 5-fold cross-validated grid
# search, then report the training accuracy (percent) and the chosen setting.
# The search is fitted exactly once: a second `.fit` call would only repeat
# the whole search and double the runtime.
# random_state is fixed (0, matching the data splits) so the forest and the
# selected parameters are reproducible across runs.
random_forest = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid={'n_estimators': [100, 300]},
    cv=5,
).fit(train, target)
acc_random_forest = round(random_forest.score(train, target) * 100, 2)
print(acc_random_forest,random_forest.best_params_)

97.99 {'n_estimators': 300}


In [None]:
# Persist the fitted random-forest grid search object (not just the best
# estimator) to the working directory.
joblib.dump(random_forest,"random_forest_model.pkl")

['random_forest_model.pkl']

In [None]:
# Reload the persisted random-forest search object and predict the class of
# the eighth test row. Only load joblib/pickle files from a trusted source.
model = joblib.load("random_forest_model.pkl")
model.predict(test.iloc[[7]])

array([1])

In [None]:
# Accuracy of the tuned random forest on the held-out test split
# (percent, two decimals).
acc_test_random_forest = round(100 * random_forest.score(test, target_test), 2)
acc_test_random_forest

71.47

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier
from hyperopt import fmin, tpe, hp, space_eval

# Load your data
# Drop the id column and exact duplicate rows, then engineer a bmi feature
# (weight divided by the square of height/100). Every step returns a new
# frame, so nothing is mutated in place.
data = (
    pd.read_csv("cardio_train.csv", sep=";")
    .drop(columns="id")
    .drop_duplicates()
    .assign(bmi=lambda df: df["weight"] / (df["height"] / 100) ** 2)
)

# Filtering out ap_hi and ap_lo readings above the upper cut-offs
out_filter = (data["ap_hi"] > 250) | (data["ap_lo"] > 200)
data = data.loc[~out_filter]

# Filtering out negative ap_hi and ap_lo readings
out_filter2 = (data["ap_hi"] < 0) | (data["ap_lo"] < 0)

# Changing the age from days to whole years (truncating toward zero).
# `assign` builds a fresh frame, so the column is never written into a
# boolean-filtered slice -- the pattern pandas flags with
# SettingWithCopyWarning.
data = data.loc[~out_filter2].assign(age=lambda df: (df["age"] / 365).astype(int))

# Separating target for the training and testing dataset
target_name = 'cardio'
data_target = data[target_name]
data = data.drop(columns=[target_name])

# Separating test and training set (20% held out, seeded for reproducibility)
train, test, target, target_test = train_test_split(data, data_target, test_size=0.2, random_state=0)

# Define the hyperparameter optimization function
def hyperopt_xgb_score(params):
    """Hyperopt objective: negated mean 10-fold CV accuracy of an XGBoost model.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``XGBClassifier``.

    Returns
    -------
    float
        The negated mean cross-validation score, because ``fmin`` minimizes
        its objective.

    Raises
    ------
    Exception
        Any error raised while fitting or scoring a fold is propagated
        instead of being converted to NaN.
    """
    clf = XGBClassifier(**params)
    # By default cross_val_score replaces a failing fold with NaN, which makes
    # every trial's loss NaN and hides the real error. Raising instead stops
    # the search at the first broken configuration with the actual cause.
    fold_scores = cross_val_score(clf, train, target, cv=10, error_score="raise")
    current_score = fold_scores.mean()
    print(current_score, params)
    return -current_score  # Minimize the negative score

# Define the hyperparameter space
# Sampled entries are tuned by hyperopt; plain values are passed to
# XGBClassifier unchanged on every trial.
space_xgb = {
    # `eta` is XGBoost's alias for `learning_rate`; sampling both sent two
    # conflicting values for the same parameter, so only one is tuned.
    'learning_rate': hp.quniform('learning_rate', 0.01, 0.3, 0.01),
    'n_estimators': hp.choice('n_estimators', range(100, 1000)),
    'max_depth': hp.choice('max_depth', np.arange(2, 12, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 9, 0.025),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'tree_method': 'exact',
    'verbosity': 1,
    # Must be a float: `None` makes predict() fail inside inplace_predict
    # with an XGBoostError. NaN is XGBClassifier's own default marker.
    'missing': np.nan
}

# Run hyperparameter optimization: 10 TPE-guided evaluations of the objective.
best = fmin(
    fn=hyperopt_xgb_score,
    space=space_xgb,
    algo=tpe.suggest,
    max_evals=10,
)
print('Best hyperparameters:', best, sep='\n')

# Map hyperopt's raw result back onto the search space to obtain the actual
# keyword arguments for the classifier.
params = space_eval(space_xgb, best)
print(params)

# Train the final model on the full training split with those settings.
XGB_Classifier = XGBClassifier(**params).fit(train, target)

# Accuracy on the training and held-out test splits (percent, two decimals).
acc_XGB_Classifier = round(100 * XGB_Classifier.score(train, target), 2)
acc_test_XGB_Classifier = round(100 * XGB_Classifier.score(test, target_test), 2)

print(f"Training Accuracy: {acc_XGB_Classifier}%")
print(f"Test Accuracy: {acc_test_XGB_Classifier}%")


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 143, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1565, in predict
    class_probs = super().predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1186, in predict
    predts = self.get_booster().inplace_predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 2542, in inplace_predict
    _check_call(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 284, in _check_call
    raise XGBoostError(

nan
{'booster': 'gbtree', 'colsample_bytree': 0.7000000000000001, 'eta': 0.28, 'eval_metric': 'auc', 'gamma': 0.8, 'learning_rate': 0.27, 'max_depth': 10, 'min_child_weight': 8.325000000000001, 'missing': None, 'n_estimators': 123, 'objective': 'binary:logistic', 'subsample': 0.65, 'tree_method': 'exact', 'verbosity': 1}
 10%|█         | 1/10 [01:32<13:52, 92.55s/trial, best loss=?]

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 143, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1565, in predict
    class_probs = super().predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1186, in predict
    predts = self.get_booster().inplace_predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 2542, in inplace_predict
    _check_call(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 284, in _check_call
    raise XGBoostError(

nan
{'booster': 'gbtree', 'colsample_bytree': 0.8, 'eta': 0.07, 'eval_metric': 'auc', 'gamma': 0.8, 'learning_rate': 0.28, 'max_depth': 8, 'min_child_weight': 3.2, 'missing': None, 'n_estimators': 368, 'objective': 'binary:logistic', 'subsample': 0.75, 'tree_method': 'exact', 'verbosity': 1}
 20%|██        | 2/10 [05:00<21:22, 160.27s/trial, best loss=?]

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 143, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1565, in predict
    class_probs = super().predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1186, in predict
    predts = self.get_booster().inplace_predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 2542, in inplace_predict
    _check_call(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 284, in _check_call
    raise XGBoostError(

nan
{'booster': 'gbtree', 'colsample_bytree': 0.75, 'eta': 0.095, 'eval_metric': 'auc', 'gamma': 0.9500000000000001, 'learning_rate': 0.08, 'max_depth': 8, 'min_child_weight': 4.875, 'missing': None, 'n_estimators': 206, 'objective': 'binary:logistic', 'subsample': 0.6000000000000001, 'tree_method': 'exact', 'verbosity': 1}
 30%|███       | 3/10 [06:59<16:31, 141.69s/trial, best loss=?]

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 143, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1565, in predict
    class_probs = super().predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1186, in predict
    predts = self.get_booster().inplace_predict(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 2542, in inplace_predict
    _check_call(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 284, in _check_call
    raise XGBoostError(

 30%|███       | 3/10 [12:43<29:40, 254.39s/trial, best loss=?]


KeyboardInterrupt: 