In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score



In [21]:
df=pd.read_csv('heart_disease_uci.csv')

In [22]:
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [24]:
df.fillna(0,inplace=True)

In [25]:
# Convert categorical variables to numerical
for col in ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'num']:
    df[col] = pd.Categorical(df[col]).codes

In [26]:
# Define features and target variable
X = df.drop('num', axis=1)  # Features
y = df['num']  # Target variable

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:


# Initialize the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC()
}
X_train = X_train.drop('dataset', axis=1)
X_test = X_test.drop('dataset', axis=1)
# Train the models and evaluate on the test set
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    results[name] = accuracy

# Display the accuracy of each model
for name, accuracy in results.items():
    print(f'{name}: {accuracy:.4f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: 0.5543
Decision Tree: 0.5761
Random Forest: 0.6304
Gradient Boosting: 0.5978
Support Vector Machine: 0.5598


In [29]:
from sklearn.preprocessing import StandardScaler

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('Feature scaling completed.')

Feature scaling completed.


In [31]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search for Random Forest
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                              param_grid=rf_param_grid, 
                              cv=5, 
                              scoring='accuracy', 
                              n_jobs=-1)
rf_grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters for Random Forest: {rf_grid_search.best_params_}")
print(f"Best accuracy score for Random Forest: {rf_grid_search.best_score_:.4f}")

# Define the hyperparameter grid for Gradient Boosting
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7],
    'subsample': [0.5, 0.7, 1.0]
}

# Perform grid search for Gradient Boosting
gb_grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42),
                              param_grid=gb_param_grid,
                              cv=5,
                              scoring='accuracy',
                              n_jobs=-1)
gb_grid_search.fit(X_train_scaled, y_train)

print(f"Best parameters for Gradient Boosting: {gb_grid_search.best_params_}")
print(f"Best accuracy score for Gradient Boosting: {gb_grid_search.best_score_:.4f}")

Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best accuracy score for Random Forest: 0.6453
Best parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.7}
Best accuracy score for Gradient Boosting: 0.6535


In [32]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Initialize the CatBoost Classifier
catboost_model = CatBoostClassifier(verbose=0)

# Fit the model to the training data
catboost_model.fit(X_train, y_train)

# Predict on the test set
y_pred_catboost = catboost_model.predict(X_test)

# Calculate the accuracy
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)

print(f'CatBoost Model Accuracy: {accuracy_catboost}')

CatBoost Model Accuracy: 0.592391304347826
