In [58]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [59]:
# Load the dataset
# Read the Cleveland heart-disease CSV straight from its raw GitHub URL.
url = "https://raw.githubusercontent.com/dataprofessor/data/master/heart-disease-cleveland.csv"
data = pd.read_csv(filepath_or_buffer=url)

In [60]:
# Inspect the dataset
# Coerce every column to a numeric dtype (unparseable entries become NaN),
# then discard any row that contains a NaN.
data = data.apply(pd.to_numeric, errors='coerce').dropna()

# Show the first rows of the cleaned frame.
print("Dataset Overview:", data.head(), sep="\n")


Dataset Overview:
   age   sex   cp   trestbps   chol   fbs   restecg   thalach   exang  \
0   63     1    1        145    233     1         2       150       0   
1   67     1    4        160    286     0         2       108       1   
2   67     1    4        120    229     0         2       129       1   
3   37     1    3        130    250     0         0       187       0   
4   41     0    2        130    204     0         2       172       0   

    oldpeak   slope   ca   thal   diagnosis  
0       2.3       3  0.0    6.0           0  
1       1.5       2  3.0    3.0           2  
2       2.6       2  2.0    7.0           1  
3       3.5       3  0.0    3.0           0  
4       1.4       1  0.0    3.0           0  


In [61]:
# Summarize column dtypes and non-null counts.
print("\nSummary:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data.info()


Summary:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         297 non-null    int64  
 1    sex        297 non-null    int64  
 2    cp         297 non-null    int64  
 3    trestbps   297 non-null    int64  
 4    chol       297 non-null    int64  
 5    fbs        297 non-null    int64  
 6    restecg    297 non-null    int64  
 7    thalach    297 non-null    int64  
 8    exang      297 non-null    int64  
 9    oldpeak    297 non-null    float64
 10   slope      297 non-null    int64  
 11   ca         297 non-null    float64
 12   thal       297 non-null    float64
 13   diagnosis  297 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 34.8 KB
None


In [62]:
# Data Preprocessing
# Report the number of missing values in each column.
print("\nMissing values per column:", data.isnull().sum(), sep="\n")
# Trim surrounding whitespace from the column names.
data.columns = data.columns.map(str.strip)
# Simplify the diagnosis column to a binary label:
# any value greater than 0 becomes 1, everything else becomes 0.
data['diagnosis'] = data['diagnosis'].gt(0).astype('int64')


Missing values per column:
age           0
 sex          0
 cp           0
 trestbps     0
 chol         0
 fbs          0
 restecg      0
 thalach      0
 exang        0
 oldpeak      0
 slope        0
 ca           0
 thal         0
 diagnosis    0
dtype: int64


In [63]:
# Feature Scaling
# Standardize the continuous columns. The scaler is fitted on the training
# rows only, so that test-set statistics do not leak into the features the
# model is trained on; the fitted scaler is then applied to every row.
scaler = StandardScaler()
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# A non-stratified train_test_split chooses rows from the sample count and
# random_state alone, so splitting row positions here with the same
# test_size / random_state as the "Split the dataset" cell selects exactly
# the rows that end up in X_train. Keep these two values in sync with that cell.
train_positions, held_out_positions = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)

scaler.fit(data.iloc[train_positions][columns_to_scale])
data[columns_to_scale] = scaler.transform(data[columns_to_scale])

In [64]:
# Split the dataset
# Features are every column except the target; the target is 'diagnosis'.
X = data.drop(columns=['diagnosis'])
y = data['diagnosis']
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [65]:
# Train the model
# Logistic regression with a fixed seed, fitted on the training split.
model = LogisticRegression(random_state=42)
model.fit(X=X_train, y=y_train)


In [66]:
# Evaluate the model
# Predict labels for the held-out test features.
y_pred = model.predict(X=X_test)

In [67]:
# Print per-class precision, recall and F1 for the test-set predictions.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90        36
           1       0.84      0.88      0.86        24

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60



In [69]:
# Save the model
# Persist the fitted classifier to disk.
with open('heart_disease_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# The classifier was trained on standardized columns, so raw inputs must pass
# through the same fitted scaler before prediction. Save it next to the model;
# the model file itself is unchanged for existing consumers.
with open('heart_disease_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("\nModel saved as 'heart_disease_model.pkl'")
print("Scaler saved as 'heart_disease_scaler.pkl'")


Model saved as 'heart_disease_model.pkl'
