In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv("heart disease classification dataset.csv")

In [3]:
data

Unnamed: 0.1,Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0,63,male,3,145.0,233.0,1,0,150.0,0,2.3,0,0,1,yes
1,1,37,male,2,130.0,250.0,0,1,187.0,0,3.5,0,0,2,yes
2,2,41,female,1,130.0,204.0,0,0,172.0,0,1.4,2,0,2,yes
3,3,56,male,1,120.0,236.0,0,1,178.0,0,0.8,2,0,2,yes
4,4,57,female,0,,354.0,0,1,163.0,1,0.6,2,0,2,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,298,57,female,0,140.0,241.0,0,1,123.0,1,0.2,1,0,3,no
299,299,45,male,3,110.0,264.0,0,1,132.0,0,1.2,1,0,3,no
300,300,68,male,0,144.0,193.0,1,1,141.0,0,3.4,1,2,3,no
301,301,57,male,0,,131.0,0,1,115.0,1,1.2,1,1,3,no


In [4]:
numeric_columns = data.select_dtypes(include='number')
data[numeric_columns.columns] = data[numeric_columns.columns].fillna(numeric_columns.mean())

In [5]:
# Check for missing values in all columns
missing_values = data.isnull().sum()

# Display columns with missing values, if any
print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [6]:
# Convert 'sex' column to numerical format (0 for female, 1 for male)
data['sex'] = data['sex'].map({'female': 0, 'male': 1})

# Convert 'target' column to numerical format (0 for no heart disease, 1 for heart disease)
data['target'] = data['target'].map({'no': 0, 'yes': 1})

# Display the first few rows of the dataset after transformation
print(data.head())

   Unnamed: 0  age  sex  cp    trestbps   chol  fbs  restecg  thalach  exang  \
0           0   63    1   3  145.000000  233.0    1        0    150.0      0   
1           1   37    1   2  130.000000  250.0    0        1    187.0      0   
2           2   41    0   1  130.000000  204.0    0        0    172.0      0   
3           3   56    1   1  120.000000  236.0    0        1    178.0      0   
4           4   57    0   0  131.712375  354.0    0        1    163.0      1   

   oldpeak  slope  ca  thal  target  
0      2.3      0   0     1       1  
1      3.5      0   0     2       1  
2      1.4      2   0     2       1  
3      0.8      2   0     2       1  
4      0.6      2   0     2       1  


In [7]:
# Remove the 'Unnamed: 0' column
data.drop('Unnamed: 0', axis=1, inplace=True)

# Display the columns after removal
print(data.columns)

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


In [8]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Normalize the data
data_normalized = scaler.fit_transform(data)
data_normalized = pd.DataFrame(data_normalized, columns=data.columns)
print(data_normalized.head())

        age  sex        cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.708333  1.0  1.000000  0.481132  0.244292  1.0      0.0  0.603053    0.0   
1  0.166667  1.0  0.666667  0.339623  0.283105  0.0      0.5  0.885496    0.0   
2  0.250000  0.0  0.333333  0.339623  0.178082  0.0      0.0  0.770992    0.0   
3  0.562500  1.0  0.333333  0.245283  0.251142  0.0      0.5  0.816794    0.0   
4  0.583333  0.0  0.000000  0.355777  0.520548  0.0      0.5  0.702290    1.0   

    oldpeak  slope   ca      thal  target  
0  0.370968    0.0  0.0  0.333333     1.0  
1  0.564516    0.0  0.0  0.666667     1.0  
2  0.225806    1.0  0.0  0.666667     1.0  
3  0.129032    1.0  0.0  0.666667     1.0  
4  0.096774    1.0  0.0  0.666667     1.0  


In [9]:

pip install --upgrade scikit-learn imbalanced-learn


Note: you may need to restart the kernel to use updated packages.


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

# Assuming 'data' is your dataset
# Split the data into features (X) and target variable (y)
selected_features = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope']
X = data[selected_features]
y = data['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize logistic regression classifier
logistic_classifier = LogisticRegression()

# Fit the classifier on the training data
logistic_classifier.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred = logistic_classifier.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Classifier Accuracy:", accuracy)

# Classification report
print("Classification Report for Logistic Regression Classifier:")
print(classification_report(y_test, y_pred))


Logistic Regression Classifier Accuracy: 0.8360655737704918
Classification Report for Logistic Regression Classifier:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        29
           1       0.87      0.81      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61



In [11]:
# Prompt the user for input values for each feature
input_age = float(input("Enter age: "))
input_sex = int(input("Enter sex (0 for female, 1 for male): "))
input_cp = int(input("Enter chest pain type (0-3): "))
input_trestbps = float(input("Enter resting blood pressure: "))
input_chol = float(input("Enter serum cholestoral in mg/dl: "))
input_fbs = int(input("Enter fasting blood sugar > 120 mg/dl (1 for yes, 0 for no): "))
input_restecg = int(input("Enter resting electrocardiographic results (0-2): "))
input_thalach = float(input("Enter maximum heart rate achieved: "))
input_exang = int(input("Enter exercise induced angina (1 for yes, 0 for no): "))
input_oldpeak = float(input("Enter ST depression induced by exercise relative to rest: "))
input_slope = int(input("Enter the slope of the peak exercise ST segment (0-2): "))

# Create a DataFrame with the input data
input_data = pd.DataFrame({
    'age': [input_age],
    'sex': [input_sex],
    'cp': [input_cp],
    'trestbps': [input_trestbps],
    'chol': [input_chol],
    'fbs': [input_fbs],
    'restecg': [input_restecg],
    'thalach': [input_thalach],
    'exang': [input_exang],
    'oldpeak': [input_oldpeak],
    'slope': [input_slope]
})

# Scale the input data using the same scaler as the training data
input_data_scaled = scaler.transform(input_data)

# Make predictions using the trained Logistic Regression classifier
prediction = logistic_classifier.predict(input_data_scaled)

# Output the prediction result
if prediction[0] == 1:
    print("The predicted outcome is YES.")
else:
    print("The predicted outcome is NO.")


Enter age: 52
Enter sex (0 for female, 1 for male): 1
Enter chest pain type (0-3): 0
Enter resting blood pressure: 2
Enter serum cholestoral in mg/dl: 122
Enter fasting blood sugar > 120 mg/dl (1 for yes, 0 for no): 1
Enter resting electrocardiographic results (0-2): 0
Enter maximum heart rate achieved: 255
Enter exercise induced angina (1 for yes, 0 for no): 1
Enter ST depression induced by exercise relative to rest: 1
Enter the slope of the peak exercise ST segment (0-2): 2
The predicted outcome is YES.
