In [7]:
# Import dependencies
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
# Load in data
heart_disease_df = pd.read_csv('../heart_disease_2.csv')
heart_disease_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# Check the distribution of our target variable
heart_disease_df['HeartDisease'].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

In [4]:
# Encode target variable with Scikit Learn
object_columns = heart_disease_df.dtypes[heart_disease_df.dtypes == "object"].index.tolist()
le = LabelEncoder()
encoded_df = heart_disease_df.copy()

for column in object_columns:
    encoded_df[column] = le.fit_transform(encoded_df[column])
    
encoded_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [5]:
# Define the target and features
y = encoded_df['HeartDisease'].ravel()

X = encoded_df.drop('HeartDisease', axis = 1)

In [6]:
# Split in to test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 22)

In [8]:
# Create the model
model = LogisticRegression(solver = 'lbfgs', random_state = 22)

In [9]:
# Train the model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=22)

In [10]:
# Make predictions using the model
predictions = model.predict(X_test)

In [11]:
# Calculate confusion matrix
matrix = confusion_matrix(y_test, predictions)
matrix

array([[ 82,  17],
       [ 16, 115]], dtype=int64)

In [12]:
# Calculate accuracy score
accuracy_score = accuracy_score(y_test, predictions)
print(f"Accuracy score: {accuracy_score}")

Accuracy score: 0.8565217391304348


In [13]:
classification_report = classification_report(y_test, predictions)
print(classification_report)

              precision    recall  f1-score   support

           0       0.84      0.83      0.83        99
           1       0.87      0.88      0.87       131

    accuracy                           0.86       230
   macro avg       0.85      0.85      0.85       230
weighted avg       0.86      0.86      0.86       230

