## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Step 1: Load data

In [2]:
data = pd.read_csv("heart.csv")

In [3]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [4]:
data.shape

(1025, 14)

In [5]:
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [6]:
data.duplicated().sum()

723

In [7]:
data.drop_duplicates(inplace=True)

In [8]:
data.shape

(302, 14)

## Step 2: Separate Input and Output Features

In [9]:
X = data.drop("target", axis=1)
y = data.target

In [12]:
y.value_counts()

target
1    164
0    138
Name: count, dtype: int64

## Step 3: Train Test Split

In [13]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=43, test_size=0.20)

## Step 4: Model Selection

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
logr = LogisticRegression()

## Step 5: Model Fit

In [18]:
logr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
y_pred_train = logr.predict(X_train)

In [20]:
y_pred_train

array([1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0],
      dtype=int64)

### Manual Model Evolution

In [21]:
# Checking how may are correct by comparing the actual y_train and y_pred_train

count = 0
for t, p in zip(y_train, y_pred_train): 
    if t == p:
        count += 1
count

209

In [22]:
len(X_train)

241

In [23]:
count/len(X_train)

0.8672199170124482

## Step 6: Model Evaluation on Train Data

### Accuracy Score

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_pred_train)

0.8672199170124482

### Classification Report

In [25]:
from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.89      0.79      0.84       107
           1       0.85      0.93      0.89       134

    accuracy                           0.87       241
   macro avg       0.87      0.86      0.86       241
weighted avg       0.87      0.87      0.87       241



### Confusion Metrix

In [26]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, y_pred_train))

[[ 85  22]
 [ 10 124]]


## Step 7: Predict on Test Data

In [27]:
y_pred_test = logr.predict(X_test)

In [28]:
y_pred_test

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1], dtype=int64)

## Model Evaluation on Test Data

In [29]:
count = 0
for t, p in zip(y_test, y_pred_test): 
    if t == p:
        count += 1
count

47

In [30]:
len(X_test)

61

In [31]:
count/len(X_test)

0.7704918032786885

In [32]:
accuracy_score(y_test, y_pred_test)

0.7704918032786885

In [33]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.95      0.58      0.72        31
           1       0.69      0.97      0.81        30

    accuracy                           0.77        61
   macro avg       0.82      0.77      0.76        61
weighted avg       0.82      0.77      0.76        61



In [34]:
print(confusion_matrix(y_test, y_pred_test))

[[18 13]
 [ 1 29]]


In [36]:
#Overfitt
overfitt = 0.8672199170124482 - 0.7704918032786885
print(overfitt)

0.09672811373375967


## Cross Validation

### Simple Hold Out Validation Cross Validation
  - Training Data
    
        - Partial Train Data
        - Validation Data

---
  - Testing Data

In [37]:
from sklearn.model_selection import cross_val_score, KFold

In [38]:
num_fold = 5
kf = KFold(n_splits=num_fold, shuffle=True, random_state=2345)

In [45]:
cross_val_results = cross_val_score(logr, X, y, cv=kf)

In [42]:
cross_val_results

array([0.86885246, 0.78688525, 0.88333333, 0.83333333, 0.85      ])

In [43]:
cross_val_results.mean()

0.8444808743169399

In [44]:
import warnings
warnings.filterwarnings('ignore')