# Logistic Regression To Predict Heart Disease

## Libraries for ML

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data Loading and One Hot Encoding for Educational Level

In [11]:
dataset = pd.read_csv('framingham.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [4]:
print(X)

[[  1.    39.     4.   ...  26.97  80.    77.  ]
 [  0.    46.     2.   ...  28.73  95.    76.  ]
 [  1.    48.     1.   ...  25.34  75.    70.  ]
 ...
 [  0.    48.     2.   ...  22.    84.    86.  ]
 [  0.    44.     1.   ...  19.16  86.      nan]
 [  0.    52.     2.   ...  21.47  80.   107.  ]]


In [5]:
print(y)

[0 0 0 ... 0 0 0]


In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
column_transformer = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [2])], remainder = 'passthrough')
X = np.array(column_transformer.fit_transform(X))

In [15]:
print(X)

[[  0.     0.     0.   ...  26.97  80.    77.  ]
 [  0.     1.     0.   ...  28.73  95.    76.  ]
 [  1.     0.     0.   ...  25.34  75.    70.  ]
 ...
 [  0.     1.     0.   ...  22.    84.    86.  ]
 [  1.     0.     0.   ...  19.16  86.      nan]
 [  0.     1.     0.   ...  21.47  80.   107.  ]]


## Handling Missing Data

In [16]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
X = imputer.fit_transform(X)

In [17]:
print(X)

[[  0.           0.           0.         ...  26.97        80.
   77.        ]
 [  0.           1.           0.         ...  28.73        95.
   76.        ]
 [  1.           0.           0.         ...  25.34        75.
   70.        ]
 ...
 [  0.           1.           0.         ...  22.          84.
   86.        ]
 [  1.           0.           0.         ...  19.16        86.
   81.96675325]
 [  0.           1.           0.         ...  21.47        80.
  107.        ]]


## Split Training And Test Data

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [19]:
print(len(X_train))

3390


In [20]:
print(len(X_test))

848


In [21]:
print(len(y_train))

3390


In [22]:
print(len(y_test))

848


## Feature Scaling

In [26]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
X_train[:, 3:] = standard_scaler.fit_transform(X_train[:, 3:])
X_test[:, 3:] = standard_scaler.fit_transform(X_test[:, 3:])

In [27]:
print(X_train)

[[ 0.          1.          0.         ...  0.53279803 -0.48515562
  -0.3422011 ]
 [ 0.          1.          0.         ... -0.41644442 -1.98967246
  -0.13567854]
 [ 1.          0.          0.         ... -1.04845932 -0.8194927
  -0.30089659]
 ...
 [ 0.          0.          0.         ... -0.00892903 -0.06723427
  -0.38350562]
 [ 0.          0.          1.         ...  0.16432602  0.35068707
   3.33390045]
 [ 1.          0.          0.         ...  2.13113686 -0.31798708
   0.07084402]]


In [28]:
print(X_test)

[[ 1.          0.          0.         ...  0.81252071 -0.09603072
  -0.22894988]
 [ 1.          0.          0.         ...  0.09254046 -0.50401694
   0.14280521]
 [ 0.          0.          1.         ...  0.21127404 -0.91200316
  -0.04307233]
 ...
 [ 1.          0.          0.         ...  1.09040782  0.3119555
  -0.66266415]
 [ 0.          1.          0.         ...  0.37548006 -1.31998938
  -0.29090906]
 [ 0.          1.          0.         ...  0.2744302  -0.91200316
  -1.28225596]]


## Model Creation and Fitting Training Data

In [29]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression(random_state = 0)
regressor.fit(X_train, y_train)

LogisticRegression(random_state=0)

## Predicting Test Data

In [32]:
y_pred = regressor.predict(X_test)

## Regression Metrics

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8490566037735849