In [None]:
# Update sklearn to prevent version mismatches
!conda install scikit-learn
!conda update scikit-learn
!conda install joblib 
!conda update joblib 

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Load the dataset from disk.
heart_raw_df = pd.read_csv("../Data/heart_clean.csv")

# Clean in one chain that starts from the frame just loaded:
#   1. drop columns in which every value is null,
#   2. then drop every row that still contains a null.
# Chaining guarantees the row drop is applied on top of the column drop.
heart_df = heart_raw_df.dropna(axis="columns", how="all").dropna()

# `df` is kept as an alias of the cleaned frame for any cell that still refers
# to the data by that name, so the notebook runs from a fresh kernel.
df = heart_df

heart_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,old_peak,slope,number_major_vessels,thalassemia,diagnosis
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Create a Train Test Split


In [4]:
from sklearn.model_selection import train_test_split

# Build the feature matrix and target from `heart_df`, the cleaned frame
# assigned in the loading/cleaning cell.
X = heart_df.drop(columns=["diagnosis"])
y = heart_df["diagnosis"]

# Fixed seed for a reproducible split; `stratify=y` keeps the class
# proportions of `diagnosis` the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

In [5]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,old_peak,slope,number_major_vessels,thalassemia
70,54,1,2,120,258,0,0,147,0,0.4,1,0,3
42,45,1,0,104,208,0,0,148,1,3.0,1,0,2
239,35,1,0,126,282,0,0,156,1,0.0,2,0,3
37,54,1,2,150,232,0,0,165,0,1.6,2,0,3
159,56,1,1,130,221,0,0,163,0,0.0,2,0,3


In [None]:
# Preview the first five training labels.
y_train.head(n=5)

# Pre-processing

Scale the data using the MinMaxScaler

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only and scale that split in the same
# step; the test split is then transformed with the training-derived ranges.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model

In [7]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on the scaled training features.
# `fit` returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Show the fitted estimator as the cell output.
model

LogisticRegression(max_iter=1000)

In [8]:
# Score the baseline model (estimator's default `score` metric) on both splits.
train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8325991189427313
Testing Data Score: 0.8421052631578947


# Hyperparameter Tuning
 

In [14]:
from sklearn.model_selection import GridSearchCV

# Estimator to tune; note this rebinds `model` to a new, unfitted classifier.
model = LogisticRegression(solver='liblinear')

# Candidate settings: 3 values of C x 2 penalties = 6 combinations.
param_grid = {
    'C': [1, 5, 10],
    'penalty': ["l1", "l2"],
}

# verbose=3 prints the score of every individual cross-validation fit.
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=3)

In [15]:
# Run the cross-validated search on the scaled training data.
grid.fit(X=X_train_scaled, y=y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ...................C=1, penalty=l1;, score=0.783 total time=   0.0s
[CV 2/5] END ...................C=1, penalty=l1;, score=0.739 total time=   0.0s
[CV 3/5] END ...................C=1, penalty=l1;, score=0.889 total time=   0.0s
[CV 4/5] END ...................C=1, penalty=l1;, score=0.867 total time=   0.0s
[CV 5/5] END ...................C=1, penalty=l1;, score=0.889 total time=   0.0s
[CV 1/5] END ...................C=1, penalty=l2;, score=0.761 total time=   0.0s
[CV 2/5] END ...................C=1, penalty=l2;, score=0.739 total time=   0.0s
[CV 3/5] END ...................C=1, penalty=l2;, score=0.911 total time=   0.0s
[CV 4/5] END ...................C=1, penalty=l2;, score=0.844 total time=   0.0s
[CV 5/5] END ...................C=1, penalty=l2;, score=0.867 total time=   0.0s
[CV 1/5] END ...................C=5, penalty=l1;, score=0.739 total time=   0.0s
[CV 2/5] END ...................C=5, penalty=l1;,

GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']}, verbose=3)

In [16]:
# Best parameter combination found, followed by its mean cross-validated score.
print(grid.best_params_, grid.best_score_, sep="\n")

{'C': 1, 'penalty': 'l1'}
0.8332367149758454


# Save the Model

In [17]:
# Save the fitted artifacts to disk.
import joblib

# The grid search was trained on MinMax-scaled features, so the fitted scaler
# is saved next to it; without it the saved model cannot be applied to raw,
# unscaled inputs.
scaler_filename = 'logistic_scaler.sav'
joblib.dump(X_scaler, scaler_filename)

# Save the fitted grid search under the same file name as before; this stays
# the last expression so the cell still displays the written path.
filename = 'logistic.sav'
joblib.dump(grid, filename)

['logistic.sav']