In [None]:
# Update sklearn to prevent version mismatches
!conda install scikit-learn
!conda update scikit-learn
!conda install joblib 
!conda update joblib 

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
heart_df = pd.read_csv("../Data/heart_clean.csv")
# Drop the null columns where all values are null
heart_df = heart_df.dropna(axis='columns', how='all')
# Drop the null rows
heart_df = heart_df.dropna()
heart_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,old_peak,slope,number_major_vessels,thalassemia,diagnosis
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Create a Train Test Split
 

In [7]:
from sklearn.model_selection import train_test_split

X = heart_df.drop(columns=["diagnosis"])
y = heart_df["diagnosis"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# Pre-processing

Scale the data using the MinMaxScaler

In [8]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model

In [9]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model2 = SVC(kernel='linear')
model2.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [10]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

Training Data Score: 0.8370044052863436
Testing Data Score: 0.8289473684210527


# Hyperparameter Tuning

Use `GridSearchCV` to tune model's parameters

In [11]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model2, param_grid, verbose=3)

In [12]:
grid.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .................C=1, gamma=0.0001;, score=0.717 total time=   0.0s
[CV 2/5] END .................C=1, gamma=0.0001;, score=0.717 total time=   0.0s
[CV 3/5] END .................C=1, gamma=0.0001;, score=0.889 total time=   0.0s
[CV 4/5] END .................C=1, gamma=0.0001;, score=0.844 total time=   0.0s
[CV 5/5] END .................C=1, gamma=0.0001;, score=0.867 total time=   0.0s
[CV 1/5] END ..................C=1, gamma=0.001;, score=0.717 total time=   0.0s
[CV 2/5] END ..................C=1, gamma=0.001;, score=0.717 total time=   0.0s
[CV 3/5] END ..................C=1, gamma=0.001;, score=0.889 total time=   0.0s
[CV 4/5] END ..................C=1, gamma=0.001;, score=0.844 total time=   0.0s
[CV 5/5] END ..................C=1, gamma=0.001;, score=0.867 total time=   0.0s
[CV 1/5] END ...................C=1, gamma=0.01;, score=0.717 total time=   0.0s
[CV 2/5] END ...................C=1, gamma=0.01;,

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

In [13]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 5, 'gamma': 0.0001}
0.8155555555555555


# Save the Model

In [14]:
# save the fitted model to file
import joblib
filename = 'svm.sav'
joblib.dump(grid, filename)

['svm.sav']