## Importing data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data/heart.csv')
o2 = pd.read_csv('data/o2Saturation.csv')

## Preprocessing

In [3]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

In [4]:
o2.columns

Index(['98.6'], dtype='object')

In [5]:
o2 = o2.rename(columns={"98.6": "o2Saturation"})
data['o2Saturation'] = o2

In [6]:
cont_columns = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
cat_columns = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall', 'o2Saturation']

In [7]:
data1 = data
data1 = pd.get_dummies(data1, columns=cat_columns, drop_first=True)

In [8]:
X = data1.drop(['output'], axis=1)
y = data1['output']

In [9]:
scaler = RobustScaler()
X[cont_columns] = scaler.fit_transform(X[cont_columns])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [11]:
X_train.head()

Unnamed: 0,age,trtbps,chol,thalachh,oldpeak,sex_1,cp_1,cp_2,cp_3,fbs_1,...,caa_4,thall_1,thall_2,thall_3,o2Saturation_97.0,o2Saturation_97.1,o2Saturation_97.5,o2Saturation_98.0,o2Saturation_98.1,o2Saturation_98.6
62,-0.222222,-0.6,-0.850394,1.138462,-0.5,1,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
127,0.888889,1.1,0.582677,0.584615,-0.5,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
111,0.148148,1.0,-1.795276,0.615385,-0.375,1,0,1,0,1,...,0,0,0,1,0,0,1,0,0,0
287,0.148148,1.2,-0.125984,0.338462,-0.5,1,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
108,-0.37037,-0.5,0.062992,0.276923,0.1875,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0


## Selecting models

In [12]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, mean_absolute_error

In [13]:
# SVM
model = SVC(random_state=1)
train_score = model.fit(X_train, y_train).score(X_train, y_train)
preds = model.predict(X_test)
print(f'Train accuracy score (SVM): {train_score}')
print(f'Test accuracy score (SVM): {accuracy_score(y_test, preds)}')

Train accuracy score (SVM): 0.9380165289256198
Test accuracy score (SVM): 0.7868852459016393


In [14]:
# Random Forest
model = RandomForestClassifier(random_state=1)
train_score = model.fit(X_train, y_train).score(X_train, y_train)
preds = model.predict(X_test)
print(f'Train accuracy score (Random Forest): {train_score}')
print(f'Test accuracy score (Random Forest): {accuracy_score(y_test, preds)}')

Train accuracy score (Random Forest): 1.0
Test accuracy score (Random Forest): 0.7704918032786885


## Selecting best parameters (C, gamma, degree)

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
model = SVC()

parameters = {
    'C': [0.1, 0.3, 0.5, 1.0, 3.0, 10.0, 15.0, 30.0],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5],
    'degree': [2, 3, 4, 5]
}

searcher = GridSearchCV(model, parameters)
searcher.fit(X_train, y_train)

print(f"Best params = {searcher.best_params_}")
print(f"Best score = {searcher.best_score_}")

Best params = {'C': 30.0, 'degree': 2, 'gamma': 0.01}
Best score = 0.8641156462585033


## Predictions

In [17]:
preds = searcher.predict(X_test)
print(f'Test accuracy: {accuracy_score(preds, y_test)}')

Test accuracy: 0.8360655737704918
