# Supervised Learning and Machine Learning Algorithms

In [4]:
import numpy as np 
import pandas as pd 

df = pd.read_csv('./heart_failure_clinical_records_dataset.csv') #make sure to replace with your data directory
df.drop(columns=['time'],inplace=True) #drop the time column which is not one of the informative features 
df_features = df.iloc[:,:-1]
df_target = df['DEATH_EVENT']

X = df_features
y = df_target

In [5]:
X

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0
...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1


In [6]:
y

0      1
1      1
2      1
3      1
4      1
      ..
294    0
295    0
296    0
297    0
298    0
Name: DEATH_EVENT, Length: 299, dtype: int64

In [7]:
y.value_counts()

DEATH_EVENT
0    203
1     96
Name: count, dtype: int64

# Create train and test sets

We should divide the dataset to train and test splits. We train our ML algorithms on train dataset and evaluate their performance on test set

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,stratify=y,random_state=21)

In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print('y train: ',y_train.value_counts(normalize=True))
print('y test: ',y_test.value_counts(normalize=True))

(209, 11)
(209,)
(90, 11)
(90,)
y train:  DEATH_EVENT
0    0.679426
1    0.320574
Name: proportion, dtype: float64
y test:  DEATH_EVENT
0    0.677778
1    0.322222
Name: proportion, dtype: float64


A model with random guess which always predict the majority group has 68% accuracy. 

# Normalize train and test sets separately

In [42]:
from scipy.stats import zscore
X_train = zscore(X_train)
X_test = zscore(X_test)

# Logistic regression Model 

1- Establish your model 

2- Fit your model 

3- Predict on test set

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))


Accuracy: 0.7111111111111111


# Random Forest

## Exercise: Train a Random Forest model on train set and identify the accuracy on test split

In [46]:
#Write your code here
from sklearn.ensemble import RandomForestClassifier




Accuracy: 0.7333333333333333


# Support Vector Machines

In [33]:
SVC?

[31mInit signature:[39m
SVC(
    *,
    C=[32m1.0[39m,
    kernel=[33m'rbf'[39m,
    degree=[32m3[39m,
    gamma=[33m'scale'[39m,
    coef0=[32m0.0[39m,
    shrinking=[38;5;28;01mTrue[39;00m,
    probability=[38;5;28;01mFalse[39;00m,
    tol=[32m0.001[39m,
    cache_size=[32m200[39m,
    class_weight=[38;5;28;01mNone[39;00m,
    verbose=[38;5;28;01mFalse[39;00m,
    max_iter=-[32m1[39m,
    decision_function_shape=[33m'ovr'[39m,
    break_ties=[38;5;28;01mFalse[39;00m,
    random_state=[38;5;28;01mNone[39;00m,
)
[31mDocstring:[39m     
C-Support Vector Classification.

The implementation is based on libsvm. The fit time scales at least
quadratically with the number of samples and may be impractical
beyond tens of thousands of samples. For large datasets
consider using :class:`~sklearn.svm.LinearSVC` or
:class:`~sklearn.linear_model.SGDClassifier` instead, possibly after a
:class:`~sklearn.kernel_approximation.Nystroem` transformer or
other :ref:`kernel

In [49]:
#Linear SVM
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit(X_train,y_train)
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))

Accuracy: 0.7


# Excercise: Run SVM for kernel based SVM (rbf)

In [51]:
#Write your code here


# K-Nearest Neighbor

In [54]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train,y_train)
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))

Accuracy: 0.6666666666666666


# Dummy classifier

In [56]:
from sklearn.dummy import DummyClassifier
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(X_train,y_train)
preds = dummy_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))


Accuracy: 0.6777777777777778


In [57]:
DummyClassifier?

[31mInit signature:[39m DummyClassifier(*, strategy=[33m'prior'[39m, random_state=[38;5;28;01mNone[39;00m, constant=[38;5;28;01mNone[39;00m)
[31mDocstring:[39m     
DummyClassifier makes predictions that ignore the input features.

This classifier serves as a simple baseline to compare against other more
complex classifiers.

The specific behavior of the baseline is selected with the `strategy`
parameter.

All strategies make predictions that ignore the input feature values passed
as the `X` argument to `fit` and `predict`. The predictions, however,
typically depend on values observed in the `y` parameter passed to `fit`.

Note that the "stratified" and "uniform" strategies lead to
non-deterministic predictions that can be rendered deterministic by setting
the `random_state` parameter if needed. The other strategies are naturally
deterministic and, once fit, always return the same constant prediction
for any value of `X`.

Read more in the :ref:`User Guide <dummy_estimators>`