# Imports

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import SelectFromModel

# Load & Process Data

In [2]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Omdena School/ML For Smart Health/mitbih_train.csv', header=None)
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Omdena School/ML For Smart Health/mitbih_test.csv', header=None)

In [3]:
# check for null values
print(f"empty values in train: {train_df.isna().sum().sum()}")
print(f"empty values in test: {test_df.isna().sum().sum()}")

empty values in train: 0
empty values in test: 0


In [4]:
# split the features from the labels
train_y = train_df.iloc[:,-1]
train_X = train_df.drop(train_df.columns[-1], axis=1)

test_y = test_df.iloc[:,-1]
test_X = test_df.drop(test_df.columns[-1], axis=1)

In [5]:
train_X.describe()
# we can see that the data has already been scaled between 0 and 1.
# we can also see that the mean values are very small for a lot (but not all) of the features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186
count,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,...,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0,87554.0
mean,0.89036,0.75816,0.423972,0.219104,0.201127,0.210399,0.205808,0.201773,0.198691,0.196757,0.198778,0.20355,0.208776,0.212885,0.218393,0.224966,0.231377,0.237123,0.242582,0.247923,0.253749,0.259748,0.266244,0.272734,0.279355,0.285588,0.291808,0.297672,0.303384,0.308795,0.31389,0.318454,0.322087,0.324891,0.326737,0.327817,0.327931,0.326746,0.324326,0.320537,...,0.031857,0.030729,0.029582,0.028531,0.027573,0.026875,0.025771,0.024823,0.023932,0.02306,0.022458,0.021809,0.021245,0.020623,0.020026,0.019534,0.019053,0.018592,0.017664,0.01674,0.015722,0.014402,0.013024,0.011625,0.010262,0.008929,0.008056,0.007278,0.006531,0.005981,0.005479,0.005025,0.004628,0.004291,0.003945,0.003681,0.003471,0.003221,0.002945,0.002807
std,0.240909,0.221813,0.227305,0.206878,0.177058,0.171909,0.178481,0.17724,0.171778,0.168357,0.171796,0.176496,0.180274,0.184101,0.186963,0.190002,0.193899,0.198465,0.202855,0.207166,0.211187,0.214821,0.21845,0.221486,0.224031,0.225848,0.227133,0.22783,0.228436,0.228871,0.22876,0.228393,0.227472,0.22618,0.224659,0.22311,0.221352,0.219258,0.216884,0.214168,...,0.118013,0.116516,0.114463,0.112686,0.111103,0.110513,0.108566,0.106493,0.104591,0.102658,0.101395,0.100201,0.099386,0.098327,0.096635,0.095729,0.095368,0.095055,0.092902,0.091118,0.088574,0.084638,0.079949,0.075307,0.069892,0.063994,0.060074,0.056404,0.05284,0.050006,0.046693,0.044154,0.042089,0.040525,0.038651,0.037193,0.036255,0.034789,0.032865,0.031924
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.921922,0.682486,0.250969,0.048458,0.082329,0.088416,0.073333,0.066116,0.065,0.068639,0.070543,0.069182,0.068293,0.067744,0.070175,0.072993,0.074803,0.075972,0.076923,0.077872,0.079442,0.081911,0.085938,0.090032,0.094595,0.098901,0.10396,0.109348,0.114754,0.12073,0.127946,0.135962,0.144295,0.151852,0.157895,0.161035,0.161133,0.159383,0.157343,0.155388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.991342,0.826013,0.429472,0.166,0.147878,0.158798,0.145324,0.144424,0.15,0.148734,0.145985,0.14859,0.152951,0.156863,0.162636,0.169399,0.174603,0.178095,0.182683,0.1875,0.194595,0.20332,0.21229,0.221656,0.230179,0.238224,0.24645,0.254588,0.262767,0.26964,0.27577,0.279006,0.281879,0.285412,0.288538,0.289701,0.289104,0.284314,0.27763,0.26938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.910506,0.578767,0.341727,0.258993,0.287628,0.298237,0.295391,0.290832,0.283636,0.287781,0.293367,0.303079,0.310992,0.316505,0.321809,0.328395,0.337449,0.347711,0.358127,0.36998,0.380402,0.390512,0.397552,0.404295,0.411821,0.421581,0.429688,0.434783,0.437034,0.437203,0.434868,0.430931,0.427273,0.426084,0.426699,0.428336,0.430604,0.432432,0.431188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# convert numeric labels to strings so that the models know it's multiclass not a continuous variable
train_y.replace([0.0, 1.0, 2.0, 3.0, 4.0], ["beat0", "beat1", "beat2", "beat3", "beat4"], inplace=True)
test_y.replace([0.0, 1.0, 2.0, 3.0, 4.0], ["beat0", "beat1", "beat2", "beat3", "beat4"], inplace=True)

In [7]:
# test how balanced the classes are and whether they are evenly distributed across train and test sets
print(f"train labels distribution:\n{train_y.value_counts(normalize=True)}")
print(f"\n\ntest labels distribution:\n{test_y.value_counts(normalize=True)}")

# we can see that they have been stratified


train labels distribution:
beat0    0.827729
beat4    0.073452
beat2    0.066108
beat1    0.025390
beat3    0.007321
Name: 187, dtype: float64


test labels distribution:
beat0    0.827608
beat4    0.073451
beat2    0.066143
beat1    0.025397
beat3    0.007400
Name: 187, dtype: float64


In [8]:
# Use a RandomForest Classifier to identify the 20 most important features. 
# Using all 187 makes training take a very long time and does not contribute much more accuracy
feature_selector = SelectFromModel(RandomForestClassifier(), max_features=20)
feature_selector.fit(train_X, train_y)
train_X = feature_selector.transform(train_X)
test_X = feature_selector.transform(test_X)

# Train and Evaluate Models

In [9]:
# get a quick baseline with Naive Bayes
model = MultinomialNB() 
model.fit(train_X, train_y)
y_predicted= model.predict(test_X)
print(f"accuracy score: {(accuracy_score(test_y, y_predicted) * 100):.2f}\n")
print("confusion matrix: ")
confusion_matrix(test_y, y_predicted, labels=["beat0", "beat1", "beat2", "beat3", "beat4"])

accuracy score: 82.72

confusion matrix: 


array([[18102,     0,    16,     0,     0],
       [  556,     0,     0,     0,     0],
       [ 1442,     0,     6,     0,     0],
       [  162,     0,     0,     0,     0],
       [ 1607,     0,     1,     0,     0]])

In [10]:
# train the classifier and evaluate the performance in the test set
model = LogisticRegression(max_iter=1000)
model.fit(train_X, train_y)
y_predicted= model.predict(test_X)
print(f"accuracy score: {(accuracy_score(test_y, y_predicted) * 100):.2f}\n")
print("confusion matrix: ")
confusion_matrix(test_y, y_predicted, labels=["beat0", "beat1", "beat2", "beat3", "beat4"])

accuracy score: 90.11

confusion matrix: 


array([[17876,    28,   179,     1,    34],
       [  405,   127,    24,     0,     0],
       [ 1017,     4,   403,     0,    24],
       [  138,     0,    24,     0,     0],
       [  267,     0,    20,     0,  1321]])

In [11]:
model = SVC() # default kernel = rbf
model.fit(train_X, train_y)
y_predicted= model.predict(test_X)
print(f"accuracy score: {(accuracy_score(test_y, y_predicted) * 100):.2f}\n")
confusion_matrix(test_y, y_predicted)

accuracy score: 95.55



array([[18024,    57,    31,     0,     6],
       [  277,   275,     2,     0,     2],
       [  288,     2,  1150,     2,     6],
       [  123,     0,    12,    27,     0],
       [  158,     0,     8,     0,  1442]])

In [12]:
model = RandomForestClassifier()
model.fit(train_X, train_y)
y_predicted= model.predict(test_X)
print(f"accuracy score: {(accuracy_score(test_y, y_predicted) * 100):.2f}\n")
print("confusion matrix: ")
confusion_matrix(test_y, y_predicted, labels=["beat0", "beat1", "beat2", "beat3", "beat4"])

accuracy score: 96.98

confusion matrix: 


array([[18043,    36,    32,     0,     7],
       [  250,   301,     4,     0,     1],
       [  155,     0,  1278,     8,     7],
       [   59,     0,    15,    88,     0],
       [   85,     0,     3,     0,  1520]])

In [13]:
model = GradientBoostingClassifier()
model.fit(train_X, train_y)
y_predicted= model.predict(test_X)
print(f"accuracy score: {(accuracy_score(test_y, y_predicted) * 100):.2f}\n")
print("confusion matrix: ")
confusion_matrix(test_y, y_predicted, labels=["beat0", "beat1", "beat2", "beat3", "beat4"])

accuracy score: 95.67

confusion matrix: 


array([[17985,    48,    49,    11,    25],
       [  282,   260,     4,     0,    10],
       [  247,     5,  1170,    15,    11],
       [   87,     0,    14,    61,     0],
       [  131,     0,     8,     1,  1468]])

In [14]:
# normalize data to help gradient descent converge quicker
train_X = StandardScaler().fit_transform(train_X)
test_X = StandardScaler().fit_transform(test_X)

In [16]:
model = MLPClassifier(max_iter=500)
model.fit(train_X, train_y)
y_predicted= model.predict(test_X)
print(f"accuracy score: {(accuracy_score(test_y, y_predicted) * 100):.2f}\n")
print("confusion matrix: ")
confusion_matrix(test_y, y_predicted, labels=["beat0", "beat1", "beat2", "beat3", "beat4"])

accuracy score: 97.22

confusion matrix: 


array([[17943,    68,    54,    12,    41],
       [  209,   340,     3,     3,     1],
       [   97,     7,  1313,    15,    16],
       [   35,     0,    11,   116,     0],
       [   28,     4,     5,     0,  1571]])