In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the refined CUDB feature table.
# Raw string: the Windows path is full of backslashes, which a normal string
# literal would parse as (invalid) escape sequences.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv(r"D:\Capstone\Databases\ML CSV\CUDB\Refined\TCI_TCSC_MEA_STE_MAV\CUDB_refined_without_TCI.csv")

In [None]:
# Count missing values per column.
df.isna().sum()

TCSC      0
MAV       0
STE       0
MEA       0
Rhythm    0
dtype: int64

In [None]:
# Number of rows per Rhythm label in the full data set.
df.loc[:, "Rhythm"].value_counts()

Rhythm
0    2786
1     749
Name: count, dtype: int64

In [None]:
# List the column names of the loaded frame.
df.columns

Index(['TCSC', 'MAV', 'STE', 'MEA', 'Rhythm'], dtype='object')

# Random forest classifier

In [None]:
# Split the rows by Rhythm label.
class_0 = df.loc[df["Rhythm"].eq(0)]
class_1 = df.loc[df["Rhythm"].eq(1)]

# Randomly draw as many class-0 rows as there are class-1 rows.
# The fixed seed makes the same rows come out on every run.
class_0_undersampled = class_0.sample(n=class_1.shape[0], random_state=42)

# Stack the two equally sized groups: sampled class 0 first, then class 1.
balanced_df = pd.concat((class_0_undersampled, class_1))

# Row shuffling is left disabled here:
#balanced_df = balanced_df.sample(frac=1, random_state=42)

In [None]:
# (rows, columns) of the balanced frame.
balanced_df.shape

(1498, 5)

In [None]:
# Check that both Rhythm classes now have the same number of rows.
class_balance = balanced_df["Rhythm"].value_counts()
print(class_balance)

Rhythm
0    749
1    749
Name: count, dtype: int64


In [None]:
# Features are every column except the target; the target is the Rhythm column.
# Selecting by label instead of by position keeps the split correct even if
# the column order of the CSV changes.
X = balanced_df.drop(columns="Rhythm")
y = balanced_df["Rhythm"]

In [None]:
# Class counts of the target vector taken from the balanced frame.
y.value_counts()

Rhythm
0    749
1    749
Name: count, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the balanced rows for testing.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Show the shapes of the four split pieces, one per line:
# test features, test target, train features, train target.
print(X_test.shape, y_test.shape, X_train.shape, y_train.shape, sep="\n")

(300, 4)
(300,)
(1198, 4)
(1198,)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the fitted model, and every metric computed from it,
# is repeatable across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Importance of each feature in the fitted forest, one value per column of X_train, in column order.
clf.feature_importances_

array([0.20523931, 0.28807358, 0.27256354, 0.23412357])

In [None]:
# Column names of the balanced frame; all but the last (Rhythm) are the
# features X was built from.
balanced_df.columns

Index(['TCSC', 'MAV', 'STE', 'MEA', 'Rhythm'], dtype='object')

In [None]:
# Predict a Rhythm label for every held-out test row.
y_pred_test = clf.predict(X_test)

In [None]:
# Sanity check: the number of predictions should equal the number of test rows.
y_pred_test.shape

(300,)

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels (y_test), columns are the predicted labels.
confusion_matrix(y_test, y_pred_test)

array([[124,  34],
       [ 24, 118]], dtype=int64)

In [None]:
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens
# to be symmetric, but the correct order matches the other metric calls.
accuracy_score(y_test, y_pred_test)

0.8066666666666666

In [None]:
#from sklearn.model_selection import cross_val_score
#cross_val_score(clf, X_train, y_train, cv = 10)

In [None]:
from sklearn.metrics import classification_report
# Ground truth first, predictions second. With the arguments reversed the
# report swaps precision and recall and counts "support" from the predicted
# labels instead of the true ones.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.78      0.84      0.81       148
           1       0.83      0.78      0.80       152

    accuracy                           0.81       300
   macro avg       0.81      0.81      0.81       300
weighted avg       0.81      0.81      0.81       300



# Compare ML algorithms


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier

In [None]:
# Seed passed to every estimator below that has a random component, so the
# score comparison is repeatable from run to run.
SEED = 42

# `names` and `classifiers` are parallel lists: names[i] labels classifiers[i].
names = [
    "Nearest_Neighbors",
    "Linear_SVM",
    "Polynomial_SVM",
    "RBF_SVM",
    "Gaussian_Process",
    "Gradient_Boosting",
    "Decision_Tree",
    "Extra_Trees",
    "Random_Forest",
    "Neural_Net",
    "AdaBoost",
    "Naive_Bayes",
    "QDA",
    "SGD",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(kernel="poly", degree=3, C=0.025),
    SVC(kernel="rbf", C=1, gamma=2),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=SEED),
    DecisionTreeClassifier(max_depth=5, random_state=SEED),
    ExtraTreesClassifier(n_estimators=10, min_samples_split=2, random_state=SEED),
    RandomForestClassifier(max_depth=5, n_estimators=100, random_state=SEED),
    MLPClassifier(alpha=1, max_iter=1000, random_state=SEED),
    AdaBoostClassifier(n_estimators=100, random_state=SEED),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    SGDClassifier(loss="hinge", penalty="l2", random_state=SEED),
]

# Guard against the two lists drifting apart when an entry is added or removed.
assert len(names) == len(classifiers), "names and classifiers must line up one-to-one"

In [None]:
# Fit every candidate on the training split and record its accuracy on the
# held-out test split, in the same order as `classifiers`.
# The loop variable is deliberately not called `clf`, so the random forest
# fitted earlier in the notebook is not silently replaced.
scores = []
for model in classifiers:
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))

In [None]:
# Test-set score of each classifier, in the same order as `classifiers`.
scores

[0.7533333333333333,
 0.6833333333333333,
 0.66,
 0.5133333333333333,
 0.77,
 0.7733333333333333,
 0.79,
 0.7933333333333333,
 0.7766666666666666,
 0.5666666666666667,
 0.7833333333333333,
 0.73,
 0.7633333333333333,
 0.6066666666666667]

In [None]:
import seaborn as sns
# Pair each classifier name with its test score in a single table.
df_score = pd.DataFrame({"name": names, "score": scores})
df_score

Unnamed: 0,name,score
0,Nearest_Neighbors,0.753333
1,Linear_SVM,0.683333
2,Polynomial_SVM,0.66
3,RBF_SVM,0.513333
4,Gaussian_Process,0.77
5,Gradient_Boosting,0.773333
6,Decision_Tree,0.79
7,Extra_Trees,0.793333
8,Random_Forest,0.776667
9,Neural_Net,0.566667
