In [112]:
import pandas as pd
import numpy as np
import pywt
from scipy.signal import savgol_filter
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import re
from sklearn.metrics import confusion_matrix

In [113]:
annotation_classes_map = {
    'N': 'Normal beat',
    'L': 'Left bundle branch block beat',
    'R': 'Right bundle branch block beat',
    'V': 'Premature ventricular contraction',
    '/': 'Paced beat',
    'f': 'Fusion of paced and normal beat',
}

In [119]:
# Function to remove baseline wander using DWT
def remove_baseline_wander(signal, wavelet="db4", level=8):
    coeffs = pywt.wavedec(signal, wavelet, level=level)
    coeffs[-1] = np.zeros_like(coeffs[-1])
    return pywt.waverec(coeffs, wavelet)

# Function to smooth signal using Savitzky-Golay filter
def smooth_signal(signal, window_length=11, polyorder=3):
    return savgol_filter(signal, window_length, polyorder)

# Function to segment the ECG signal around the R-peaks
def segment_ecg_signal(ecg_signal, annotations_df, r_peak_offset=150, segment_length=251):
    segments = []
    for _, row in annotations_df.iterrows():
        r_peak = row['Sample #']
        start = r_peak - r_peak_offset
        end = start + segment_length
        if start >= 0 and end <= len(ecg_signal):
            segment = ecg_signal[start:end]
            segments.append(segment)
    return segments

# Function to extract features using DWT
def extract_dwt_features(segments, wavelet='db4', level=4):
    features = []
    for segment in segments:
        coeffs = pywt.wavedec(segment, wavelet, level=level)
        features.append(np.concatenate(coeffs))
    return features

# Parse annotations
def pars3e_annotations(annotations):
    annotation_pattern = re.compile(r'\s*(\d+:\d+\.\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(\(.*\))?')
    parsed_annotations = []
    for line in annotations:
        match = annotation_pattern.match(line)
        if match:
            time, sample_number, annotation_type, sub, chan, num, aux = match.groups()
            parsed_annotations.append((time, int(sample_number), annotation_type, int(sub), int(chan), int(num), aux))
    return pd.DataFrame(parsed_annotations, columns=['Time', 'Sample #', 'Type', 'Sub', 'Chan', 'Num', 'Aux'])

def parse_annotations(annotations):
    annotation_pattern = re.compile(r'\s*(\d+:\d+\.\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s*(\(.*\))?')
    parsed_annotations = []
    for line in annotations:
        match = annotation_pattern.match(line)
        if match:
            time, sample_number, annotation_type, sub, chan, num, aux = match.groups()
            # Map the annotation type to your classes of interest, or 'Other' if not found
            annotation_type_mapped = annotation_type if annotation_type in annotation_classes_map else 'Other' #annotation_classes_map.get(annotation_type, 'Other')
            parsed_annotations.append((time, int(sample_number), annotation_type_mapped, int(sub), int(chan), int(num), aux))
    return pd.DataFrame(parsed_annotations, columns=['Time', 'Sample #', 'Type', 'Sub', 'Chan', 'Num', 'Aux'])

# Function to only rename labels without a corresponding key in annotation_map
def rename_unmapped_labels(labels, annotation_map):
    # Check if the label is in annotation_map, if not replace it with 'Other'
    return np.array([label if label in annotation_classes_map else 'Other' for label in labels])

In [47]:
patient_ids = ['100', '101', '102', '103', '104', '105', '106', '107', '108', '109']

# Initialize empty lists to hold the aggregated features and labels
all_features = []
all_labels = []

for patient_id in patient_ids:
    # Load the ECG data and annotations for each patient
    ecg_data = pd.read_csv(f'./datasets/{patient_id}.csv')
    
    with open(f'./datasets/{patient_id}annotations.txt', 'r') as file:
        annotations = file.readlines()

    annotations_df = parse_annotations(annotations)
    ecg_data.columns = ecg_data.columns.str.strip("'")  # Adjust as needed
    ecg_data['ECG_baseline_removed'] = remove_baseline_wander(ecg_data['MLII'])  # Adjust the column name as needed
    ecg_data['ECG_smoothed'] = smooth_signal(ecg_data['ECG_baseline_removed'])
    ecg_segments = segment_ecg_signal(ecg_data['ECG_smoothed'], annotations_df)
    ecg_features = extract_dwt_features(ecg_segments)

    # Prepare the labels
    feature_matrix = np.array(ecg_features)
    labels = rename_unmapped_labels(annotations_df['Type'].values, annotation_classes_map)

    # Ensure that the number of labels matches the number of feature vectors
    min_length = min(len(labels), len(feature_matrix))
    feature_matrix = feature_matrix[:min_length]
    labels = labels[:min_length]

    # Append the features and labels to the aggregate lists
    all_features.append(feature_matrix)
    all_labels.append(labels)

# Combine all features and labels into single arrays
all_features = np.vstack(all_features)
all_labels = np.concatenate(all_labels)

# Data splitting
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.15, random_state=42)

# Data standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM training
svm_classifier = SVC(kernel='rbf', gamma='scale')
svm_classifier.fit(X_train_scaled, y_train)

# Prediction and evaluation
y_pred = svm_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Generate the classification report
unique_labels = np.unique(np.concatenate((y_test, y_pred)))  # Unique labels in y_test and y_pred
target_names = [annotation_classes_map.get(label, 'Other') for label in unique_labels]  # Corresponding target names

classification_rep = classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names)

# Print accuracy and classification report
print(f'Accuracy: {accuracy}')
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9176007270524084
Classification Report:
                                   precision    recall  f1-score   support

                       Paced beat       0.89      0.95      0.92       804
    Left bundle branch block beat       0.99      0.98      0.98       385
                      Normal beat       0.92      0.99      0.95      1866
                            Other       0.00      0.00      0.00        55
Premature ventricular contraction       0.84      0.36      0.51       102
  Fusion of paced and normal beat       0.38      0.07      0.11        89

                         accuracy                           0.92      3301
                        macro avg       0.67      0.56      0.58      3301
                     weighted avg       0.89      0.92      0.90      3301



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [120]:
# Load the data
with open('./datasets/111annotations.txt', 'r') as file:
    annotations = file.readlines()
ecg_data = pd.read_csv('./datasets/111.csv')
annotations_df = parse_annotations(annotations)

# Correct the column names by removing extra quotation marks
ecg_data.columns = ecg_data.columns.str.strip("'")

# Sampling rate and duration for 5 minutes
sampling_rate = 360  # samples per second for MIT-BIH
duration = 5 * 60  # 5 minutes in seconds
num_samples = duration * sampling_rate

# Extract a 5-minute portion of the ECG data
start_sample = 0  # You can adjust this as needed
ecg_portion = ecg_data.iloc[start_sample:start_sample + num_samples].copy()
ecg_portion['ECG_baseline_removed'] = remove_baseline_wander(ecg_portion['MLII'])
ecg_portion['ECG_smoothed'] = smooth_signal(ecg_portion['ECG_baseline_removed'])

# Filter annotations to only include those within the 5-minute window
annotations_portion = annotations_df[(annotations_df['Sample #'] >= start_sample) & 
                                     (annotations_df['Sample #'] < start_sample + num_samples)]

# Segmentation and feature extraction on the 5-minute portion
ecg_segments = segment_ecg_signal(ecg_portion['ECG_smoothed'], annotations_portion)
ecg_features = extract_dwt_features(ecg_segments)

# Feature matrix and labels for the 5-minute portion
feature_matrix = np.array(ecg_features)
labels = annotations_portion['Type'].values[:len(ecg_features)]  # Ensuring labels align with the segments
print(pd.Series(labels).value_counts())
#labels = rename_unmapped_labels(labels, annotation_classes_map)

print()

L        347
Other      5
Name: count, dtype: int64



In [121]:
with open('datasets/109annotations.txt', 'r') as file:
    annotations_109 = file.readlines()
ecg_data_109 = pd.read_csv('datasets/109.csv')
annotations_df_109 = parse_annotations(annotations_109)

ecg_data_109.columns = ecg_data_109.columns.str.strip("'")

ecg_portion109 = ecg_data_109.iloc[start_sample:start_sample + num_samples].copy()
ecg_portion109['ECG_baseline_removed'] = remove_baseline_wander(ecg_portion109['MLII'])
ecg_portion109['ECG_smoothed'] = smooth_signal(ecg_portion109['ECG_baseline_removed'])

annotations_portion109 = annotations_df_109[(annotations_df_109['Sample #'] >= start_sample) & 
                                     (annotations_df_109['Sample #'] < start_sample + num_samples)]

# Segmentation and feature extraction on the 5-minute portion
ecg_segments109 = segment_ecg_signal(ecg_portion109['ECG_smoothed'], annotations_portion109)
ecg_features109 = extract_dwt_features(ecg_segments109)

# Feature matrix and labels for the 5-minute portion
feature_matrix109 = np.array(ecg_features109)
labels109 = annotations_portion109['Type'].values[:len(ecg_features109)]  # Ensuring labels align with the segments
#labels109 = rename_unmapped_labels(labels109, annotation_classes_map)

print(pd.Series(labels109).value_counts())
print()

L        422
V          6
Other      3
Name: count, dtype: int64



In [122]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(feature_matrix)
X_test_scaled = scaler.transform(feature_matrix109)

# SVM training with patient 111's data
svm_classifier = SVC(kernel='rbf', gamma='scale')
svm_classifier.fit(X_train_scaled, labels)

# Prediction and evaluation
y_pred = svm_classifier.predict(X_test_scaled)
accuracy = accuracy_score(labels109, y_pred)

# Generate the classification report for patient 109's data
classification_rep_109 = classification_report(labels109, y_pred, labels=np.unique(labels109), target_names=[annotation_classes_map.get(label, 'Other') for label in np.unique(labels109)], zero_division=0)

print(f'Accuracy for patient 109: {accuracy}')
print(classification_rep_109)

cm = confusion_matrix(labels109, y_pred, labels=unique_labels)
print()

# Calculate additional metrics from the confusion matrix if needed
TP = np.diag(cm)
FP = cm.sum(axis=0) - TP
FN = cm.sum(axis=1) - TP
TN = cm.sum() - (FP + FN + TP)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP / (TP + FN)
# Precision or positive predictive value
PPV = TP / (TP + FP)
# Overall accuracy for each class
ACC = (TP + TN) / (TP + FP + FN + TN)

# Print additional metrics
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"True Negatives (TN): {TN}")
print(f"Sensitivity (TPR): {TPR}")
print(f"Precision (PPV): {PPV}")
print(f"Accuracy (ACC): {ACC}\n\n\n")

Accuracy for patient 109: 0.9791183294663574
                                   precision    recall  f1-score   support

    Left bundle branch block beat       0.98      1.00      0.99       422
                            Other       0.00      0.00      0.00         3
Premature ventricular contraction       0.00      0.00      0.00         6

                         accuracy                           0.98       431
                        macro avg       0.33      0.33      0.33       431
                     weighted avg       0.96      0.98      0.97       431


True Positives (TP): [422   0]
False Positives (FP): [3 0]
False Negatives (FN): [0 3]
True Negatives (TN): [  0 422]
Sensitivity (TPR): [1. 0.]
Precision (PPV): [0.99294118        nan]
Accuracy (ACC): [0.99294118 0.99294118]





  PPV = TP / (TP + FP)
