# Random Forest with FFT & Wavelet features

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import scipy.io
import numpy as np
import pandas as pd
import pywt
import matplotlib.pyplot as plt

from scipy.signal import butter, filtfilt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from itertools import product
from scipy.fft import fft

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score


In [None]:
!ls /content/drive/MyDrive/T2/'AI in Healthcare'/Project/Data

RECORDS  REFERENCE.csv	REFERENCE-original.csv	sample1_for_test  training2017	training2017_csv


In [None]:
data_path= '/content/drive/MyDrive/T2/AI in Healthcare/Project/Data/training2017_csv/full_ecg_with_answers.csv'

In [None]:
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,File_ID,Num_Leads,Sampling_Rate,Num_Samples,Gain,Offset,Signal_Type,ECG_Signal,Label
0,A08031,1,300,18000,1000.0,254,ECG,"[254, 259, 264, 245, 215, 181, 152, 133, 152, ...",N
1,A08033,1,300,9000,1000.0,216,ECG,"[216, 264, 315, 369, 402, 414, 421, 424, 424, ...",N
2,A08035,1,300,9000,1000.0,-403,ECG,"[-403, -510, -603, -683, -754, -803, -819, -82...",~
3,A08032,1,300,9000,1000.0,-230,ECG,"[-230, -282, -336, -396, -419, -426, -429, -43...",N
4,A08029,1,300,9000,1000.0,908,ECG,"[908, 1130, 1371, 1618, 1842, 2025, 2098, 2146...",N


In [None]:
### Functions ###

def pad_or_truncate(signal, target_length):
    if len(signal) > target_length:
        return signal[:target_length]
    elif len(signal) < target_length:
        return np.pad(signal, (0, target_length - len(signal)), 'constant')
    return signal

def apply_fft(signal):
    fft_values = np.abs(np.fft.fft(signal))[:len(signal)//2]  # take only positive frequencies
    return fft_values

def compute_derivative(signal):
    return np.diff(signal, prepend=signal[0])

def standardize_signal_column(signals):
    scaler = StandardScaler()
    return scaler.fit_transform(signals)

def extract_wavelet_energy(signal, wavelet='db4', level=5):
    coeffs = pywt.wavedec(signal, wavelet, level=level)
    energy = np.array([np.sum(c ** 2) for c in coeffs])
    return energy

In [None]:
### Pre-processing & Feature engineering ###

# Drop metadata columns
metadata_cols = ['File_ID', 'Num_Leads', 'Sampling_Rate', 'Num_Samples', 'Gain', 'Offset', 'Signal_Type']
df = df.drop(columns=metadata_cols)

# Convert string to array
df['ECG_Signal'] = df['ECG_Signal'].apply(lambda x: np.array(eval(x)))

# Convert label to numeric
label_mapping = {'N': 0, 'A': 1, 'O': 2, '~': 3}
df['Label_numeric'] = df['Label'].map(label_mapping)

# Pad to 18000
df['ECG_Signal'] = df['ECG_Signal'].apply(lambda x: pad_or_truncate(x, 18000))

# Add FFT
df['ECG_FFT'] = df['ECG_Signal'].apply(apply_fft)
df['ECG_FFT'] = df['ECG_FFT'].apply(lambda x: pad_or_truncate(x, 18000))

# Add Derivative
df['ECG_Deriv'] = df['ECG_Signal'].apply(compute_derivative)
df['ECG_Deriv'] = df['ECG_Deriv'].apply(lambda x: pad_or_truncate(x, 18000))

# Extract features: FFT + Wavelet
fft_features = np.stack(df['ECG_FFT'].values)
wt_features = np.stack(df['ECG_Signal'].apply(lambda x: extract_wavelet_energy(x)).values)

# Combine features: use first 100 FFT coeffs for dimensionality
X = np.hstack([fft_features[:, :100], wt_features])
y = df['Label_numeric'].values # label

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [None]:
### Model Training ###

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42)

# Random Forest Model
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=14,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42
)

rf.fit(X_train, y_train)

# Evaluation
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Normal', 'AF', 'Other', 'Noisy']))

              precision    recall  f1-score   support

      Normal       0.73      0.71      0.72      1031
          AF       0.76      0.94      0.84      1031
       Other       0.78      0.60      0.68      1031
       Noisy       0.97      1.00      0.98      1031

    accuracy                           0.81      4124
   macro avg       0.81      0.81      0.81      4124
weighted avg       0.81      0.81      0.81      4124

