In [5]:
import numpy as np
import matplotlib.pyplot as plt
from biosppy.signals import ecg
import scipy.signal as ss
from neurokit2 import ecg_delineate
from tqdm import tqdm
import pandas as pd

# Load data

In [11]:
# Load data
def load_data(train_path, test_path):
    train = pd.read_csv(train_path, index_col="id")
    test = pd.read_csv(test_path, index_col="id")
    return train, test

# Handle NaNs and normalize
def preprocess_signals(data):
    # Identify signal columns
    signal_cols = [col for col in data.columns if col.startswith('x')]

    # Normalize while preserving NaNs
    data[signal_cols] = data[signal_cols].apply(
        lambda x: (x - x.mean(skipna=True)) / x.std(skipna=True), axis=1
    )

    # Replace NaNs with 0 after normalization
    data[signal_cols] = data[signal_cols].fillna(0)

    return data

# Load train and test data
train_path = "data/train_inversion_nkecg_invert_NaN.csv"
test_path = "data/test_inversion_nkecg_invert_NaN.csv"
train, test = load_data(train_path, test_path)

# Preprocess signals  (already done if inversion file)
train = preprocess_signals(train)
test = preprocess_signals(test)

# Verify
print(f"Train shape: {train.shape}, Test shape: {test.shape}")


Train shape: (5117, 17808), Test shape: (3411, 17807)


# FEATURE EXTRACTION + TRAINING

In [12]:
def extract_features(signal, sampling_rate=300):
    features = {}
    try:
        # Patch scipy.signal.hamming only if it doesn't exist
        if not hasattr(ss, "hamming"):
            ss.hamming = np.hamming
            
        # Patch scipy.signal.boxcar if missing
        if not hasattr(ss, "boxcar"):
            def boxcar(M):
                return np.ones(M)
            ss.boxcar = boxcar
                    
        # Process the ECG signal
        _, filtered, rpeaks, _, _, _, heart_rate = ecg.ecg(signal, sampling_rate=sampling_rate, show=False)
        
        # Proper unpacking of ecg_delineate
        delineation_df, delineation_dict = ecg_delineate(filtered, rpeaks, sampling_rate, method="cwt")
        
        # Time-domain Features
        r_times = rpeaks / sampling_rate
        rr_intervals = np.diff(r_times)  # RR intervals
        features["mean_rr"] = np.mean(rr_intervals) if len(rr_intervals) > 0 else 0
        features["std_rr"] = np.std(rr_intervals) if len(rr_intervals) > 0 else 0
        features["min_rr"] = np.min(rr_intervals) if len(rr_intervals) > 0 else 0
        features["max_rr"] = np.max(rr_intervals) if len(rr_intervals) > 0 else 0

        # Amplitudes (R peaks)
        r_amplitudes = filtered[rpeaks]
        features["mean_r_amp"] = np.mean(r_amplitudes) if len(r_amplitudes) > 0 else 0
        features["std_r_amp"] = np.std(r_amplitudes) if len(r_amplitudes) > 0 else 0

        # Q peaks
        q_peaks = delineation_dict.get("ECG_Q_Peaks", [])
        q_peaks = [int(p) for p in q_peaks if not np.isnan(p) and int(p) < len(filtered)]  # Filter valid indices
        q_amplitudes = filtered[q_peaks] if len(q_peaks) > 0 else np.array([])
        features["mean_q_amp"] = np.mean(q_amplitudes) if len(q_amplitudes) > 0 else 0

        # QRS Duration
        s_peaks = delineation_dict.get("ECG_S_Peaks", [])
        s_peaks = [int(p) for p in s_peaks if not np.isnan(p) and int(p) < len(filtered)]  # Filter valid indices

        # Align Q and S peaks to the same length
        if len(q_peaks) > 0 and len(s_peaks) > 0:
            min_length = min(len(q_peaks), len(s_peaks))
            q_peaks_aligned = np.array(q_peaks[:min_length])
            s_peaks_aligned = np.array(s_peaks[:min_length])
            qrs_durations = s_peaks_aligned - q_peaks_aligned
            qrs_durations = qrs_durations / sampling_rate  # Convert to seconds
            features["mean_qrs_dur"] = np.mean(qrs_durations) if len(qrs_durations) > 0 else 0
        else:
            features["mean_qrs_dur"] = 0

        # Frequency Domain Features
        freq = np.fft.rfftfreq(len(signal), 1 / sampling_rate)
        spectrum = np.abs(np.fft.rfft(signal))
        features["fft_peak_freq"] = freq[np.argmax(spectrum)]
        features["fft_mean"] = np.mean(spectrum)
        features["fft_std"] = np.std(spectrum)

    except Exception as e:
        print(f"Error processing signal: {e}")
        # Fill all features with zeros if an error occurs
        features = {k: 0 for k in ["mean_rr", "std_rr", "min_rr", "max_rr",
                                   "mean_r_amp", "std_r_amp", "mean_q_amp",
                                   "mean_qrs_dur",
                                   "fft_peak_freq", "fft_mean", "fft_std"]}

    return features

In [13]:
from joblib import Parallel, delayed
from tqdm import tqdm

def process_row(row, signal_cols, sampling_rate):
    """
    Process a single row to extract features.

    Parameters:
        row (pd.Series): Single row of the DataFrame.
        signal_cols (list): List of signal column names.
        sampling_rate (int): Sampling rate for feature extraction.

    Returns:
        dict: Extracted features for the row.
    """
    signal = row[signal_cols].to_numpy(dtype="float32")
    features = extract_features(signal, sampling_rate=sampling_rate)  # Use your existing `extract_features` function
    features["id"] = row.name
    if "y" in row:
        features["label"] = row["y"]
    return features


def extract_features_from_df_parallel(data, sampling_rate=300, n_jobs=-1):
    """
    Extract features from a DataFrame in parallel.

    Parameters:
        data (pd.DataFrame): DataFrame containing signal data.
        sampling_rate (int): Sampling rate for feature extraction.
        n_jobs (int): Number of CPU cores to use (-1 for all cores).

    Returns:
        pd.DataFrame: DataFrame of extracted features.
    """
    signal_cols = [col for col in data.columns if col.startswith('x')]

    # Use Parallel for efficient processing
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_row)(row, signal_cols, sampling_rate)
        for _, row in tqdm(data.iterrows(), total=len(data), desc="Extracting Features in Parallel")
    )

    return pd.DataFrame(results)


# Extract features for train and test
train_features = extract_features_from_df_parallel(train, sampling_rate=300)
test_features = extract_features_from_df_parallel(test, sampling_rate=300)


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

In [15]:
print(f"Train features shape: {train_features.shape}, Test features shape: {test_features.shape}")

Train features shape: (5117, 13), Test features shape: (3411, 12)


# PIPELINE

In [36]:
from sklearn.ensemble import VotingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

# Prepare data for modeling
X = train_features.drop(columns=["id", "label"])  # Drop 'id' and 'label' to get only features
y = train_features["label"]  # Target variable

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define SVM pipeline with best parameters
svm_pipeline = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=6.5, probability=True, random_state=42)  # Updated SVM with optimal C
)

# Define Gradient Boosting pipeline with best parameters
gb_pipeline = make_pipeline(
    StandardScaler(),
    HistGradientBoostingClassifier(
        learning_rate=0.035,  # Updated learning rate
        max_iter=170,         # Updated max iterations
        random_state=42
    )
)

# Define XGBoost pipeline with best parameters
xgb_pipeline = make_pipeline(
    StandardScaler(),
    XGBClassifier(
        learning_rate=0.25,  # Updated learning rate
        max_depth=4,         # Updated max depth
        eval_metric="mlogloss",  # Disable warning for evaluation metric
        random_state=42
    )
)

# Create the ensemble
ensemble = VotingClassifier(
    estimators=[
        ('svm', svm_pipeline),
        ('gb', gb_pipeline),
        ('xgb', xgb_pipeline)
    ],
    voting='soft'  # Use soft voting to average probabilities
)

# Cross-validation
print("Performing cross-validation...")
cv_scores = cross_val_score(ensemble, X_train, y_train, cv=5, scoring="f1_micro")
print(f"CV F1 Scores: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Train and evaluate
print("Fitting ensemble on training data...")
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_val)
f1 = f1_score(y_val, y_pred, average="micro")
print(f"Validation F1 Score: {f1:.4f}")

# Adjust test_features to drop the extra 'y' column
test_features_cleaned = test_features.drop(columns=["y"], errors="ignore")  # Drop 'y' if it exists

# Predict on test set
print("Predicting on test set...")
X_test = test_features_cleaned.drop(columns=["id"])  # Features only
test_predictions = test_features_cleaned[["id"]].copy()  # Preserve 'id' column for submission
test_predictions["y"] = ensemble.predict(X_test)  # Add predictions to the copied DataFrame

# Save submission
submission = test_predictions[["id", "y"]]
submission.to_csv("out/submission19_ensemble_SVM_GB_XGB_Voting_GRIDSEARCHBEST_nkecg_invert_normal_then_0.csv", index=False)
print("Submission file created!")


Performing cross-validation...
CV F1 Scores: 0.7879 ± 0.0239
Fitting ensemble on training data...
Validation F1 Score: 0.7861
Predicting on test set...
Submission file created!


CV F1 Scores: 0.9444 ± 0.0150
Validation F1 Score: 0.7715
Submission file created!

# Hyperparameter Tuning with Grid Search

In [35]:
from sklearn.ensemble import VotingClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import numpy as np

# Prepare data for modeling
X = train_features.drop(columns=["id", "label"])  # Drop 'id' and 'label' to get only features
y = train_features["label"]  # Target variable

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define individual pipelines
svm_pipeline = make_pipeline(
    StandardScaler(),
    SVC(probability=True, random_state=42)
)

gb_pipeline = make_pipeline(
    StandardScaler(),
    HistGradientBoostingClassifier(random_state=42)
)

xgb_pipeline = make_pipeline(
    StandardScaler(),
    XGBClassifier(eval_metric="mlogloss", random_state=42)
)

# Further refined parameter grid
param_grid = {
    'svm__svc__C': [6.5, 7.0, 7.5],  # Narrowing down around 7.0
    'gb__histgradientboostingclassifier__learning_rate': [0.03, 0.035, 0.04],  # Narrow range around 0.035
    'gb__histgradientboostingclassifier__max_iter': [160, 170, 180],  # Tight range around 170
    'xgb__xgbclassifier__max_depth': [3, 4, 5],  # Same range as before
    'xgb__xgbclassifier__learning_rate': [0.24, 0.25, 0.26],  # Narrow range around 0.25
}

# Create the ensemble
ensemble = VotingClassifier(
    estimators=[
        ('svm', svm_pipeline),
        ('gb', gb_pipeline),
        ('xgb', xgb_pipeline),
    ],
    voting='soft'  # Use soft voting to average probabilities
)

# Perform Grid Search with verbose output
grid_search = GridSearchCV(
    estimator=ensemble,
    param_grid=param_grid,
    scoring="f1_micro",
    cv=3,
    verbose=3,  # Verbose for detailed output
    n_jobs=-1  # Use all available CPUs
)

# Fit grid search
print("Starting Grid Search...")
grid_search.fit(X_train, y_train)

# Print best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")

# Evaluate on validation set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
f1 = f1_score(y_val, y_pred, average="micro")
print(f"Validation F1 Score with Best Model: {f1:.4f}")


Starting Grid Search...
Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV 3/3] END gb__histgradientboostingclassifier__learning_rate=0.035, gb__histgradientboostingclassifier__max_iter=160, svm__svc__C=12.0, xgb__xgbclassifier__learning_rate=0.28, xgb__xgbclassifier__max_depth=3;, score=0.799 total time=   1.9s
[CV 2/3] END gb__histgradientboostingclassifier__learning_rate=0.03, gb__histgradientboostingclassifier__max_iter=150, svm__svc__C=7.0, xgb__xgbclassifier__learning_rate=0.25, xgb__xgbclassifier__max_depth=3;, score=0.783 total time=   1.9s
[CV 1/3] END gb__histgradientboostingclassifier__learning_rate=0.03, gb__histgradientboostingclassifier__max_iter=150, svm__svc__C=7.0, xgb__xgbclassifier__learning_rate=0.25, xgb__xgbclassifier__max_depth=5;, score=0.762 total time=   1.8s
[CV 2/3] END gb__histgradientboostingclassifier__learning_rate=0.03, gb__histgradientboostingclassifier__max_iter=150, svm__svc__C=7.0, xgb__xgbclassifier__learning_rate=0.27, xgb__xgbclas

Best Parameters: {'gb__histgradientboostingclassifier__learning_rate': 0.035, 'gb__histgradientboostingclassifier__max_iter': 170, 'svm__svc__C': 6.5, 'xgb__xgbclassifier__learning_rate': 0.25, 'xgb__xgbclassifier__max_depth': 4}
Best CV Score: 0.7909

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 3274, number of used features: 11
[LightGBM] [Info] Start training from score -0.519209
[LightGBM] [Info] Start training from score -2.437776
[LightGBM] [Info] Start training from score -1.251084
[LightGBM] [Info] Start training from score -3.459039
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2805
[LightGBM] [Info] Number of data points in the train set: 3274, number of used features: 11
[LightGBM] [Info] Start training from score -0.519209
[LightGBM] [Info] Start training from score -2.441279
[LightGBM] [Info] Start training from score -1.251084
[LightGBM] [Info] Start training from score -3