# ECG Heart Disease Detection with Deep Learning

This notebook serves as a comprehensive guide for learning Docker and PyTorch to analyze ECG datasets for heart disease detection using deep learning algorithms.

## Project Overview
- **Dataset**: ~500,000 ECG recordings
- **Task**: Testing different deep learning algorithms for heart disease detection
- **Tools**: Docker, PyTorch, ECG analysis libraries
- **Goal**: Compare performance of various DL models for cardiovascular disease detection

## Learning Objectives
1. Master Docker container management for data science workflows
2. Understand ECG signal processing and analysis
3. Implement multiple deep learning architectures in PyTorch
4. Evaluate and compare model performance for medical diagnosis

## 1. Docker Container Setup and Management

Understanding Docker is essential for reproducible research environments and working with the professor's dataset container.

### Essential Docker Commands for ECG Analysis

Let's learn the key Docker commands you'll need for this project.

In [None]:
# Reference sheet of Docker commands for this project.
# These are shell commands: run them in a terminal / command prompt, not in this cell.
#
#   1. Build the development container image:
#        docker build -t ecg-analysis .
#   2. Run the container with Jupyter notebook (port 8888 published, ./data mounted):
#        docker run -p 8888:8888 -v $(pwd)/data:/app/data ecg-analysis
#   3. Run the container interactively for development:
#        docker run -it -v $(pwd):/app ecg-analysis bash
#   4. List running containers:
#        docker ps
#   5. Stop a container:
#        docker stop <container_id>
#   6. Remove unused containers and images:
#        docker system prune

# Emit the reminder checklist with a single call; sep="\n" puts each entry on
# its own line, exactly as separate print() calls would.
print(
    "Docker setup commands ready!",
    "Remember to:",
    "1. Build the container: docker build -t ecg-analysis .",
    "2. Run with data volume: docker run -p 8888:8888 -v $(pwd)/data:/app/data ecg-analysis",
    "3. Access Jupyter at: http://localhost:8888",
    sep="\n",
)

## 2. Dataset Loading and Exploration

Once you receive the professor's container with the ECG dataset, you'll need to explore its structure and understand the data format.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wfdb
from pathlib import Path

# Candidate locations of the ECG data
DATA_PATH = "/app/data"  # expected location inside the Docker container
LOCAL_DATA_PATH = "./data"  # fallback used for local development

# The container path existing is taken as the sign that we run inside Docker
_in_container = os.path.exists(DATA_PATH)
data_path = DATA_PATH if _in_container else LOCAL_DATA_PATH
print("Running in Docker container" if _in_container else "Running locally")

print(f"Data path: {data_path}")

# Explore the dataset structure
def explore_dataset(data_path):
    """Print a summary of the ECG dataset stored under ``data_path``.

    The directory tree is walked exactly once. The report contains the number
    of files (directories are not counted), a count per common ECG file
    extension (only extensions that occur are listed), and an indented
    directory listing that shows at most five files per directory.

    Args:
        data_path: Root directory of the dataset (``str`` or path-like).

    Returns:
        None. Everything is reported through ``print``.
    """
    if not os.path.isdir(data_path):
        print(f"Data directory {data_path} not found")
        print("This will work once you mount the professor's dataset")
        return

    # Normalising drops a trailing separator, which would otherwise skew the
    # depth computation and produce an empty name for the root directory.
    base = os.path.normpath(data_path)

    formats = ('.dat', '.hea', '.atr', '.csv', '.h5', '.npz')
    format_counts = dict.fromkeys(formats, 0)
    total_files = 0
    tree_lines = []  # buffered so the summary can be printed before the tree

    for root, dirs, files in os.walk(base):
        # In-place sorting makes both the traversal and the listing deterministic
        dirs.sort()
        files.sort()

        total_files += len(files)
        for name in files:
            for fmt in formats:
                if name.endswith(fmt):
                    format_counts[fmt] += 1
                    break

        rel = os.path.relpath(root, base)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 2 * level
        subindent = ' ' * 2 * (level + 1)

        tree_lines.append(f"{indent}{os.path.basename(root)}/")
        tree_lines.extend(f"{subindent}{name}" for name in files[:5])  # first 5 files only
        if len(files) > 5:
            tree_lines.append(f"{subindent}... and {len(files) - 5} more files")

    print("Dataset exploration:")
    print(f"Total files: {total_files}")

    # Report only the formats that are actually present
    for fmt, count in format_counts.items():
        if count:
            print(f"{fmt} files: {count}")

    for line in tree_lines:
        print(line)

# Run the exploration on the path resolved earlier in this cell; when that
# directory does not exist, explore_dataset only prints a notice and returns.
explore_dataset(data_path)

## 3. Data Preprocessing for ECG Signals

ECG signals require specific preprocessing steps to ensure good model performance. These include filtering, normalization, and bringing all signals to a common length.

In [None]:
from scipy import signal
from scipy.signal import butter, filtfilt
import neurokit2 as nk

# ECG Preprocessing Functions

def bandpass_filter(ecg_signal, lowcut=0.5, highcut=40, fs=500, order=4):
    """Apply a zero-phase Butterworth band-pass filter to an ECG signal.

    The filter is designed as second-order sections and applied forward and
    backward with ``sosfiltfilt``. The SOS form is used because the
    transfer-function (b, a) form of a band-pass with a very low normalised
    cutoff (0.5 Hz at 500 Hz here) is numerically fragile. Results can differ
    slightly from a (b, a) ``filtfilt`` implementation.

    Args:
        ecg_signal: Array-like signal; filtering is applied along the last axis.
        lowcut: Lower cutoff frequency, in the same unit as ``fs``.
        highcut: Upper cutoff frequency, in the same unit as ``fs``.
        fs: Sampling frequency.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        The filtered signal, with the same shape as the input.

    Raises:
        ValueError: If the cutoffs do not satisfy
            ``0 < lowcut < highcut < fs / 2``.
    """
    # Function-scope import: sosfiltfilt is not among the file-level imports
    from scipy.signal import sosfiltfilt

    nyquist = 0.5 * fs
    if not 0 < lowcut < highcut < nyquist:
        raise ValueError(
            "Cutoffs must satisfy 0 < lowcut < highcut < fs/2; "
            f"got lowcut={lowcut}, highcut={highcut}, fs={fs}"
        )

    # butter expects critical frequencies normalised to the Nyquist frequency
    band = [lowcut / nyquist, highcut / nyquist]
    sos = butter(order, band, btype='band', output='sos')
    return sosfiltfilt(sos, ecg_signal)

def normalize_ecg(ecg_signal, method='z-score'):
    """Rescale an ECG signal as ``(signal - center) / (scale + 1e-8)``.

    ``method`` selects the center/scale pair:
      * ``'z-score'``: mean and standard deviation
      * ``'min-max'``: minimum and value range (max - min)
      * ``'robust'``:  median and median absolute deviation

    Raises:
        ValueError: If ``method`` is not one of the three names above.
    """
    eps = 1e-8  # keeps the division finite when the scale is zero

    if method == 'z-score':
        center, scale = np.mean(ecg_signal), np.std(ecg_signal)
    elif method == 'min-max':
        center = np.min(ecg_signal)
        scale = np.max(ecg_signal) - center
    elif method == 'robust':
        center = np.median(ecg_signal)
        scale = np.median(np.abs(ecg_signal - center))
    else:
        raise ValueError("Method must be 'z-score', 'min-max', or 'robust'")

    return (ecg_signal - center) / (scale + eps)

def pad_or_truncate(signal, target_length=5000):
    """Return ``signal`` adjusted to exactly ``target_length`` samples.

    Longer inputs are cropped to their central window; shorter inputs are
    zero-padded on both sides. An input that already has the target length
    is returned unchanged.
    """
    surplus = len(signal) - target_length

    if surplus == 0:
        return signal

    if surplus > 0:
        # Too long: keep the central window
        first = surplus // 2
        return signal[first:first + target_length]

    # Too short: split the zeros between both ends; an odd remainder goes right
    missing = -surplus
    left = missing // 2
    right = missing - left
    return np.pad(signal, (left, right), mode='constant', constant_values=0)

def detect_r_peaks(ecg_signal, fs=500):
    """Detect R-peaks in an ECG signal using NeuroKit2.

    Args:
        ecg_signal: ECG samples, passed to ``nk.ecg_clean``.
        fs: Sampling rate, forwarded to NeuroKit2 as ``sampling_rate``.

    Returns:
        Tuple ``(r_peaks, cleaned_ecg)``. ``r_peaks`` is the ``'ECG_R_Peaks'``
        entry of the first value returned by ``nk.ecg_peaks`` and
        ``cleaned_ecg`` is the output of ``nk.ecg_clean``. If either call
        fails, a warning is printed and ``([], ecg_signal)`` is returned.
    """
    try:
        # Clean the signal first, then locate the peaks on the cleaned trace
        cleaned_ecg = nk.ecg_clean(ecg_signal, sampling_rate=fs)
        peaks, _info = nk.ecg_peaks(cleaned_ecg, sampling_rate=fs)

        # NOTE(review): in NeuroKit2 the first return value of ecg_peaks appears
        # to be a per-sample 0/1 marker table, with the peak sample indices in
        # the second (info) value under the same key. The failure path returns
        # an empty list, so confirm which representation callers expect.
        return peaks['ECG_R_Peaks'], cleaned_ecg
    except Exception as exc:
        # Best-effort by design, but only for ordinary errors: KeyboardInterrupt
        # and SystemExit must propagate, and the cause is reported, not hidden.
        print(f"Warning: R-peak detection failed ({type(exc).__name__}: {exc})")
        return [], ecg_signal

# Example preprocessing pipeline
def preprocess_ecg_signal(signal, fs=500, target_length=5000):
    """Run the full ECG preprocessing chain on one signal.

    Steps, in order: band-pass filter at sampling rate ``fs``, z-score
    normalisation, then padding or cropping to ``target_length`` samples.

    Returns:
        The processed signal.
    """
    return pad_or_truncate(
        normalize_ecg(
            bandpass_filter(signal, fs=fs),  # 1. remove out-of-band noise
            method='z-score',                # 2. normalise
        ),
        target_length,                       # 3. enforce a consistent length
    )

# Generate synthetic ECG for demonstration
def generate_synthetic_ecg(length=5000, fs=500, noise_level=0.1, heart_rate=72, seed=None):
    """Generate a simplified synthetic ECG-like signal for testing.

    A fixed 60-sample beat template (three sine-shaped lobes of 20 samples
    each) is stamped at regular intervals derived from ``heart_rate``, and
    Gaussian noise is added on top. The template width is given in samples,
    so it does not scale with ``fs``.

    Args:
        length: Number of samples to generate.
        fs: Sampling frequency in Hz; must be positive.
        noise_level: Standard deviation of the additive Gaussian noise.
        heart_rate: Beats per minute; must be positive.
        seed: Optional seed for reproducible noise. With ``None`` the noise
            comes from NumPy's global random state.

    Returns:
        Tuple ``(ecg, t)``: the signal and its time axis in seconds, both of
        shape ``(length,)``.

    Raises:
        ValueError: If ``fs`` or ``heart_rate`` is not positive.
    """
    if fs <= 0:
        raise ValueError(f"fs must be positive, got {fs}")
    if heart_rate <= 0:
        raise ValueError(f"heart_rate must be positive, got {heart_rate}")

    t = np.arange(length) / fs
    ecg = np.zeros(length)
    beat_interval = fs * 60 / heart_rate  # samples between beat onsets

    # Simplified QRS-like complex, built once instead of once per beat
    beat_template = np.concatenate([
        np.sin(np.linspace(0, np.pi, 20)) * 0.5,
        np.sin(np.linspace(0, 2 * np.pi, 20)) * 1.5,
        np.sin(np.linspace(0, np.pi, 20)) * -0.8,
    ])

    for i in range(int(length / beat_interval)):
        beat_start = int(i * beat_interval)
        # Beats closer than 100 samples to the end are skipped
        if beat_start + 100 < length:
            ecg[beat_start:beat_start + beat_template.size] = beat_template

    # Add noise
    if seed is None:
        noise = np.random.normal(0, noise_level, length)
    else:
        noise = np.random.default_rng(seed).normal(0, noise_level, length)
    ecg += noise

    return ecg, t

# Test preprocessing
# Smoke test: push one synthetic trace (default length 5000) through the full
# pipeline and report its shape and summary statistics. The results stay
# available as notebook-level variables.
print("Testing ECG preprocessing pipeline...")
synthetic_ecg, time_axis = generate_synthetic_ecg()
processed_ecg = preprocess_ecg_signal(synthetic_ecg)

# The default target length equals the synthetic length, so no padding or
# cropping happens here; after z-scoring, mean ~ 0 and std ~ 1 are expected.
print(f"Original signal shape: {synthetic_ecg.shape}")
print(f"Processed signal shape: {processed_ecg.shape}")
print(f"Signal stats - Mean: {np.mean(processed_ecg):.4f}, Std: {np.std(processed_ecg):.4f}")

## 4. PyTorch Environment Setup

Let's set up PyTorch and understand the basic operations you'll need for deep learning with ECG data.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Report the PyTorch build and whether CUDA can be used
_has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_has_cuda}")
if _has_cuda:
    print(f"CUDA device: {torch.cuda.get_device_name()}")
    print(f"CUDA version: {torch.version.cuda}")

# Pick the compute device used by the rest of the notebook: GPU when present
device = torch.device("cuda" if _has_cuda else "cpu")
print(f"Using device: {device}")

# Basic PyTorch operations for ECG data
def torch_basics_demo():
    """Walk through a few tensor operations on a random ECG-like batch.

    Creates a random ``(4, 5000)`` batch and random binary labels on the
    notebook's ``device``, prints per-signal statistics, and z-scores each
    signal to show the effect of normalisation.

    Returns:
        Tuple ``(ecg_batch, labels)``: the un-normalised batch and its labels.
    """
    n_signals, n_samples = 4, 5000

    # Random stand-in for a batch of ECG signals: (batch_size, signal_length)
    ecg_batch = torch.randn(n_signals, n_samples, device=device)
    print(f"ECG batch shape: {ecg_batch.shape}")

    # One label per signal (0: normal, 1: disease)
    labels = torch.randint(0, 2, (n_signals,), device=device)
    print(f"Labels: {labels}")

    # Statistics are taken along dim=1, i.e. one value per signal
    per_signal_mean = ecg_batch.mean(dim=1)
    per_signal_std = ecg_batch.std(dim=1)
    print(f"Mean: {per_signal_mean}")
    print(f"Std: {per_signal_std}")

    # Z-score each signal; unsqueeze restores the dim needed for broadcasting
    centered = ecg_batch - per_signal_mean.unsqueeze(1)
    normalized_batch = centered / (per_signal_std.unsqueeze(1) + 1e-8)
    print(f"Normalized mean: {normalized_batch.mean(dim=1)}")

    return ecg_batch, labels

# Custom Dataset class for ECG data
class ECGDataset(Dataset):
    """Map-style dataset pairing ECG signals with integer class labels.

    Args:
        signals: Array-like or tensor of signals, indexed along the first
            dimension; stored as a ``float32`` tensor.
        labels: Array-like or tensor with one label per signal; stored as an
            ``int64`` tensor.
        transform: Optional callable applied to each signal tensor when it is
            fetched.

    Raises:
        ValueError: If ``signals`` and ``labels`` differ in length.
    """

    def __init__(self, signals, labels, transform=None):
        # The stored tensors may share memory with the inputs when no dtype
        # conversion is needed (torch.as_tensor semantics).
        self.signals = self._as_tensor(signals, torch.float32)
        self.labels = self._as_tensor(labels, torch.int64)

        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore surplus labels.
        if len(self.signals) != len(self.labels):
            raise ValueError(
                f"signals and labels must have the same length, "
                f"got {len(self.signals)} and {len(self.labels)}"
            )
        self.transform = transform

    @staticmethod
    def _as_tensor(values, dtype):
        """Convert ``values`` to a tensor of ``dtype`` without legacy constructors."""
        if not isinstance(values, torch.Tensor):
            # Going through one ndarray avoids the slow element-wise path
            # PyTorch takes for lists of arrays.
            values = np.asarray(values)
        return torch.as_tensor(values, dtype=dtype)

    def __len__(self):
        """Return the number of signals."""
        return len(self.signals)

    def __getitem__(self, idx):
        """Return ``(signal, label)`` at ``idx``, with ``transform`` applied to the signal."""
        signal = self.signals[idx]
        label = self.labels[idx]

        if self.transform is not None:
            signal = self.transform(signal)

        return signal, label

# Test PyTorch setup
# Smoke test: run the demo once; the returned random batch and labels are kept
# as notebook-level variables.
print("Testing PyTorch setup...")
ecg_batch, labels = torch_basics_demo()

## 5. Basic Neural Network Implementation

Let's start with a simple feedforward neural network as a baseline for ECG classification.

In [None]:
class BasicECGClassifier(nn.Module):
    """Simple feedforward neural network for ECG classification.

    The network is a stack of ``Linear -> ReLU -> Dropout`` groups, one per
    entry of ``hidden_sizes``, followed by a final ``Linear`` layer that
    produces ``num_classes`` raw scores (logits; no softmax is applied).

    Args:
        input_size: Number of features per sample after flattening.
        hidden_sizes: Sizes of the hidden layers, in order.
        num_classes: Number of output classes.
        dropout_rate: Dropout probability used after every hidden layer.
    """

    def __init__(self, input_size=5000, hidden_sizes=(512, 256, 128), num_classes=2, dropout_rate=0.5):
        super().__init__()

        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            layers += [
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]
            prev_size = hidden_size

        # Output layer
        layers.append(nn.Linear(prev_size, num_classes))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits for ``x``.

        Inputs with more than two dimensions are flattened to
        ``(batch, features)`` first.
        """
        if x.dim() > 2:
            # flatten copies when needed, so non-contiguous inputs (e.g. the
            # result of a transpose) work; view would raise on them
            x = x.flatten(start_dim=1)
        return self.network(x)

# Create and test the basic model
def test_basic_model():
    """Build the default classifier, run one random batch through it, and report.

    Prints the output shape, the softmax probabilities of the batch, and the
    total parameter count.

    Returns:
        The constructed model, on the notebook's ``device`` and in the same
        train/eval mode it had right after construction.
    """
    model = BasicECGClassifier(input_size=5000, num_classes=2).to(device)

    # Test with random data
    batch_size = 4
    test_input = torch.randn(batch_size, 5000).to(device)

    # Dropout must be disabled for the sample predictions to reflect the
    # network itself; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(test_input)
            probabilities = F.softmax(output, dim=1)
    finally:
        model.train(was_training)

    print(f"Model output shape: {output.shape}")
    print(f"Sample predictions: {probabilities}")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

    return model

# Training function for basic model
def train_basic_model(model, train_loader, val_loader, num_epochs=10, learning_rate=0.001):
    """Train a classifier with Adam and cross-entropy, validating every epoch.

    Batches are moved to the device the model's parameters live on, so the
    model must already be on its target device. The model is left in eval
    mode when the function returns.

    Args:
        model: ``nn.Module`` mapping a data batch to class logits.
        train_loader: Iterable of ``(data, target)`` training batches.
        val_loader: Iterable of ``(data, target)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for Adam.

    Returns:
        Tuple ``(train_losses, val_accuracies)`` with one entry per epoch:
        the mean per-batch training loss and the validation accuracy
        (``nan`` when ``val_loader`` yields no samples).

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Adam has already rejected a parameter-less model, so next() cannot fail
    model_device = next(model.parameters()).device

    train_losses = []
    val_accuracies = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0.0
        num_batches = 0

        for data, target in train_loader:
            data, target = data.to(model_device), target.to(model_device)

            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")

        avg_loss = total_loss / num_batches
        train_losses.append(avg_loss)

        # Validation phase
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(model_device), target.to(model_device)
                predicted = model(data).argmax(dim=1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

        # An empty validation set has no defined accuracy; report nan, not a crash
        val_accuracy = correct / total if total else float("nan")
        val_accuracies.append(val_accuracy)

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Acc: {val_accuracy:.4f}')

    return train_losses, val_accuracies

# Test the basic model
# Smoke test: builds the default classifier, prints its output shape, sample
# probabilities and parameter count, and keeps the model as a notebook-level
# variable.
print("Testing basic neural network...")
basic_model = test_basic_model()