# Assignment 3: PPG Signal Processing, Dataset Merging, Visualization, and Predictive Modeling

This notebook covers:
1. Merging PPG signal data with HR and HRV from the master dataset
2. Visualizing merged PPG data
3. Combining and splitting data for modeling
4. Building and evaluating regression models to predict HR

In [10]:
import pandas as pd
import numpy as np
import os
import glob
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Task 1: Merge Data
For each subject, merge their PPG signal data with their corresponding HR and HRV values from the master dataset based on Subject_ID. Save each merged dataset as a new CSV file (one per subject) in a separate folder.

In [11]:
# Folder containing subject CSVs and master dataset path
ppg_folder = '../09-07-25/Visualization/PPG_Subjects'
master_path = r'c:/Users/suyas/OneDrive/Desktop/CODING/CODING/Vocational Training/IIIT Naya Raipur/Python/09-07-25/Visualization/master_dataset.csv'
merged_folder = os.path.join(ppg_folder, 'merged_subjects')
os.makedirs(merged_folder, exist_ok=True)

# Load master dataset
master_df = pd.read_csv(master_path)

# Find all subject CSV files
subject_files = glob.glob(os.path.join(ppg_folder, 'subject_*.csv'))

# Merge and save
for file in subject_files:
    sub_df = pd.read_csv(file)
    sid = sub_df['Subject_ID'].iloc[0]
    hr = master_df.loc[master_df['Subject_ID'] == sid, 'HR'].values[0]
    hrv = master_df.loc[master_df['Subject_ID'] == sid, 'HRV'].values[0]
    sub_df['HR'] = hr
    sub_df['HRV'] = hrv
    out_path = os.path.join(merged_folder, os.path.basename(file))
    sub_df.to_csv(out_path, index=False)
    print(f'Merged and saved: {out_path}')

FileNotFoundError: [Errno 2] No such file or directory: 'c:/Users/suyas/OneDrive/Desktop/CODING/CODING/Vocational Training/IIIT Naya Raipur/Python/09-07-25/Visualization/master_dataset.csv'

# Task 2: Visualize Merged PPG Data
For each merged subject CSV file, plot an interactive line chart using Plotly showing Time vs PPG. Include title indicating Subject_ID.

In [None]:
# Visualize merged PPG data for each subject
merged_files = glob.glob(os.path.join(merged_folder, 'subject_*.csv'))
for file in merged_files:
    df = pd.read_csv(file)
    subject_id = df['Subject_ID'].iloc[0]
    fig = px.line(df, x='Time', y='PPG', title=f'Subject {subject_id} - PPG Signal', labels={'Time': 'Time (s)', 'PPG': 'PPG Value'})
    fig.show()

# Task 3: Combine and Split Data
Combine all the merged subject datasets into a single DataFrame. Split this combined dataset into a train and test set (80%-20% split).

In [None]:
# Combine all merged subject datasets and split into train/test sets
combined_df = pd.concat([pd.read_csv(f) for f in merged_files], ignore_index=True)
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42, shuffle=True)
print(f'Combined shape: {combined_df.shape}')
print(f'Train shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')

# Task 4: Build and Evaluate Models
Implement 4 regression models to predict HR based on PPG, HRV, Time. Evaluate and visualize error metrics and predictions.

In [None]:
# Build and evaluate 4 regression models to predict HR, plot error metrics and visualizations
features = ['PPG', 'HRV', 'Time']
target = 'HR'
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'KNN': KNeighborsRegressor()
}

results = {}
predictions = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions[name] = y_pred
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

# Plot error metrics comparison
results_df = pd.DataFrame(results).T
results_df[['MAE', 'MSE', 'RMSE', 'R2']].plot(kind='bar', figsize=(10,6))
plt.title('Model Comparison - Error Metrics')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.grid(True)
plt.show()

# Actual vs Predicted and Residuals plots for each model
for name in models:
    y_pred = predictions[name]
    # Actual vs Predicted
    plt.figure(figsize=(6, 4))
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual HR')
    plt.ylabel('Predicted HR')
    plt.title(f'{name} - Actual vs Predicted')
    plt.grid(True)
    plt.show()
    # Residuals Plot
    residuals = y_test - y_pred
    plt.figure(figsize=(6, 4))
    plt.scatter(y_pred, residuals, alpha=0.7)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel('Predicted HR')
    plt.ylabel('Residuals')
    plt.title(f'{name} - Residuals Plot')
    plt.grid(True)
    plt.show()