<a href="https://colab.research.google.com/github/tamandakaunda-15/Formative2_HMMs/blob/main/Hidden_Markov_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install hmmlearn scikit-learn

import os
import pandas as pd
import numpy as np
import re
from scipy.stats import pearsonr
from scipy.fft import fft, fftfreq
from sklearn.preprocessing import StandardScaler
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading hmmlearn-0.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (165 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hmmlearn
Successfully installed hmmlearn-0.3.3


In [1]:
# Attach the user's Google Drive to the Colab filesystem so the data folder
# configured below is reachable as an ordinary path.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [3]:
# Root folder on the mounted Drive; it is the directory later handed to
# process_all_samples, which reads one sub-folder per recording from it.
DRIVE_PATH = '/content/drive/MyDrive/Formative_2/HMM_Project_Data_UNPROCESSED/'

# Echo the configured location so a wrong path is noticed before loading starts.
print(f"\nConfiguration complete. Data will be loaded from: {DRIVE_PATH}")


Configuration complete. Data will be loaded from: /content/drive/MyDrive/Formative_2/HMM_Project_Data_UNPROCESSED/


## Feature Extraction

In [18]:
#  Feature Extraction Functions

# Sampling rate (Hz) assumed for every recording. Despite the name this is a
# rate, not a window length; the name is kept so existing references keep working.
WINDOW_SIZE_HZ = 100 # sampling rate of 100 Hz

_ACCEL_COLS = ['Acc_x', 'Acc_y', 'Acc_z']
_GYRO_COLS = ['Gyro_x', 'Gyro_y', 'Gyro_z']
_SENSOR_COLS = _ACCEL_COLS + _GYRO_COLS
# (first axis, second axis, feature name) for each accelerometer axis pair.
_ACCEL_PAIRS = [
    ('Acc_x', 'Acc_y', 'Acc_xy_corr'),
    ('Acc_x', 'Acc_z', 'Acc_xz_corr'),
    ('Acc_y', 'Acc_z', 'Acc_yz_corr'),
]


def _feature_names():
    """Return the 22 feature names in the order extract_features emits them."""
    names = []
    for col in _SENSOR_COLS:
        names += [f'{col}_mean', f'{col}_std']
    names.append('SMA')
    names += [key for _, _, key in _ACCEL_PAIRS]
    names += [f'{col}_dom_freq' for col in _SENSOR_COLS]
    return names


def _safe_pearson(a, b):
    """Pearson correlation of two arrays, or 0.0 whenever it is undefined."""
    # pearsonr returns NaN (with a warning) for a constant input instead of
    # raising, so constant axes are detected up front.
    if len(a) < 2 or np.ptp(a) == 0 or np.ptp(b) == 0:
        return 0.0
    try:
        r = pearsonr(a, b)[0]
    except ValueError:
        return 0.0
    return float(r) if np.isfinite(r) else 0.0


def _dominant_frequency(signal, sampling_rate_hz):
    """Frequency (Hz) of the strongest non-DC bin, or 0.0 if there is none."""
    n = len(signal)
    half = n // 2
    # Fewer than two positive-frequency bins, or a flat signal: nothing
    # oscillates, so there is no dominant frequency to report.
    if half <= 1 or np.ptp(signal) == 0:
        return 0.0
    power = np.abs(fft(signal)[:half]) ** 2
    freqs = fftfreq(n, 1 / sampling_rate_hz)[:half]
    # Index 0 is the DC component; skip it and shift the argmax back by one.
    return float(freqs[np.argmax(power[1:]) + 1])


def extract_features(merged_df, sampling_rate_hz=None):
    """
    Extract 22 summary features from one merged accelerometer/gyroscope sample.

    Features, in output order:
      * mean and standard deviation of each of the six sensor axes (12)
      * 'SMA': mean of the accelerometer vector magnitude
        sqrt(x^2 + y^2 + z^2) (1). The key name is kept for compatibility.
      * Pearson correlation between each pair of accelerometer axes (3)
      * dominant non-DC frequency of each of the six sensor axes (6)

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain the columns Acc_x, Acc_y, Acc_z, Gyro_x, Gyro_y, Gyro_z.
        The frame is not modified.
    sampling_rate_hz : float, optional
        Sampling rate used to convert FFT bins to Hz. Defaults to the
        module-level WINDOW_SIZE_HZ (looked up at call time).

    Returns
    -------
    pandas.Series
        22 finite floats indexed by feature name. An empty input yields all
        zeros with the same index order as a non-empty input. Quantities that
        are undefined (std of a single row, correlation with a constant axis,
        dominant frequency of a flat signal) are reported as 0.0 instead of NaN.

    Raises
    ------
    ValueError
        If sampling_rate_hz is not positive.
    KeyError
        If a required sensor column is missing.
    """
    if sampling_rate_hz is None:
        sampling_rate_hz = WINDOW_SIZE_HZ
    if sampling_rate_hz <= 0:
        raise ValueError("sampling_rate_hz must be positive")

    if len(merged_df) == 0:
        return pd.Series(0.0, index=_feature_names())

    signals = {col: merged_df[col].to_numpy(dtype=float) for col in _SENSOR_COLS}
    features = {}

    # 1. Time-Domain Features (Mean, Standard Deviation)
    for col in _SENSOR_COLS:
        features[f'{col}_mean'] = merged_df[col].mean()
        spread = merged_df[col].std()
        # The sample std (ddof=1) is NaN for a single row; treat it as no spread.
        features[f'{col}_std'] = 0.0 if pd.isna(spread) else spread

    # 2. Combined/Magnitude Features
    acc_magnitude = np.sqrt(
        merged_df['Acc_x']**2 + merged_df['Acc_y']**2 + merged_df['Acc_z']**2
    )
    features['SMA'] = acc_magnitude.mean()

    # 3. Correlation between Accelerometer Axes
    for first, second, key in _ACCEL_PAIRS:
        features[key] = _safe_pearson(signals[first], signals[second])

    # 4. Frequency-Domain Features (Dominant Frequency)
    for col in _SENSOR_COLS:
        features[f'{col}_dom_freq'] = _dominant_frequency(signals[col], sampling_rate_hz)

    return pd.Series(features)

def process_all_samples(base_dir):
    """
    Load every recognised activity recording under base_dir, extract its
    features and return the standardised feature table with its labels.

    Each sub-folder of base_dir whose lower-cased name starts with, or contains
    after '_' or '-', one of 'standing', 'walking', 'jumping', 'still' is
    treated as one sample of that activity. Other entries are ignored. The
    folder must contain 'Accelerometer.csv' and 'Gyroscope.csv', which are
    inner-merged on ['time', 'seconds_elapsed'].

    Folders are visited in sorted name order so that the row order of the
    result (and anything seeded downstream, such as a train/test split) is
    reproducible across runs and machines.

    Parameters
    ----------
    base_dir : str
        Directory containing one sub-folder per recording.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Z-score standardised features (one row per sample) and the matching
        capitalised activity labels, named 'Activity_Label'.
        NOTE: the scaling is fitted on every returned sample; callers that
        hold out a test set should re-fit scaling on the training rows.

    Raises
    ------
    ValueError
        If no usable sample was found under base_dir.
    """
    activities = ['standing', 'walking', 'jumping', 'still']
    # Compile once; same pattern as before: activity at the start of the name,
    # or preceded by '_' / '-'.
    label_patterns = [
        (act.capitalize(), re.compile(r'^{}|[_-]{}'.format(act, act)))
        for act in activities
    ]

    all_features = []
    all_labels = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for item_name in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, item_name)
        if not os.path.isdir(folder_path):
            continue

        # Individual Activity Label Extraction (first matching activity wins)
        name = item_name.lower()
        label = next((lab for lab, pat in label_patterns if pat.search(name)), None)
        if label is None:
            continue

        # Loading and Merging CSVs (Accelerometer and Gyroscope) for each activity sample
        try:
            accel_path = os.path.join(folder_path, 'Accelerometer.csv')
            gyro_path = os.path.join(folder_path, 'Gyroscope.csv')

            # Load and rename columns
            accel_df = pd.read_csv(accel_path).rename(columns={'x': 'Acc_x', 'y': 'Acc_y', 'z': 'Acc_z'})
            gyro_df = pd.read_csv(gyro_path).rename(columns={'x': 'Gyro_x', 'y': 'Gyro_y', 'z': 'Gyro_z'})

            # Merge the two CSVs; only rows whose timestamps match exactly survive.
            merged_df = pd.merge(accel_df, gyro_df, on=['time', 'seconds_elapsed'], how='inner')

            # An empty merge would otherwise become an all-zero feature vector
            # carrying a real activity label, i.e. fabricated training data.
            if merged_df.empty:
                print(f"Skipping: No matching accelerometer/gyroscope rows in {item_name}")
                continue

            # Feature Extraction
            features = extract_features(merged_df)

            all_features.append(features)
            all_labels.append(label)

        except FileNotFoundError:
            print(f"Skipping: Missing files in {item_name}")
        except Exception as e:
            # Deliberate best-effort: one bad recording must not abort the run.
            print(f"Skipping: Error processing {item_name}: {e}")

    if not all_features:
        raise ValueError(f"No usable activity samples found in {base_dir}")

    feature_matrix = pd.DataFrame(all_features)
    label_series = pd.Series(all_labels, name='Activity_Label')

    # --- Feature Normalization (Z-score standardisation) ---
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(feature_matrix)
    normalized_df = pd.DataFrame(normalized_features, columns=feature_matrix.columns)

    return normalized_df, label_series

In [19]:
# Build the standardised feature table and the matching activity labels from
# every recording found under DRIVE_PATH.
feature_matrix, activity_labels = process_all_samples(DRIVE_PATH)

# Report how much data was loaded and how it is spread across activities.
print(f"Successfully processed {len(feature_matrix)} total samples.")
print("Feature Matrix Shape:", feature_matrix.shape)
print("\nActivity Distribution:", activity_labels.value_counts(), sep="\n")



Successfully processed 51 total samples.
Feature Matrix Shape: (51, 22)

Activity Distribution:
Activity_Label
Walking     16
Standing    12
Still       12
Jumping     11
Name: count, dtype: int64


## Defining Model Components

In [26]:
# Prepare Training and Test Data (for Tasks 4 & 5)
# a 70/30 split for training/testing, stratified so each activity keeps its share.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    feature_matrix, activity_labels, test_size=0.3, random_state=42, stratify=activity_labels
)

# feature_matrix was standardised with statistics computed over ALL samples, so
# the held-out rows influenced the scaling (data leakage). Re-fit the scaler on
# the training rows only and apply it to both parts. Standardisation is
# unaffected by the earlier per-column affine scaling, so the result equals
# scaling the raw features with training-only statistics.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw), columns=X_train_raw.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw), columns=X_test_raw.columns, index=X_test_raw.index
)


In [27]:
# Convert labels to numerical format.
# The names are sorted first so the name -> integer mapping does not depend on
# the order in which samples happened to be loaded.
label_map = {name: i for i, name in enumerate(sorted(activity_labels.unique()))}
y_train_num = y_train.map(label_map).values
y_test_num = y_test.map(label_map).values

# Input a single observation sequence (one row per sample) for the Hidden Markov Model.
# NOTE(review): train_test_split shuffles rows, so consecutive rows here are
# unrelated recordings rather than consecutive time steps - confirm this is the
# intended sequence for the HMM.
X_train_sequence = X_train.values

# Store the final objects for the next steps
DATA = {
    'X_train': X_train_sequence,
    'X_test': X_test.values,
    'y_train_num': y_train_num,  # kept alongside y_test_num so both label sets travel together
    'y_test_num': y_test_num,
    'label_map': label_map,
    'reverse_label_map': {v: k for k, v in label_map.items()}
}