# SHL Multimodal Data Processing Notebook

This notebook is an enhanced version of the original `DATA_SHL.ipynb`. It's designed to extract and process **all available motion sensor data** from the SHL preview dataset, including:

- Accelerometer (Acc)
- Gyroscope (Gyr)
- Magnetometer (Mag)
- Linear Acceleration (LAcc)
- Gravity (Gra)
- Orientation (Ori)

This prepares the data for use in a multimodal Mixture-of-Experts (MoE) model.

In [9]:
import hickle as hkl
import numpy as np
import os
import pandas as pd
from scipy import signal
import zipfile
import requests
import time
from tqdm import tqdm

np.random.seed(0)

## 1. Helper Functions (Enhanced)

These functions are updated to handle multiple sensor modalities and ensure data integrity.

In [10]:
def download_url(url, save_path, chunk_size=8192, max_retries=5):
    """A robust download function with retries and a progress bar."""
    for attempt in range(max_retries):
        try:
            print(f"Downloading {url} (Attempt {attempt + 1}/{max_retries})...")
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                total_size = int(r.headers.get('content-length', 0))
                with open(save_path, 'wb') as fd, tqdm(
                    total=total_size, unit='iB', unit_scale=True, desc=os.path.basename(save_path)
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        if chunk:
                            fd.write(chunk)
                            pbar.update(len(chunk))
            print(f"\nFile successfully downloaded to: {save_path}")
            return True
        except (requests.exceptions.RequestException, requests.exceptions.ChunkedEncodingError) as e:
            print(f"\nDownload failed: {e}")
            if attempt < max_retries - 1:
                wait_time = 5 * (attempt + 1)
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("Max retries reached. Download failed.")
                return False

def process_label_for_window(labels):
    """Determines the most common label in a window."""
    unique_values, counts = np.unique(labels, return_counts=True)
    return unique_values[np.argmax(counts)]

def segment_data(data, window_size, step_size):
    """Segments time-series data into windows."""
    segments = []
    for i in range(0, data.shape[0] - window_size, step_size):
        segments.append(data[i:i + window_size, :])
    return np.asarray(segments)

def segment_labels(labels, window_size, step_size):
    """Segments labels and assigns one label per window."""
    segmented_labels = []
    for i in range(0, labels.shape[0] - window_size, step_size):
        segmented_labels.append(process_label_for_window(labels[i:i + window_size]))
    return np.asarray(segmented_labels)

def downsample_data_block(data_block, factor=2):
    """Downsamples a block of windowed data using a low-pass filter."""
    if factor <= 1:
        return data_block
    return signal.decimate(data_block, factor, axis=1) # decimate along the time axis (axis=1)

def find_and_union_nan_ranges(data_list):
    """Finds NaN ranges across multiple dataframes and unions them."""
    nan_ranges = []
    for data in data_list:
        nan_indices = np.unique(np.where(np.isnan(data))[0])
        if len(nan_indices) > 0:
            # Convert indices to ranges
            nums = sorted(set(nan_indices))
            gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s+1 < e]
            edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
            nan_ranges.extend(list(zip(edges, edges)))
    
    # Union all ranges
    if not nan_ranges:
        return []
    b = []
    for begin, end in sorted(nan_ranges):
        if b and b[-1][1] >= begin - 1:
            b[-1][1] = max(b[-1][1], end)
        else:
            b.append([begin, end])
    return b

## 2. Data Download and Extraction

This section downloads and extracts the SHL Preview dataset.

In [11]:
# 替换 "2. Data Download and Extraction" 下的代码单元格

file_names = [
    "SHLDataset_preview_v1_part1.zip",
    "SHLDataset_preview_v1_part2.zip",
    "SHLDataset_preview_v1_part3.zip"
]
links = [
    "http://www.shl-dataset.org/wp-content/uploads/SHLDataset_preview_v1_part1.zip",
    "http://www.shl-dataset.org/wp-content/uploads/SHLDataset_preview_v1_part2.zip",
    "http://www.shl-dataset.org/wp-content/uploads/SHLDataset_preview_v1_part3.zip"
]

download_dir = os.path.abspath("dataset/download")
extract_dir = os.path.abspath("dataset/extracted/") # 修正路径以匹配项目结构

os.makedirs(download_dir, exist_ok=True)
os.makedirs(extract_dir, exist_ok=True)

# --- 修正后的逻辑 ---

# 阶段 1: 确保所有文件都已下载
for file_name, link in zip(file_names, links):
    file_path = os.path.join(download_dir, file_name)
    if not os.path.exists(file_path):
        download_url(link, file_path)
    else:
        print(f"{file_name} already downloaded.")

# 阶段 2: 确保所有已下载的文件都被解压
print("\n--- Starting Extraction Check ---")
for file_name in file_names:
    file_path = os.path.join(download_dir, file_name)
    if os.path.exists(file_path):
        print(f"Extracting {file_name}...")
        try:
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
            print(f"Successfully extracted {file_name}.")
        except Exception as e:
            print(f"Error extracting {file_name}: {e}")

print("--- Extraction process complete ---")

Downloading http://www.shl-dataset.org/wp-content/uploads/SHLDataset_preview_v1_part1.zip (Attempt 1/5)...


SHLDataset_preview_v1_part1.zip: 100%|██████████| 2.90G/2.90G [04:28<00:00, 10.8MiB/s] 



File successfully downloaded to: /files1/Zilong/MazeruHAR/datasets/dataset/download/SHLDataset_preview_v1_part1.zip
Downloading http://www.shl-dataset.org/wp-content/uploads/SHLDataset_preview_v1_part2.zip (Attempt 1/5)...


SHLDataset_preview_v1_part2.zip: 100%|██████████| 2.50G/2.50G [03:54<00:00, 10.6MiB/s] 



File successfully downloaded to: /files1/Zilong/MazeruHAR/datasets/dataset/download/SHLDataset_preview_v1_part2.zip
Downloading http://www.shl-dataset.org/wp-content/uploads/SHLDataset_preview_v1_part3.zip (Attempt 1/5)...


SHLDataset_preview_v1_part3.zip: 100%|██████████| 2.28G/2.28G [03:22<00:00, 11.2MiB/s] 



File successfully downloaded to: /files1/Zilong/MazeruHAR/datasets/dataset/download/SHLDataset_preview_v1_part3.zip

--- Starting Extraction Check ---
Extracting SHLDataset_preview_v1_part1.zip...
Successfully extracted SHLDataset_preview_v1_part1.zip.
Extracting SHLDataset_preview_v1_part2.zip...
Successfully extracted SHLDataset_preview_v1_part2.zip.
Extracting SHLDataset_preview_v1_part3.zip...
Successfully extracted SHLDataset_preview_v1_part3.zip.
--- Extraction process complete ---


## 3. Multimodal Data Processing

Here we define the sensors and columns we want to extract. The `_Motion.txt` file contains multiple sensors. We will extract them all.

In [12]:
BODY_LOCATIONS = ["Bag", "Hand", "Hips", "Torso"]
ROOT_DIRECTORY = 'dataset/extracted/SHLDataset_preview_v1'

# Define all sensors and their columns in _Motion.txt
# Format: { 'name': [col1, col2, ...], ... }
SENSOR_CONFIG = {
    'acc': [1, 2, 3],      # Accelerometer
    'gyro': [4, 5, 6],     # Gyroscope
    'mag': [7, 8, 9],      # Magnetometer
    'ori': [10, 11, 12, 13], # Orientation (Quaternion)
    'gra': [14, 15, 16],   # Gravity
    'lacc': [17, 18, 19]   # Linear Acceleration
}

if not os.path.exists(ROOT_DIRECTORY):
    raise FileNotFoundError(f"Directory '{ROOT_DIRECTORY}' does not exist. Please ensure the dataset is downloaded and extracted.")

user_dirs = [d for d in os.listdir(ROOT_DIRECTORY) if os.path.isdir(os.path.join(ROOT_DIRECTORY, d)) and 'User' in d]
print(f"Found {len(user_dirs)} user directories: {user_dirs}")

Found 3 user directories: ['User1', 'User2', 'User3']


In [13]:
print("Starting multimodal data processing...")
all_users_data = []
all_users_labels = []

for user_folder in sorted(user_dirs):
    print(f"\nProcessing {user_folder}...")
    user_path = os.path.join(ROOT_DIRECTORY, user_folder)
    time_folders = [d for d in os.listdir(user_path) if os.path.isdir(os.path.join(user_path, d))]
    
    user_sessions_data = []
    user_sessions_labels = []

    for time_folder in sorted(time_folders):
        session_path = os.path.join(user_path, time_folder)
        print(f"  - Processing session: {time_folder}")

        # Load label data first
        labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]

        # Load all motion data for all locations
        motion_data_per_location = {}
        for location in BODY_LOCATIONS:
            motion_file = os.path.join(session_path, f"{location}_Motion.txt")
            motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values

        # Clean NaN values across all locations simultaneously
        all_motion_dfs = [pd.DataFrame(data) for data in motion_data_per_location.values()]
        combined_df = pd.concat(all_motion_dfs, axis=1)
        nan_rows = combined_df.isnull().any(axis=1)
        
        labels_clean = labels_raw[~nan_rows]
        for location in BODY_LOCATIONS:
            motion_data_per_location[location] = motion_data_per_location[location][~nan_rows]

        # Process each body location as a separate client/stream
        for location in BODY_LOCATIONS:
            # Extract all configured sensors and concatenate them
            sensor_columns = []
            for sensor, cols in SENSOR_CONFIG.items():
                sensor_columns.append(motion_data_per_location[location][:, cols])
            multimodal_data = np.concatenate(sensor_columns, axis=1)
            
            # Segment data and labels
            data_segmented = segment_data(multimodal_data, window_size=256, step_size=128)
            labels_segmented = segment_labels(labels_clean, window_size=256, step_size=128)
            
            # Filter out 'null' class (label 0)
            non_null_indices = np.where(labels_segmented != 0)
            data_filtered = data_segmented[non_null_indices]
            labels_filtered = labels_segmented[non_null_indices]
            
            # Adjust labels to be 0-indexed
            labels_adjusted = labels_filtered - 1
            
            # Downsample the data
            data_downsampled = downsample_data_block(data_filtered, factor=2)

            if data_downsampled.shape[0] > 0:
                user_sessions_data.append(data_downsampled)
                user_sessions_labels.append(labels_adjusted)
    
    # Combine all sessions for the current user/location stream
    if user_sessions_data:
        all_users_data.extend(user_sessions_data)
        all_users_labels.extend(user_sessions_labels)

print("\nData processing finished for all users.")

Starting multimodal data processing...

Processing User1...
  - Processing session: 220617


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values


  - Processing session: 260617


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values


  - Processing session: 270617


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values



Processing User2...
  - Processing session: 140617


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values


  - Processing session: 140717


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values


  - Processing session: 180717


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values



Processing User3...
  - Processing session: 030717


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values


  - Processing session: 070717


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values


  - Processing session: 140617


  labels_raw = pd.read_csv(os.path.join(session_path, 'Label.txt'), header=None, delim_whitespace=True).values[:, 1]
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values
  motion_data_per_location[location] = pd.read_csv(motion_file, header=None, delim_whitespace=True).values



Data processing finished for all users.


## 4. Normalization and Saving

The final step is to perform Z-score normalization across the entire dataset and save the processed data and labels.

In [14]:
# Stack all client data for global normalization
if all_users_data:
    stacked_data = np.vstack(all_users_data)
    print(f"Total stacked data shape for normalization: {stacked_data.shape}")

    # Z-score normalization
    mean = np.mean(stacked_data, axis=(0, 1))
    std = np.std(stacked_data, axis=(0, 1))
    std[std == 0] = 1 # Avoid division by zero

    # Normalize each client's data and save
    data_name = 'SHL_Multimodal'
    output_dir = os.path.join('datasetStandardized', data_name)
    os.makedirs(output_dir, exist_ok=True)

    normalized_clients_data = []
    for client_data_block in all_users_data:
        normalized_block = (client_data_block - mean) / std
        normalized_clients_data.append(normalized_block)

    # Save as a list of numpy arrays, which is what the original scripts expect
    hkl.dump(normalized_clients_data, os.path.join(output_dir, 'clientsData.hkl'))
    hkl.dump(all_users_labels, os.path.join(output_dir, 'clientsLabel.hkl'))
    
    print(f"\nNormalization complete.")
    print(f"Saved {len(normalized_clients_data)} clients/streams.")
    print(f"Data and labels saved in: {output_dir}")
else:
    print("No data was processed. Please check the source directories and file structures.")

Total stacked data shape for normalization: (640152, 128, 19)

Normalization complete.
Saved 36 clients/streams.
Data and labels saved in: datasetStandardized/SHL_Multimodal
