In [1]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Define label mappings for fine labels according to the document provided, using '' for ignored labels
FINE_LABEL_MAP = {
    0: '',
    1: 'Still;Stand;Outside',
    2: 'Still;Stand;Inside',
    3: 'Still;Sit;Outside',
    4: 'Still;Sit;Inside',
    5: 'Walking;Outside',
    6: 'Walking;Inside',
    7: 'Run',
    8: 'Bike',
    9: 'Car;Driver',
    10: 'Car;Passenger',
    11: 'Bus;Stand',
    12: 'Bus;Sit',
    13: 'Bus;Up;Stand',
    14: 'Bus;Up;Sit',
    15: 'Train;Stand',
    16: 'Train;Sit',
    17: 'Subway;Stand',
    18: 'Subway;Sit'
}

# Number of threads for parallel processing, adjustable variable
NUM_THREADS = 1  # Change this value as needed

def read_fine_labels(label_file_path):
    """
    Reads the Label.txt file and extracts fine labels from the third column, converting them to descriptive text.
    Filters out empty labels.
    
    Parameters:
        label_file_path (str): The path to the Label.txt file.
    
    Returns:
        list: A list of fine descriptive labels corresponding to each time step.
    """
    fine_labels = []
    try:
        # Read the labels from the Label.txt file
        label_data = pd.read_csv(label_file_path, sep="\s+", header=None)
        
        # Convert numeric labels to descriptive text using the provided map
        fine_labels = label_data.iloc[:, 2].apply(lambda x: FINE_LABEL_MAP.get(x, '')).tolist()

        # Keep only non-empty labels
        fine_labels = [label if label else None for label in fine_labels]
    except Exception as e:
        print(f"Error reading {label_file_path}: {e}")

    return fine_labels

def process_motion_data_chunked(file_path, fine_labels, chunk_size=100000):
    """
    Processes motion data in chunks to avoid memory overload and writes to a CSV file.
    
    Parameters:
        file_path (str): The path to the motion data file.
        fine_labels (list): The list of fine descriptive labels corresponding to each time step.
        chunk_size (int): The number of rows to process in each chunk.
    
    Returns:
        list: List of tuples containing sensor data and the corresponding fine label.
    """
    # Create an empty list to hold chunk results
    chunk_results = []

    try:
        # Read the file in chunks
        for i, chunk in enumerate(pd.read_csv(file_path, sep="\s+", header=None, chunksize=chunk_size)):
            # Convert each row of sensor data to a list including time
            sensor_data = chunk.apply(lambda row: [row[0]] + row[1:].tolist(), axis=1)  # Include time
            
            # Synchronize fine labels with sensor data for the current chunk
            chunk_fine_labels = fine_labels[i * chunk_size: i * chunk_size + len(chunk)]
            
            # Collect the chunk's results, skipping rows where the label is empty
            for data, label in zip(sensor_data, chunk_fine_labels):
                if label is not None:  # Skip rows where the label is empty
                    chunk_results.append((data, label))
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

    return chunk_results

def process_motion_data_for_date(record_path):
    """
    Processes motion data files for a specific recording session (date) and exports them to a CSV file.
    
    Parameters:
        record_path (str): The directory path containing the .txt files for a specific date.
    """
    # Output file path
    record_folder = os.path.basename(record_path)
    output_csv = f"{record_folder}_motion_data.csv"
    
    # Read the fine labels from the corresponding Label.txt file
    label_file_path = os.path.join(record_path, 'Label.txt')
    fine_labels = read_fine_labels(label_file_path)

    # Write the header to the CSV file first
    with open(output_csv, 'w') as f:
        f.write('Sensor Data,Label\n')

    # Get the list of _Motion.txt files
    motion_files = [file for file in os.listdir(record_path) if "_Motion.txt" in file]

    # Process each motion data file in parallel
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
        futures = [executor.submit(process_motion_data_chunked, os.path.join(record_path, file), fine_labels) for file in motion_files]

        for future in tqdm(futures, desc=f"Processing {record_folder}"):
            chunk_results = future.result()

            # Write chunk results to CSV
            with open(output_csv, 'a') as f:
                for sensor_data, label in chunk_results:
                    # Convert list to space-separated string for sensor data
                    sensor_data_str = ' '.join(map(str, sensor_data))
                    f.write(f'"[{sensor_data_str}]",{label}\n')

    print(f"Motion data for {record_folder} exported to {output_csv}")

def process_motion_data_per_date(root_dir):
    """
    Processes motion data files for User1 one by one and exports each date's data to a separate CSV file.
    
    Parameters:
        root_dir (str): The root directory containing the User1 subdirectories.
    """
    # Get all recording session paths (subdirectories) for User1
    record_paths = [os.path.join(root_dir, folder) for folder in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, folder))]

    # Process each record path serially
    for record_path in tqdm(record_paths, desc="Processing all dates"):
        process_motion_data_for_date(record_path)

# Define the root directory containing the User1 dataset
root_dir = 'User1/'  # Adjust this to your actual dataset path

# Call the function to process motion data and export each date's data to separate CSV files
process_motion_data_per_date(root_dir)


  label_data = pd.read_csv(label_file_path, sep="\s+", header=None)
  for i, chunk in enumerate(pd.read_csv(file_path, sep="\s+", header=None, chunksize=chunk_size)):
Processing 070717: 100%|██████████| 4/4 [04:02<00:00, 60.73s/it]
Processing all dates:  12%|█▎        | 1/8 [04:07<28:52, 247.47s/it]

Motion data for 070717 exported to 070717_motion_data.csv


Processing 140617: 100%|██████████| 4/4 [03:22<00:00, 50.73s/it]
Processing all dates:  25%|██▌       | 2/8 [07:33<22:19, 223.27s/it]

Motion data for 140617 exported to 140617_motion_data.csv


Processing 260617: 100%|██████████| 4/4 [03:36<00:00, 54.25s/it]
Processing all dates:  38%|███▊      | 3/8 [11:13<18:28, 221.78s/it]

Motion data for 260617 exported to 260617_motion_data.csv


Processing 180717: 100%|██████████| 4/4 [02:58<00:00, 44.56s/it]
Processing all dates:  50%|█████     | 4/8 [14:15<13:43, 205.77s/it]

Motion data for 180717 exported to 180717_motion_data.csv


Processing 220617: 100%|██████████| 4/4 [04:03<00:00, 60.98s/it]
Processing all dates:  62%|██████▎   | 5/8 [18:22<11:01, 220.66s/it]

Motion data for 220617 exported to 220617_motion_data.csv


Processing 030717: 100%|██████████| 4/4 [02:55<00:00, 43.81s/it]
Processing all dates:  75%|███████▌  | 6/8 [21:20<06:52, 206.19s/it]

Motion data for 030717 exported to 030717_motion_data.csv


Processing 140717: 100%|██████████| 4/4 [03:13<00:00, 48.33s/it]
Processing all dates:  88%|████████▊ | 7/8 [24:36<03:22, 202.97s/it]

Motion data for 140717 exported to 140717_motion_data.csv


Processing 270617: 100%|██████████| 4/4 [03:11<00:00, 47.86s/it]
Processing all dates: 100%|██████████| 8/8 [27:51<00:00, 208.88s/it]

Motion data for 270617 exported to 270617_motion_data.csv



