In [6]:
import os
import numpy as np
import pandas as pd
from io import StringIO
import gzip

In [25]:
def normalize_sensor_data_with_nan(X):
    """Min-max scale every column of ``X`` into [0, 1], ignoring NaN readings.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Sensor matrix in which NaN marks a missing reading.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``, rounded to 3 decimals.
        * Columns with a value range are scaled with ``(x - min) / (max - min)``,
          where min/max are taken over the non-NaN entries only.
        * Columns whose non-NaN entries are all equal are set to 0.5.
        * Missing readings stay NaN in every case, and a column with no
          readings at all stays entirely NaN.
    """
    # Work in floating point: with an integer input, an output buffer of the
    # same dtype would truncate the scaled values and could not hold NaN.
    X = np.asarray(X, dtype=float)

    # Start from all-NaN so that missing readings and all-NaN columns need no
    # further handling below.
    X_norm = np.full(X.shape, np.nan, dtype=float)

    for i in range(X.shape[1]):
        column = X[:, i]
        valid = ~np.isnan(column)
        if not valid.any():
            continue  # No readings at all: the column stays NaN

        col_min = column[valid].min()
        col_max = column[valid].max()

        if col_max > col_min:  # Avoid division by zero
            # NaN entries propagate through the arithmetic and remain NaN
            X_norm[:, i] = (column - col_min) / (col_max - col_min)
        else:
            # Constant column: use the midpoint, but only where a reading
            # exists, so missing values are not turned into 0.5
            X_norm[valid, i] = 0.5

    return np.round(X_norm, 3)

def format_input_data(X_norm):
    """Render each row of the normalized matrix as one bracketed string.

    Values within a row are separated by single spaces and the whole row is
    wrapped in one pair of square brackets. Missing readings (NaN) are written
    as the literal text ``nan``.

    Returns a list holding one string per row of ``X_norm``.
    """
    def _as_text(reading):
        # Spell NaN explicitly; every other value uses its default string form
        return 'nan' if np.isnan(reading) else f"{reading}"

    return ["[" + " ".join(map(_as_text, row)) + "]" for row in X_norm]

def parse_header_of_csv(csv_str):
    """Split the CSV header line into feature names and label names.

    Parameters
    ----------
    csv_str : bytes
        Raw (already decompressed) CSV content. The first line is expected to
        read ``timestamp,<features...>,label:<name>,...,label_source``.

    Returns
    -------
    tuple
        ``(feature_names, label_names)`` - two lists of ``bytes``. Label names
        are returned without their ``label:`` prefix.

    Raises
    ------
    AssertionError
        If the first column is not ``timestamp`` or the last one is not
        ``label_source``.
    ValueError
        If the header contains no ``label:`` column at all.
    """
    label_prefix = b'label:'

    # Isolate the headline columns. partition() also copes with a header-only
    # input that has no newline, and rstrip() drops the '\r' left behind by
    # CRLF line endings (it would otherwise stick to the last column name).
    headline = csv_str.partition(b'\n')[0].rstrip(b'\r')
    columns = headline.split(b',')

    # The first column should be timestamp:
    assert columns[0] == b'timestamp'
    # The last column should be label_source:
    assert columns[-1] == b'label_source'

    # Search for the column of the first label:
    first_label_ind = next(
        (ci for (ci, col) in enumerate(columns) if col.startswith(label_prefix)),
        None,
    )
    if first_label_ind is None:
        raise ValueError("CSV header contains no 'label:' columns")

    # Feature columns come after timestamp and before the labels:
    feature_names = columns[1:first_label_ind]

    # Then come the labels, till the one-before-last column:
    label_names = []
    for label in columns[first_label_ind:-1]:
        assert label.startswith(label_prefix)
        # Strip only the leading prefix; any later 'label:' text in the name
        # is part of the name itself
        label_names.append(label[len(label_prefix):])

    return (feature_names, label_names)

def parse_body_of_csv(csv_str, n_features):
    """Parse the numeric body of the CSV into timestamps, features and labels.

    Parameters
    ----------
    csv_str : bytes
        Raw (already decompressed) CSV content, including the header line.
    n_features : int
        Number of feature columns that follow the timestamp column.

    Returns
    -------
    tuple
        ``(timestamps, X, Y)`` where
        * ``timestamps`` is a 1-D integer array taken from the first column,
        * ``X`` is the float feature matrix of shape (n_rows, n_features),
        * ``Y`` is a boolean matrix built from the columns between the
          features and the last column; it is True only where the stored
          value is greater than 0. NaN (missing) labels become False, so
          ``Y`` does not distinguish "missing" from "negative".
    """
    # Read the entire CSV body into a single numeric matrix. ndmin=2 keeps the
    # table two-dimensional even when the file holds a single data row
    # (loadtxt would otherwise return a 1-D array and the column slicing
    # below would raise IndexError).
    full_table = np.loadtxt(
        StringIO(csv_str.decode('utf-8')),
        delimiter=',',
        skiprows=1,
        ndmin=2,
    )

    # Timestamp is the primary key for the records (examples):
    timestamps = full_table[:, 0].astype(int)

    # Read the sensor features:
    X = full_table[:, 1:(n_features+1)]

    # Read the binary label values, and the 'missing label' indicators:
    trinary_labels_mat = full_table[:, (n_features+1):-1]  # This should have values of either 0., 1. or NaN
    M = np.isnan(trinary_labels_mat)  # M is the missing label matrix
    Y = np.where(M, 0, trinary_labels_mat) > 0.  # Y is the label matrix

    return timestamps, X, Y

def normalize_sensor_data(X):
    """Min-max scale sensor data into [0, 1] with 3 decimals of precision.

    Scaling is done per column (along axis 0). Compared with a plain
    ``(X - min) / (max - min)`` this version:

    * ignores NaN entries when computing min and max, so a single missing
      reading no longer turns the whole column into NaN;
    * maps a constant column (max == min) to 0.5 instead of dividing by zero.

    Missing readings stay NaN in the result, and a column that is entirely NaN
    stays entirely NaN.

    Parameters
    ----------
    X : array-like
        Sensor values; NaN marks a missing reading.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``, rounded to 3 decimals.
    """
    X = np.asarray(X, dtype=float)
    missing = np.isnan(X)

    # Swap NaNs for +inf / -inf so they can never win the min / max reduction
    col_min = np.where(missing, np.inf, X).min(axis=0)
    col_max = np.where(missing, -np.inf, X).max(axis=0)
    span = col_max - col_min

    # span is 0 for constant columns and -inf for all-NaN columns; neither can
    # be used as a divisor
    has_range = span > 0
    scaled = (X - col_min) / np.where(has_range, span, 1.0)

    X_norm = np.where(has_range, scaled, 0.5)
    # Restore missing readings that the 0.5 fill above may have overwritten
    X_norm[missing] = np.nan
    return np.round(X_norm, 3)

def prettify_label_name(label):
    """Return the human-readable display name for a raw label name.

    Names listed in the explicit mapping are translated directly. Any other
    name that still looks raw (contains an underscore or is entirely
    upper-case) is converted with a generic rule: underscores become spaces, a
    double underscore opens a parenthesis that a trailing underscore closes,
    and everything after the first character is lower-cased. This yields the
    spellings listed in the module-level ``prompt`` (e.g. "Phone on table",
    "Drinking (alcohol)").

    NOTE(review): the generic rule assumes unmapped raw names look like
    'PHONE_ON_TABLE' / 'DRINKING__ALCOHOL_' - confirm against the CSV header.

    Parameters
    ----------
    label : str
        Raw label name without the ``label:`` prefix.

    Returns
    -------
    str
        The display name. Names that already look presentable, and non-string
        inputs, are returned unchanged.
    """
    label_mapping = {
        'FIX_walking': 'Walking',
        'FIX_running': 'Running',
        'LOC_main_workplace': 'At main workplace',
        'OR_indoors': 'Indoors',
        'OR_outside': 'Outside',
        'LOC_home': 'At home',
        'FIX_restaurant': 'At a restaurant',
        'OR_exercise': 'Exercise',
        'LOC_beach': 'At the beach',
        'OR_standing': 'Standing',
        'WATCHING_TV': 'Watching TV'
    }
    if label in label_mapping:
        return label_mapping[label]

    # Leave alone anything that is not text, and text that already looks like
    # a display name (no underscores, not all upper-case). This also covers
    # the empty string and keeps the function idempotent on its own output.
    if not isinstance(label, str) or ('_' not in label and not label.isupper()):
        return label

    pretty = label
    # A trailing underscore closes the parenthesis opened by a double
    # underscore: 'DRINKING__ALCOHOL_' -> 'DRINKING__ALCOHOL)'
    if pretty.endswith('_') and '__' in pretty:
        pretty = pretty[:-1] + ')'
    pretty = pretty.replace('__', ' (').replace('_', ' ')

    # Sentence case: keep the first character, lower-case the rest
    pretty = pretty[0] + pretty[1:].lower()

    # 'I_M' ends up as ' i m '; the prompt spells it 'Im' (no apostrophe)
    return pretty.replace(' i m ', ' Im ')

def prettify_labels(label_names, Y, separator=" "):
    """Build one string of active, prettified label names per row of ``Y``.

    Parameters
    ----------
    label_names : sequence of bytes or str
        Raw label names, one per column of ``Y``. ``bytes`` entries are
        decoded as UTF-8; ``str`` entries are used as they are.
    Y : iterable of rows
        Each row holds one truthy/falsy flag per label.
    separator : str, optional
        Text placed between the active labels of a row. Defaults to a single
        space, which is the historical output format. Display names can
        themselves contain spaces, so pass e.g. ``", "`` when the labels must
        be separable again.

    Returns
    -------
    list of str
        One entry per row of ``Y``; an empty string when no label is active.
    """
    # Resolve the display names once up front instead of decoding and mapping
    # them again for every active cell of Y
    display_names = [
        prettify_label_name(name.decode('utf-8') if isinstance(name, bytes) else name)
        for name in label_names
    ]
    return [
        separator.join(name for name, active in zip(display_names, y) if active)
        for y in Y
    ]


# Instruction text that create_csv_per_user stores in the 'Instruction' column
# of every generated row. It describes the sensor input and then lists, after
# the '\n', the 51 quoted activity names that a response may contain.
# NOTE(review): "users activity" and "Im" are spelled without apostrophes; the
# text is emitted verbatim into the dataset, so it is left byte-for-byte as is.
prompt = 'You are provided with sensor readings that include high-frequency motion-reactive sensors (accelerometer, gyroscope, magnetometer, watch accelerometer), location services, audio, watch compass, phone state indicators, and additional low-frequency sensors sampled once per minute. The data may contain NaN values and each sensor value is normalized between 0 and 1. Based on these sensor readings, identify the users activity, which may include one or more of the following:\n"Phone on table", "Sitting", "Indoors", "At home", "Lying down", "Talking", "Sleeping", "At main workplace", "Phone in pocket", "Eating", "Watching TV", "Surfing the internet", "Standing", "Walking", "Outside", "With friends", "Phone in hand", "Computer work", "With co-workers", "Dressing", "Cooking", "Washing dishes", "On a bus", "Grooming", "Drive - Im the driver", "Toilet", "At school", "In a car", "Drinking (alcohol)", "In a meeting", "Drive - Im a passenger", "Bathing - shower", "Strolling", "Singing", "Shopping", "At a restaurant", "Doing laundry", "Running", "Exercise", "Stairs - going up", "Stairs - going down", "Bicycling", "Lab work", "In class", "Cleaning", "At a party", "At a bar", "At the beach", "At the gym", "Elevator", "Phone in bag".'

def create_csv_per_user(user_data_dir, output_dir):
    """Convert every ``*.csv.gz`` file in a directory to an instruction CSV.

    For each matching file the function parses the header and body,
    normalizes the sensor matrix (NaN-aware), formats each row as a bracketed
    string, builds the string of active labels per row, and writes a CSV with
    the columns ``Instruction`` (the module-level ``prompt``, repeated on
    every row), ``Input`` and ``Response``.

    Parameters
    ----------
    user_data_dir : str
        Directory that is scanned (non-recursively) for ``*.csv.gz`` files.
    output_dir : str
        Directory that receives one ``<name>.csv`` per input file, where
        ``<name>`` is the input file name without ``.csv.gz``. It is created
        if it does not exist yet.

    Returns
    -------
    None
        Each processed file name is printed as a progress indication.
    """
    suffix = '.csv.gz'

    # Create the destination up front, so that a missing folder does not make
    # the run fail only after the first (slow) file has been fully parsed.
    # An empty output_dir means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Iterate over each user file in the provided directory; sorted because
    # os.listdir returns entries in arbitrary order
    for user_file in sorted(os.listdir(user_data_dir)):
        if not user_file.endswith(suffix):
            continue
        print(user_file)

        # Read the compressed CSV file
        with gzip.open(os.path.join(user_data_dir, user_file), 'rb') as f:
            csv_str = f.read()

        # Parse the header and body of the CSV
        feature_names, label_names = parse_header_of_csv(csv_str)
        timestamps, X, Y = parse_body_of_csv(csv_str, len(feature_names))

        # Normalize sensor data, handling NaNs
        X_norm = normalize_sensor_data_with_nan(X)

        # Format the normalized data
        input_data = format_input_data(X_norm)

        # Prettify labels
        pretty_labels = prettify_labels(label_names, Y)

        # Combine input (formatted normalized sensor data) and response (pretty labels)
        data = {'Instruction': prompt, 'Input': input_data, 'Response': pretty_labels}
        df = pd.DataFrame(data)

        # Write to CSV, named after the input file without its '.csv.gz' suffix
        output_file = os.path.join(output_dir, f"{user_file[:-len(suffix)]}.csv")
        df.to_csv(output_file, index=False)

# Example usage:
# create_csv_per_user('/path/to/user_data', '/path/to/output_csvs')


In [27]:

# Convert every *.csv.gz file in the current directory; results go to ../processed/
create_csv_per_user(user_data_dir='./', output_dir='../processed/')

83CF687B-7CEC-434B-9FE8-00C3D5799BE6.features_labels.csv.gz
481F4DD2-7689-43B9-A2AA-C8772227162B.features_labels.csv.gz
7CE37510-56D0-4120-A1CF-0E23351428D2.features_labels.csv.gz
1DBB0F6F-1F81-4A50-9DF4-CD62ACFA4842.features_labels.csv.gz
E65577C1-8D5D-4F70-AF23-B3ADB9D3DBA3.features_labels.csv.gz
2C32C23E-E30C-498A-8DD2-0EFB9150A02E.features_labels.csv.gz
A7599A50-24AE-46A6-8EA6-2576F1011D81.features_labels.csv.gz
7D9BB102-A612-4E2A-8E22-3159752F55D8.features_labels.csv.gz
B09E373F-8A54-44C8-895B-0039390B859F.features_labels.csv.gz
27E04243-B138-4F40-A164-F40B60165CF3.features_labels.csv.gz
78A91A4E-4A51-4065-BDA7-94755F0BB3BB.features_labels.csv.gz
40E170A7-607B-4578-AF04-F021C3B0384A.features_labels.csv.gz
BE3CA5A6-A561-4BBD-B7C9-5DF6805400FC.features_labels.csv.gz
CDA3BBF7-6631-45E8-85BA-EEB416B32A3C.features_labels.csv.gz
B7F9D634-263E-4A97-87F9-6FFB4DDCB36C.features_labels.csv.gz
B9724848-C7E2-45F4-9B3F-A1F38D864495.features_labels.csv.gz
33A85C34-CFE4-4732-9E73-0A7AC861B27A.fea

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load all generated per-user CSV files from ../processed/ as one "train" split
fw = load_dataset(
    path="../processed/",
    split="train",
)


Downloading data: 100%|██████████| 60/60 [00:00<00:00, 511500.49files/s]
Generating train split: 377346 examples [00:03, 96589.20 examples/s]


In [9]:
# Show the dataset summary (column names and number of rows) as the cell output
fw

Dataset({
    features: ['Instruction', 'Input', 'Response'],
    num_rows: 377346
})