In [15]:
import pandas as pd
import glob

def compute_features_for_30s(df, df_label_time):
    """Summarise the three signal channels over label-defined time windows.

    Each pair of consecutive values in ``df_label_time['start']`` defines a
    half-open window ``[start, next_start)``. Rows of ``df`` whose ``ts`` falls
    inside a window are reduced to mean / std / var / max / min for ``ch1``,
    ``ch2`` and ``ch3``. The last start value has no following boundary, so it
    never produces a row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ts``, ``ch1``, ``ch2`` and ``ch3``.
    df_label_time : pandas.DataFrame
        Must contain the columns ``start`` (window boundaries) and ``name``
        (label attached to the window that begins at that ``start``).

    Returns
    -------
    pandas.DataFrame
        One row per non-empty window with the columns ``ts`` (first ``ts``
        inside the window), ``<stat>_<channel>`` for every statistic/channel
        pair, and ``name``. Windows without samples are skipped after printing
        a warning.

    Raises
    ------
    ValueError
        If any required column is missing from either input frame.
    """
    channels = ['ch1', 'ch2', 'ch3']

    # Validate the signal frame; columns are checked in the order ts, ch1, ch2, ch3
    for needed in ['ts'] + channels:
        if needed not in df.columns:
            raise ValueError(f"Column '{needed}' not found in the DataFrame.")

    # Validate the label frame
    if not {'start', 'name'}.issubset(df_label_time.columns):
        raise ValueError("Columns 'start' or 'name' not found in the df_label_time DataFrame.")

    boundaries = df_label_time['start'].tolist()
    labels = df_label_time['name'].tolist()

    rows = []
    # Pair each boundary with its successor; zip stops before the final start value
    for begin, end, label in zip(boundaries, boundaries[1:], labels):
        in_window = (df['ts'] >= begin) & (df['ts'] < end)
        window = df.loc[in_window]

        # Nothing recorded in this window: report it and move on
        if window.empty:
            print(f"Warning: No data found for start time {begin} to {end}. Skipping...")
            continue

        signals = window[channels]
        # Insertion order here fixes the column order of the result
        stats = {
            'mean': signals.mean(),
            'std': signals.std(),
            'var': signals.var(),
            'max': signals.max(),
            'min': signals.min(),
        }

        row = {'ts': window['ts'].iloc[0]}
        for stat_name, per_channel in stats.items():
            for channel in channels:
                row[f'{stat_name}_{channel}'] = per_channel[channel]
        row['name'] = label

        rows.append(row)

    return pd.DataFrame(rows)

def combine_features_from_files(file_list, staging_file_list):
    """Compute window features for every (signal, staging) file pair and stack them.

    Parameters
    ----------
    file_list : sequence of str
        Paths of the signal CSV files; each is read with ``pd.read_csv`` and
        passed to ``compute_features_for_30s`` as ``df``.
    staging_file_list : sequence of str
        Paths of the staging CSV files, in the same order as ``file_list``;
        each is passed to ``compute_features_for_30s`` as ``df_label_time``.

    Returns
    -------
    pandas.DataFrame
        The per-file feature frames concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If the two lists differ in length or are empty.
    """
    import os  # local import so this cell does not depend on the notebook's import cell

    file_list = list(file_list)
    staging_file_list = list(staging_file_list)

    # zip() stops at the shorter list, which would silently drop recordings
    if len(file_list) != len(staging_file_list):
        raise ValueError(
            "file_list and staging_file_list must have the same length "
            f"(got {len(file_list)} and {len(staging_file_list)})."
        )
    # pd.concat([]) fails with an opaque message; report the real cause instead
    if not file_list:
        raise ValueError("No input files were provided; nothing to combine.")

    combined_features_list = []

    for file, staging_file in zip(file_list, staging_file_list):
        df = pd.read_csv(file)
        df_label_time = pd.read_csv(staging_file)

        # Report the number of rows read from each signal file
        print(f"Length of {os.path.basename(file)}: {len(df)} rows")

        # Compute features for the windows defined by the staging file's start times
        features_30s = compute_features_for_30s(df, df_label_time)
        combined_features_list.append(features_30s)

    # Stack all per-file results into one frame
    combined_features = pd.concat(combined_features_list, ignore_index=True)

    return combined_features


# Collect every *_acc.csv and *_staging.csv file in the data directory, sorted by name
path = '/home/dgurve/scratch/Muse_synced_csv/'
acc_files = sorted(glob.glob(path + '*_acc.csv'))
staging_files = sorted(glob.glob(path + '*_staging.csv'))

# Drop the suffixes so that files belonging together share an identical prefix
acc_prefixes = [acc_file.partition('_acc.csv')[0] for acc_file in acc_files]
staging_prefixes = [staging_file.partition('_staging.csv')[0] for staging_file in staging_files]

# The two sorted listings must line up one-to-one before they are paired
if not (len(acc_files) == len(staging_files) and acc_prefixes == staging_prefixes):
    raise ValueError("Mismatch in number of _acc and _staging files or their prefixes")

# Build one feature table covering all file pairs
combined_acc_features = combine_features_from_files(acc_files, staging_files)

# Report the size of the combined table
print(f"Length of combined features file: {len(combined_acc_features)} rows")

#combined_acc_features



Length of 2020-07-21T210319-0400_5007-ELYP-1F41_synced_acc.csv: 1450599 rows
Length of 2020-07-23T205958-0400_5007-7WNR-1FDA_synced_acc.csv: 1310558 rows
Length of 2020-08-08T215405-0400_5007-7WNR-1FDA_synced_acc.csv: 1437655 rows
Length of 2020-08-10T211244-0400_5007-ELYP-1F41_synced_acc.csv: 1390677 rows
Length of 2020-08-12T210635-0400_5007-7WNR-1FDA_synced_acc.csv: 518465 rows
Length of 2020-08-13T220438-0400_5007-ELYP-1F41_synced_acc.csv: 1558318 rows
Length of 2020-08-15T212307-0400_5007-7WNR-1FDA_synced_acc.csv: 1558478 rows
Length of 2020-08-17T211411-0400_5007-ELYP-1F41_synced_acc.csv: 1419898 rows
Length of 2020-08-20T211218-0400_5007-ELYP-1F41_synced_acc.csv: 1524371 rows
Length of 2020-08-24T215137-0400_5007-7WNR-1FDA_synced_acc.csv: 1416148 rows
Length of 2020-09-04T223300-0400_5007-7WNR-1FDA_synced_acc.csv: 1300015 rows
Length of 2020-09-09T204008-0400_5007-ELYP-1F41_synced_acc.csv: 968129 rows
Length of 2020-09-11T211719-0400_5007-E

In [18]:
# Write the combined feature table to CSV without the index column.
# NOTE: this rebinds `path`, which the feature-extraction cells also assign.
path = '/home/dgurve/SRI/MUSE/'
combined_acc_features.to_csv(
    path + 'combined_acc_features.csv',
    index=False,
)


Extract features from only the first n files

In [13]:
import pandas as pd
import glob

def compute_features_for_30s(df, df_label_time):
    """Compute summary statistics of ch1/ch2/ch3 for each label-defined window.

    Consecutive values of ``df_label_time['start']`` delimit half-open windows
    ``[start_i, start_{i+1})``. For every window that contains at least one row
    of ``df`` the mean, standard deviation, variance, maximum and minimum of
    each channel are computed. The final start value has no successor and
    therefore yields no row.

    Parameters
    ----------
    df : pandas.DataFrame
        Signal data with the columns ``ts``, ``ch1``, ``ch2`` and ``ch3``.
    df_label_time : pandas.DataFrame
        Label data with the columns ``start`` and ``name``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty window: ``ts`` (first timestamp in the window),
        the fifteen ``<stat>_<channel>`` columns, and ``name`` (label of the
        window's start row).

    Raises
    ------
    ValueError
        If a required column is missing from ``df`` or ``df_label_time``.
    """
    # Check if required columns are present in the input dataframe
    required_columns = ['ts', 'ch1', 'ch2', 'ch3']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in the DataFrame.")

    # Check if 'start' and 'name' columns are present in df_label_time
    if 'start' not in df_label_time.columns or 'name' not in df_label_time.columns:
        raise ValueError("Columns 'start' or 'name' not found in the df_label_time DataFrame.")

    channels = ['ch1', 'ch2', 'ch3']
    feature_list = []

    # Use the 'start' column from df_label_time to determine the chunks
    start_times = df_label_time['start'].tolist()
    names = df_label_time['name'].tolist()

    for i in range(len(start_times) - 1):
        chunk = df[(df['ts'] >= start_times[i]) & (df['ts'] < start_times[i + 1])]

        # A window with no samples would make chunk['ts'].iloc[0] raise
        # IndexError below, so skip it with a warning instead of crashing.
        if chunk.empty:
            print(f"Warning: No data found for start time {start_times[i]} to {start_times[i + 1]}. Skipping...")
            continue

        # Compute features
        signals = chunk[channels]
        means = signals.mean()
        stds = signals.std()
        variances = signals.var()  # named to avoid shadowing the builtin vars()
        maxs = signals.max()
        mins = signals.min()

        # Store features in a dictionary
        features = {
            'ts': chunk['ts'].iloc[0],
            'mean_ch1': means['ch1'], 'mean_ch2': means['ch2'], 'mean_ch3': means['ch3'],
            'std_ch1': stds['ch1'], 'std_ch2': stds['ch2'], 'std_ch3': stds['ch3'],
            'var_ch1': variances['ch1'], 'var_ch2': variances['ch2'], 'var_ch3': variances['ch3'],
            'max_ch1': maxs['ch1'], 'max_ch2': maxs['ch2'], 'max_ch3': maxs['ch3'],
            'min_ch1': mins['ch1'], 'min_ch2': mins['ch2'], 'min_ch3': mins['ch3'],
        }

        # Add the label 'name' for this chunk
        features['name'] = names[i]

        feature_list.append(features)

    # Convert the list of dictionaries into a dataframe
    features_df = pd.DataFrame(feature_list)

    return features_df

def combine_features_from_files(file_list, staging_file_list):
    """Compute window features for every (signal, staging) file pair and stack them.

    Parameters
    ----------
    file_list : sequence of str
        Paths of the signal CSV files; each is read with ``pd.read_csv`` and
        passed to ``compute_features_for_30s`` as ``df``.
    staging_file_list : sequence of str
        Paths of the staging CSV files, in the same order as ``file_list``;
        each is passed to ``compute_features_for_30s`` as ``df_label_time``.

    Returns
    -------
    pandas.DataFrame
        The per-file feature frames concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If the two lists differ in length or are empty.
    """
    import os  # local import so this cell does not depend on the notebook's import cell

    file_list = list(file_list)
    staging_file_list = list(staging_file_list)

    # zip() stops at the shorter list, which would silently drop recordings
    if len(file_list) != len(staging_file_list):
        raise ValueError(
            "file_list and staging_file_list must have the same length "
            f"(got {len(file_list)} and {len(staging_file_list)})."
        )
    # pd.concat([]) fails with an opaque message; report the real cause instead
    if not file_list:
        raise ValueError("No input files were provided; nothing to combine.")

    combined_features_list = []

    for file, staging_file in zip(file_list, staging_file_list):
        df = pd.read_csv(file)
        df_label_time = pd.read_csv(staging_file)

        # Report the number of rows read from each signal file
        print(f"Length of {os.path.basename(file)}: {len(df)} rows")

        # Compute features for the windows defined by the staging file's start times
        features_30s = compute_features_for_30s(df, df_label_time)
        combined_features_list.append(features_30s)

    # Stack all per-file results into one frame
    combined_features = pd.concat(combined_features_list, ignore_index=True)

    return combined_features


# How many file pairs to process; adjust as needed
n = 2

# Take the first n *_acc.csv and *_staging.csv files (by sorted name) from the data directory
path = '/home/dgurve/scratch/Muse_synced_csv/'
acc_files = sorted(glob.glob(path + '*_acc.csv'))[:n]
staging_files = sorted(glob.glob(path + '*_staging.csv'))[:n]

# Drop the suffixes so that files belonging together share an identical prefix
acc_prefixes = [acc_file.partition('_acc.csv')[0] for acc_file in acc_files]
staging_prefixes = [staging_file.partition('_staging.csv')[0] for staging_file in staging_files]

# The two selections must line up one-to-one before they are paired
if not (len(acc_files) == len(staging_files) and acc_prefixes == staging_prefixes):
    raise ValueError("Mismatch in number of _acc and _staging files or their prefixes")

# Build one feature table covering the selected file pairs
combined_acc_features = combine_features_from_files(acc_files, staging_files)

# Report the size of the combined table
print(f"Length of combined features file: {len(combined_acc_features)} rows")

# Display the resulting table as the cell output
combined_acc_features



Length of 2020-07-21T210319-0400_5007-ELYP-1F41_synced_acc.csv: 1450599 rows
Length of 2020-07-23T205958-0400_5007-7WNR-1FDA_synced_acc.csv: 1310558 rows
Length of combined features file: 1801 rows


Unnamed: 0,ts,mean_ch1,mean_ch2,mean_ch3,std_ch1,std_ch2,std_ch3,var_ch1,var_ch2,var_ch3,max_ch1,max_ch2,max_ch3,min_ch1,min_ch2,min_ch3,name
0,1595383586,-0.940744,0.235863,0.130572,0.001974,0.002763,0.003248,0.000004,0.000008,0.000011,-0.934021,0.244263,0.144104,-0.948120,0.226440,0.117981,2
1,1595383616,-0.940632,0.235279,0.132062,0.002029,0.002960,0.003115,0.000004,0.000009,0.000010,-0.933777,0.252197,0.145142,-0.947327,0.220215,0.117126,2
2,1595383646,-0.940535,0.235567,0.133202,0.002018,0.002703,0.003093,0.000004,0.000007,0.000010,-0.934204,0.247009,0.147034,-0.948242,0.224487,0.120117,3
3,1595383676,-0.940311,0.235519,0.133878,0.002020,0.002773,0.003201,0.000004,0.000008,0.000010,-0.933105,0.244995,0.148193,-0.946716,0.224976,0.120972,3
4,1595383706,-0.940272,0.235398,0.134502,0.001960,0.002853,0.002996,0.000004,0.000008,0.000009,-0.933533,0.246033,0.150024,-0.949585,0.224426,0.123352,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1796,1595584547,-0.751157,-0.578162,-0.182820,0.001856,0.002149,0.003026,0.000003,0.000005,0.000009,-0.745056,-0.571411,-0.170105,-0.757935,-0.586609,-0.191589,1
1797,1595584577,-0.752132,-0.577720,-0.179930,0.001995,0.002164,0.003469,0.000004,0.000005,0.000012,-0.745789,-0.571106,-0.164490,-0.759521,-0.584656,-0.189758,1
1798,1595584607,-0.754658,-0.574828,-0.179670,0.004087,0.003967,0.007205,0.000017,0.000016,0.000052,-0.738892,-0.564819,-0.162231,-0.764709,-0.597595,-0.218811,1
1799,1595584637,-0.760625,-0.568370,-0.173597,0.002111,0.002320,0.003909,0.000004,0.000005,0.000015,-0.752869,-0.561584,-0.158508,-0.766907,-0.575928,-0.185852,1
