In [74]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob


In [75]:
# Function to extract information from the folder path
def extract_info(folder_path):
    parts = folder_path.split('/')
    sensor = parts[1]
    participant = parts[2]
    language_tone = parts[-2].split('_')
    language = language_tone[0]
    tone = language_tone[1]
    script = language_tone[2].replace('csv', "")
    return sensor, language, tone, participant, script

# Function to sample the data
def sample(data, sampling_rate):
    # Convert 'Time (s)' to TimedeltaIndex
    data['Time (s)'] = pd.to_timedelta(data['Time (s)'], unit='s')
    
    # Drop duplicate indices to make the index unique
    data = data.drop_duplicates(subset='Time (s)')
    
    # Set 'Time (s)' as the index
    data.set_index('Time (s)', inplace=True)
    #change the 'Sounf pressure level (dB)' column name to 'amplitude'
    data.rename(columns={'Sound pressure level (dB)': 'amplitude'}, inplace=True)

    # Resample the data to the given interval and calculate mean, max, min, and median
    df_resampled = data.resample(f'{sampling_rate}S').agg({'amplitude': ['mean', 'max', 'min', 'median', 'std']})

    # Create a new column for seconds only and rename columns
    df_resampled[f'time_{sampling_rate}'] = df_resampled.index.total_seconds()
    df_resampled.columns = ['_'.join(col).strip() for col in df_resampled.columns.values]
    #delete the _ character at the end of the time column
    df_resampled.rename(columns={f'time_{sampling_rate}_': f'time_{sampling_rate}'}, inplace=True)

    # Reset the index to drop the original time column
    df_resampled.reset_index(drop=True, inplace=True)

   # Calculate the total number of missing values in the original amplitude column
    num_rows = int(1 / sampling_rate) # Number of rows combined for resampling
    missing_values_original = data['amplitude'].isnull().astype(int)
    df_resampled['amplitude_missing'] = missing_values_original.groupby(data.index // pd.Timedelta(seconds=sampling_rate)).sum()


    return df_resampled

In [76]:
sampled_df_list = []

# Path where the Amplitudes.csv files are located
path = 'data/autocorrelations'

# Loop through all the Amplitudes.csv files
for folder_path in glob.glob(path + '/*/*/Autocorrelations.csv'):
    # Extract information from the folder path
    sensor, language, tone, participant, script = extract_info(folder_path)
    
    # Read the CSV file
    df = pd.read_csv(folder_path)

 
    # Sample the data
    sampled_df = sample(df, 5)
    # Add new columns
    sampled_df['sensor'] = sensor
    sampled_df['language'] = language
    sampled_df['tone'] = tone
    sampled_df['participant'] = participant
    sampled_df['script'] = script
    # Append the sampled dataframe to the list
    sampled_df_list.append(sampled_df)

# Merge all sampled dataframes
merged_sampled_df = pd.concat(sampled_df_list, ignore_index=True)

ValueError: No objects to concatenate

In [73]:
merged_sampled_df.to_csv('amplitude_5.csv', index=False)

In [46]:
merged_sampled_df.head()

Unnamed: 0_level_0,amplitude,amplitude,amplitude,amplitude,amplitude,time_2,sensor,language,tone,participant,script
Unnamed: 0_level_1,mean,max,min,median,std,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,-58.957404,-37.572976,-72.101199,-63.651947,11.563514,0.0,amplitudes,ch,casual,jack,t1
1,-44.40112,-32.319214,-63.679302,-43.422324,8.807858,2.0,amplitudes,ch,casual,jack,t1
2,-48.48565,-35.924024,-68.048485,-47.583204,11.981979,4.0,amplitudes,ch,casual,jack,t1
3,-43.263206,-35.03648,-61.156665,-42.789396,6.481694,6.0,amplitudes,ch,casual,jack,t1
4,-54.527828,-41.706485,-67.75106,-56.035887,9.13321,8.0,amplitudes,ch,casual,jack,t1
