In [42]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import glob


In [64]:
# Function to extract information from the folder path
def extract_info(folder_path):
    parts = folder_path.split('/')
    sensor = parts[1]
    participant = parts[2]
    language_tone = parts[-2].split('_')
    language = language_tone[0]
    tone = language_tone[1]
    script = language_tone[2].replace('csv', "")
    return sensor, language, tone, participant, script

# Function to sample the data
def sample(data, sampling_rate):
    # Convert 'Time (s)' to TimedeltaIndex
    data['Time (s)'] = pd.to_timedelta(data['Time (s)'], unit='s')
    
    # Drop duplicate indices to make the index unique
    data = data.drop_duplicates(subset='Time (s)')
    
    # Set 'Time (s)' as the index
    data.set_index('Time (s)', inplace=True)
    
    # Rename the 'Sound pressure level (dB)' column to 'amplitude'
    data.rename(columns={'Sound pressure level (dB)': 'amplitude'}, inplace=True)
    
    # Resample the data to the given interval and calculate mean, max, min, and median
    df_resampled = data.resample(f'{sampling_rate}S').agg({'amplitude': ['mean', 'max', 'min', 'median']})
    
    # Calculate standard deviation using a sliding window
    data['amplitude_std'] = data['amplitude'].rolling(window=int(1/sampling_rate), min_periods=1).std()

    # Resample the standard deviation separately
    df_std_resampled = data.resample(f'{sampling_rate}S').agg({'amplitude_std': 'mean'})
    
    # Combine the standard deviation with other resampled data
    df_resampled = df_resampled.join(df_std_resampled)
    
    # Create a new column for seconds only
    df_resampled[f'time_{sampling_rate}'] = df_resampled.index.total_seconds()
    
    # Flatten the MultiIndex columns
    df_resampled.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df_resampled.columns.values]
    
    # Rename the columns to remove trailing underscores and correct names
    df_resampled = df_resampled.rename(columns={
        'amplitude_mean': 'amplitude_mean',
        'amplitude_max': 'amplitude_max',
        'amplitude_min': 'amplitude_min',
        'amplitude_median': 'amplitude_median',
        'amplitude_std': 'amplitude_std',
        f'time_{sampling_rate}_': f'time_{sampling_rate}'
    })
    
    # Reset the index to drop the original time column
    df_resampled.reset_index(drop=True, inplace=True)

    # Calculate the total number of missing values in the original amplitude column
    missing_values_original = data['amplitude'].isnull().astype(int)
    df_resampled['amplitude_missing'] = missing_values_original.groupby(data.index // pd.Timedelta(seconds=sampling_rate)).sum()

    return df_resampled

In [65]:
sampled_df_list = []

# Path where the Amplitudes.csv files are located
path = 'data/amplitudes'

# Loop through all the Amplitudes.csv files excluding 'duru' and 'nga' folders
for folder_path in glob.glob(path + '/*/*/Amplitudes.csv'):
    if 'data/amplitudes/duru/' in folder_path or 'data/amplitudes/nga/' in folder_path:
        continue  # Skip processing for these folders
    # Extract information from the folder path
    sensor, language, tone, participant, script = extract_info(folder_path)
    df = pd.read_csv(folder_path)
 
    # Sample the data
    sampled_df = sample(df, 0.1)
    # Add new columns
    sampled_df['sensor'] = sensor
    sampled_df['language'] = language
    sampled_df['tone'] = tone
    sampled_df['participant'] = participant
    sampled_df['script'] = script
    # Append the sampled dataframe to the list
    sampled_df_list.append(sampled_df)

# Merge all sampled dataframes
merged_sampled_df = pd.concat(sampled_df_list, ignore_index=True)

  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resampled)
  df_resampled = df_resampled.join(df_std_resa

In [66]:
#sort the dataframe by participant and script
merged_sampled_df.sort_values(by=['participant', 'script'], inplace=True)


In [67]:
merged_sampled_df.to_csv('amplitude_0.1.csv', index=False)

In [68]:
languages = merged_sampled_df['language'].unique()
print(languages)

['ch' 'en' 'ru' 'tr']
