In [2]:
import os
import time
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.backends.backend_agg import FigureCanvasAgg
from tqdm import tqdm
from PIL import Image
import parselmouth
import librosa
import soundfile as sf

# Define constants
DATASETS_PATH = r'D:\Documents\MASC\MSP_POD_dataset'

import numpy as np
import librosa
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from scipy.io import wavfile
from PIL import Image
import csv
from tqdm import tqdm
def clean_audio(path):
    y, sr = librosa.load(path)
    S_full, phase = librosa.magphase(librosa.stft(y))
    idx = slice(*librosa.time_to_frames([2, 6], sr=sr))
    width = int((S_full.shape[-1] - 1)/2)-1
    S_filter = librosa.decompose.nn_filter(S_full,
                                           aggregate=np.median,
                                           metric='cosine',
                                           width=width)
    S_filter = np.minimum(S_full, S_filter)
    margin_i, margin_v = 2, 10
    power = 2

    mask_i = librosa.util.softmask(S_filter,
                                   margin_i * (S_full - S_filter),
                                   power=power)

    mask_v = librosa.util.softmask(S_full - S_filter,
                                   margin_v * S_filter,
                                   power=power)

    S_foreground = mask_v * S_full

    sound = librosa.istft(S_foreground * phase)
    # sound = y
    # # # sf.write(os.path.join(new_dir,new_name), librosa.istft(S_foreground * phase), sr)
    return sound, sr
# Step 1: Create a Mel-Spectrogram
def create_mel_spectrogram(audio, sr):
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)
    return mel_spec

# Step 2: Extract syllables (this is a simplified approach)
def extract_syllables(mel_spec):
    energy = np.sum(mel_spec, axis=0)
    peaks, _ = find_peaks(energy, distance=20)  # Adjust distance as needed
    if len(peaks) < 2:
        # If less than 2 peaks found, use start and end of the spectrogram
        peaks = np.array([0, mel_spec.shape[1] - 1])
    return peaks

# Step 3: Extract formants for a specific time range
def extract_formants_for_syllable(audio, sr, start_time, end_time):
    sound = parselmouth.Sound(audio, sampling_frequency=sr)
    syllable_sound = sound.extract_part(from_time=start_time, to_time=end_time)
    formant = syllable_sound.to_formant_burg()
    return formant

# Step 4: Plot formants and spectrogram for each syllable
def plot_syllable(fname, image_dir,mel_spec, sr, syl_val, syllable_start, syllable_end, formant):
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(2.24, 2.24))
    
    # Plot spectrogram
    librosa.display.specshow(librosa.power_to_db(mel_spec[:, syllable_start:syllable_end], ref=np.max), ax=ax1)
    ax1.set_xticks([])
    ax1.set_yticks([])
    
    # Plot formants
    times = formant.xs()
    formant_values = []
    for t in times:
        for i in range(1, 4):
            value = formant.get_value_at_time(formant_number=i, time=t)
            if value is not None:
                formant_values.append(value)
    
    # Ensure times and formant_values have the same length
    times_repeated = np.repeat(times, 3)[:len(formant_values)]
    
    ax2.scatter(times_repeated, formant_values, s=1)
    ax2.set_ylim(0, 5000)
    ax2.set_xticks([])
    ax2.set_yticks([])
    jpname = f"{fname}_syl{syl_val}.png"
    save_path = os.path.join(image_dir, jpname)

    plt.tight_layout(pad=0)
    plt.savefig(save_path, dpi=100, bbox_inches='tight', pad_inches=0)
    plt.close()
    return jpname

# Main function to process the audio
def process_audio(audio_path, fname, image_dir):
    jpnames = []
    # Clean the audio
    audio, sr = clean_audio(audio_path)
    
    # Create mel spectrogram
    mel_spec = create_mel_spectrogram(audio, sr)
    
    # Extract syllables
    syllable_boundaries = extract_syllables(mel_spec)
    
    # Process each syllable

    for i in range(len(syllable_boundaries) - 1):
        start_frame = syllable_boundaries[i]
        end_frame = syllable_boundaries[i+1]
        
        # Convert frames to time
        start_time = librosa.frames_to_time(start_frame, sr=sr)
        end_time = librosa.frames_to_time(end_frame, sr=sr)
        
        # Extract formants for the syllable
        formant = extract_formants_for_syllable(audio, sr, start_time, end_time)
        
        # Plot the syllable
        jpname= plot_syllable(fname,image_dir, mel_spec, sr, i,start_frame, end_frame, formant)
        jpnames.append(jpname)
    return  jpnames 


def process_files(image_dir, csv_dir, ds=0):
    os.makedirs(image_dir, exist_ok=True)
    csv_file = os.path.join(csv_dir, 'file_labels.csv')

    existing_files = set()
    if os.path.exists(csv_file):
        with open(csv_file, 'r', newline='') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                existing_files.add(row['file_name'])



    for filename in os.listdir(csv_dir):
        if filename.endswith('.wav'):  # Check for .wav extension
            file_path = os.path.join(csv_dir, filename)
            if os.path.isfile(file_path):  # Ensure it's a file
                fname = os.path.basename(file_path)
                jpnames =  process_audio(file_path, fname, image_dir)


    # with open(csv_file, 'a', newline='') as csvfile:
    #     fieldnames = ['file_name', 'full_path', 'speaker', 'full_path','arousal', 'valence', '']
    #     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    #     if os.path.getsize(csv_file) == 0:
    #         writer.writeheader()

    #     total_new_images = 0
    #     for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing audio files"):
    #         file_path = row['filename']
    #         file_name = row['fname']
    #         label_val = row['label']
    #         speaker = row['speaker']
    #         arousal = row['act']
    #         valence = row['val']
    #         domination = row['']
            
    #         if ds == 0:
    #             fname = str(file_name.split('.wav')[0])
    #         else:
    #             indval = file_path.rfind('/', 0, file_path.rfind('/'))
    #             vid_utter = file_path[indval+1:]
    #             fname = vid_utter.replace("/", "_")
            
    #         jpnames =  process_audio(file_path, fname, image_dir)

    #         for jpname in jpnames:
    #             full__path = os.path.join(image_dir, jpname)
    #             writer.writerow({
    #                 'file_name': jpname,
    #                 'label': label_val,
    #                 'speaker': speaker,
    #                 'full_path': full__path,
    #                 'arousal': arousal,
    #                 'valence': valence,
    #                 'domination' : domination,
    #             })
    

def main():
    datasets = {
        1: ("CMU MOSEI", '/home/carol/Documents/Emo_rec/CSV_FILES/CMUmini_data.csv', '/media/carol/Data/DATASETS/SavedSets002/CMUmini'),
        2: ("CREMA-D", '/home/carol/Documents/Emo_rec/CSV_FILES/CREMA_data.csv', '/media/carol/Data/DATASETS/SavedSets002/CREMA'),
        3: ("EMOV-DB", '/home/carol/Documents/Emo_rec/CSV_FILES/EMOV_data.csv', '/media/carol/Data/DATASETS/SavedSets002/EMOV'),
        4: ("MSP IMPROV", '/media/carol/Data/Documents/Emo_rec/CSV_FILES/MSPIMPROV_data2.csv', '/media/carol/Data/DATASETS/SavedSets002/MSPI_SYL'),
        5: ("RAVDESS-DB", '/home/carol/Documents/Emo_rec/CSV_FILES/RAVDESS_data.csv', '/media/carol/Data/DATASETS/SavedSets002/Archive/RAVDESS'),
        6: ("TESS-DB", '/home/carol/Documents/Emo_rec/CSV_FILES/TESS_data.csv', '/media/carol/Data/DATASETS/SavedSets002/Archive/TESS'),
        7: ("VIVAE-DB", '/home/carol/Documents/Emo_rec/CSV_FILES/VIVAE_data.csv', '/media/carol/Data/DATASETS/SavedSets002/Archive/VIVAE'),
        8: ("IEMOCAP", '/media/carol/Data/Documents/Emo_rec/CSV_FILES/IEMOCAP_data_Full.csv', '/media/carol/Data/DATASETS/SavedSets002/IEMOCAP_SYL'),
        9: ("ASVP-ESD", '/home/carol/Documents/Emo_rec/CSV_FILES/ASVP_data.csv', '/media/carol/Data/DATASETS/SavedSets002/Archive/ASVP'),
        10: ("OMG", '/media/carol/Data/Documents/Emo_rec/CSV_FILES/OMG_Train.csv', '/media/carol/Data/DATASETS/SavedSets002/OMG_Train2'),
        11: ("MSP_POD", '/media/carol/Data/Documents/Emo_rec/CSV_FILES/MSPpod_data.csv', '/media/carol/Data/DATASETS/SavedSets002/MSP_POD_MEL'),
        12:("MSP_POD_2", r'D:\Documents\MASC\MSP_POD_dataset\Audios\Audios.tar\test', r'D:\Documents\MASC\MSP_POD_dataset\Images_Syllables_TEST')
    }

    print("Available datasets:")
    for key, (name, _, _) in datasets.items():
        print(f"{key}. {name}")

    dataset_choice = int(input("Enter the number of the dataset you would like to run: "))

    if dataset_choice in datasets:
        name, file_path, output_dir = datasets[dataset_choice]
        # print(f"--------------{name} DATASET STARTED ---- ")

        # df = pd.read_csv(file_path)
        # image_dir = os.path.join(output_dir, 'images')
        # os.makedirs(image_dir, exist_ok=True)
        # os.makedirs(output_dir, exist_ok=True)

        process_files(output_dir, file_path, dataset_choice)
    else:
        print("Invalid dataset choice.")

if __name__ == "__main__":
    main()

Available datasets:
1. CMU MOSEI
2. CREMA-D
3. EMOV-DB
4. MSP IMPROV
5. RAVDESS-DB
6. TESS-DB
7. VIVAE-DB
8. IEMOCAP
9. ASVP-ESD
10. OMG
11. MSP_POD
12. MSP_POD_2


In [3]:
import os
import pandas as pd

# Define paths
image_folder = r"D:\Documents\MASC\MSP_POD_dataset\Images_Syllables_TEST"  # Replace with the path to your folder of .png files
existing_csv_path = r"D:\Documents\MASC\MSP_POD_dataset\Audios\Audios.tar\test\metadata.csv"  # Replace with the path to your existing CSV
output_csv_path = os.path.join(image_folder, "metadata.csv") # Replace with the path to save the final CSV

# Load the existing CSV
existing_csv = pd.read_csv(existing_csv_path)

# Create a mapping from wav_filename to transcript
wav_to_transcript = dict(zip(existing_csv['file_name'], existing_csv['transcript']))

# List all .png files in the folder
png_files = [f for f in os.listdir(image_folder) if f.endswith('.png')]

# Prepare the final data
data = []
for png_file in png_files:
    # Extract the .wav file name from the .png file name
    wav_filename = png_file.split('.')[0] + '.wav'
    extracted_number = png_file.split('_')[-1].split('.')[0]
    # Get the mapped transcript
    transcript = wav_to_transcript.get(wav_filename, "Transcript not found")
    
    # Append to the data list
    data.append({
        "file_name": png_file,
        "file": wav_filename,
        "syllable": extracted_number,
        "Transcript": transcript
    })

# Create a DataFrame and save it as CSV
output_df = pd.DataFrame(data)
output_df.to_csv(output_csv_path, index=False)

print(f"CSV created successfully and saved to {output_csv_path}")


CSV created successfully and saved to D:\Documents\MASC\MSP_POD_dataset\Images_Syllables_TEST\metadata.csv


In [6]:
import datasets
from datasets import load_dataset

# Load the image folder
dataset = load_dataset("imagefolder", data_dir=image_folder)
dataset.push_to_hub("MSPP_TEST_SYL")

Downloading data: 100%|██████████| 26249/26249 [00:01<00:00, 14893.98files/s]
Generating test split: 26248 examples [00:02, 9892.92 examples/s] 
Map: 100%|██████████| 26248/26248 [01:56<00:00, 225.85 examples/s]]
Creating parquet from Arrow format: 100%|██████████| 263/263 [00:00<00:00, 310.40ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [03:53<00:00, 233.62s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/cairocode/MSPP_TEST_SYL/commit/0d038efa9bf2ae8fd96b2b2db5af342ef2e0f8dd', commit_message='Upload dataset', commit_description='', oid='0d038efa9bf2ae8fd96b2b2db5af342ef2e0f8dd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cairocode/MSPP_TEST_SYL', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cairocode/MSPP_TEST_SYL'), pr_revision=None, pr_num=None)

In [7]:
ds = load_dataset("cairocode/MSP_Pod_SYL6")

Generating train split: 100%|██████████| 333683/333683 [00:03<00:00, 88749.44 examples/s] 
Generating validation split: 100%|██████████| 83421/83421 [00:00<00:00, 90657.58 examples/s]
Generating test split: 100%|██████████| 83421/83421 [00:00<00:00, 93817.50 examples/s] 


In [9]:
from datasets import concatenate_datasets
combined_dataset = concatenate_datasets([ds['train'], ds['test'], ds['validation']])


In [10]:
import re
def extract_wav_and_syl(example):
    # Extract wav_filename using regex
    wav_match = re.search(r'(?<=Audios_)(.*?\.wav)', example['full_path'])
    syl_match = re.search(r'_(\d+)\.wav', example['full_path'])
    
    # Assign extracted values or None if no match
    example['wav_filename'] = wav_match.group(1) if wav_match else None
    example['syl_number'] = syl_match.group(1) if syl_match else None
    return example

# Map the function to the dataset
updated_dataset = combined_dataset.map(extract_wav_and_syl)

Map: 100%|██████████| 500525/500525 [00:37<00:00, 13463.33 examples/s]


In [11]:
updated_dataset[0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=224x223>,
 'label': 1,
 'speaker': 1,
 'full_path': 'C:\\Users\\Paolo\\Documents\\carol_emo_rec\\DATASETS\\Image_Sets\\MSP_POD_SYL\\images\\Audios_MSP-PODCAST_0003_0360.wav_syl1.png',
 'arousal': 6.076923,
 'valence': 5.846154,
 'wav_filename': 'MSP-PODCAST_0003_0360.wav',
 'syl_number': '0360'}

In [13]:
import pandas as pd

# Path to your CSV file
csv_path = r"D:\Documents\MASC\MSP_POD_dataset\Audios\Audios.tar\Audios\metadata.csv"

# Load CSV into a DataFrame
df = pd.read_csv(csv_path)

print(df.head())


                   file_name EmoClass  EmoAct  EmoVal  EmoDom  SpkrID  Gender  \
0  MSP-PODCAST_0002_0033.wav        N     4.8     4.2     5.4     127  Female   
1  MSP-PODCAST_0002_0039.wav        N     4.0     4.2     4.2     127  Female   
2  MSP-PODCAST_0002_0051.wav        N     4.0     4.2     4.2     127  Female   
3  MSP-PODCAST_0002_0059.wav        X     4.0     3.8     4.0     128  Female   
4  MSP-PODCAST_0002_0061.wav        F     3.4     2.8     4.2     128  Female   

     Split_Set                                         transcript  
0  Development               and i mean the numbers. right? so...  
1  Development  15 hundred or something. it talks about the tw...  
2  Development  [inaudible 00:03:13] so here it is. so it's pa...  
3  Development  no. it sounds like it could be like a bourne i...  
4  Development  yeah. so, but molly, i mean, this was really o...  


In [16]:
def add_csv_info(example):
    matching_row = df[df['file_name'] == example['wav_filename']]
    if not matching_row.empty:
        for col in df.columns:
            if col != 'file_name':  # Skip the file_name column
                value = matching_row.iloc[0][col]
                if pd.isnull(value):  # Handle missing values
                    example[col] = None
                elif isinstance(value, (int, float)):  # Keep numbers as is
                    example[col] = value
                else:  # Convert everything else to string
                    example[col] = str(value)
    else:
        for col in df.columns:
            if col != 'file_name':
                example[col] = None  # Default value for no match
    return example


# Map the function to the dataset
updated_dataset2 = updated_dataset.map(add_csv_info)

# Inspect the updated dataset
print(updated_dataset)


Map: 100%|██████████| 500525/500525 [43:57<00:00, 189.77 examples/s]

Dataset({
    features: ['image', 'label', 'speaker', 'full_path', 'arousal', 'valence', 'wav_filename', 'syl_number'],
    num_rows: 500525
})





In [17]:
print(updated_dataset2)

Dataset({
    features: ['image', 'label', 'speaker', 'full_path', 'arousal', 'valence', 'wav_filename', 'syl_number', 'EmoClass', 'EmoAct', 'EmoVal', 'EmoDom', 'SpkrID', 'Gender', 'Split_Set', 'transcript'],
    num_rows: 500525
})


In [18]:
updated_dataset2.push_to_hub("MSPP_SYL_FULL")

Map: 100%|██████████| 41711/41711 [00:02<00:00, 17457.57 examples/s]
Creating parquet from Arrow format: 100%|██████████| 418/418 [00:01<00:00, 220.30ba/s]
Map: 100%|██████████| 41711/41711 [00:02<00:00, 15252.48 examples/s]0.47s/it]
Creating parquet from Arrow format: 100%|██████████| 418/418 [00:02<00:00, 171.22ba/s]
Map: 100%|██████████| 41711/41711 [00:02<00:00, 16320.61 examples/s]6.61s/it]
Creating parquet from Arrow format: 100%|██████████| 418/418 [00:02<00:00, 183.10ba/s]
Map: 100%|██████████| 41711/41711 [00:01<00:00, 21977.98 examples/s]0.39s/it]
Creating parquet from Arrow format: 100%|██████████| 418/418 [00:01<00:00, 230.62ba/s]
Map: 100%|██████████| 41711/41711 [00:01<00:00, 31802.82 examples/s]1.20s/it]
Creating parquet from Arrow format: 100%|██████████| 418/418 [00:01<00:00, 253.54ba/s]
Map: 100%|██████████| 41710/41710 [00:01<00:00, 30953.27 examples/s]2.37s/it]
Creating parquet from Arrow format: 100%|██████████| 418/418 [00:01<00:00, 244.56ba/s]
Map: 100%|█████████

CommitInfo(commit_url='https://huggingface.co/datasets/cairocode/MSPP_SYL_FULL/commit/b1adc3eb44b29d60076a5f284d23c982036464a2', commit_message='Upload dataset', commit_description='', oid='b1adc3eb44b29d60076a5f284d23c982036464a2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cairocode/MSPP_SYL_FULL', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cairocode/MSPP_SYL_FULL'), pr_revision=None, pr_num=None)