In [1]:
import os
import zipfile
import shutil

def unzip_and_archive(base_folder):
    archive_folder = os.path.join(base_folder, "archive")  # Archive folder

    # Create archive folder if it doesn't exist
    os.makedirs(archive_folder, exist_ok=True)

    for root, _, files in os.walk(base_folder):
        for file in files:
            if file.endswith('.zip'):
                zip_path = os.path.join(root, file)
                extract_folder = os.path.join(root, os.path.splitext(file)[0])  # Extract into a folder with same name
                
                if not os.path.exists(extract_folder):  # Avoid re-extracting if already done
                    os.makedirs(extract_folder, exist_ok=True)
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        zip_ref.extractall(extract_folder)
                    print(f"Extracted: {zip_path} -> {extract_folder}")

                # Move the zip file to the archive folder
                archive_path = os.path.join(archive_folder, file)
                shutil.move(zip_path, archive_path)
                print(f"Moved: {zip_path} -> {archive_path}")

base_folder = "D:\Documents\MASC\EMOV_DB"  # Change this to your target directory
unzip_and_archive(base_folder)


  base_folder = "D:\Documents\MASC\EMOV_DB"  # Change this to your target directory


Extracted: D:\Documents\MASC\EMOV_DB\bea\amused.zip -> D:\Documents\MASC\EMOV_DB\bea\amused
Moved: D:\Documents\MASC\EMOV_DB\bea\amused.zip -> D:\Documents\MASC\EMOV_DB\archive\amused.zip
Extracted: D:\Documents\MASC\EMOV_DB\bea\anger.zip -> D:\Documents\MASC\EMOV_DB\bea\anger
Moved: D:\Documents\MASC\EMOV_DB\bea\anger.zip -> D:\Documents\MASC\EMOV_DB\archive\anger.zip
Extracted: D:\Documents\MASC\EMOV_DB\bea\neutral.zip -> D:\Documents\MASC\EMOV_DB\bea\neutral
Moved: D:\Documents\MASC\EMOV_DB\bea\neutral.zip -> D:\Documents\MASC\EMOV_DB\archive\neutral.zip
Extracted: D:\Documents\MASC\EMOV_DB\bea\sleepiness.zip -> D:\Documents\MASC\EMOV_DB\bea\sleepiness
Moved: D:\Documents\MASC\EMOV_DB\bea\sleepiness.zip -> D:\Documents\MASC\EMOV_DB\archive\sleepiness.zip
Extracted: D:\Documents\MASC\EMOV_DB\jenie\amused.zip -> D:\Documents\MASC\EMOV_DB\jenie\amused
Moved: D:\Documents\MASC\EMOV_DB\jenie\amused.zip -> D:\Documents\MASC\EMOV_DB\archive\amused.zip
Extracted: D:\Documents\MASC\EMOV_DB\j

In [3]:
import os
import shutil

def organize_wav_files(base_folder, destination_folder):
    os.makedirs(destination_folder, exist_ok=True)  # Ensure the destination folder exists

    for root, _, files in os.walk(base_folder):
        # Extract speaker name from the first level subdirectory
        parts = os.path.relpath(root, base_folder).split(os.sep)
        if len(parts) < 1:
            continue  # Skip if no speaker folder found

        speaker = parts[0]  # The speaker name is the first folder

        for file in files:
            if file.endswith('.wav'):
                original_path = os.path.join(root, file)
                new_filename = f"{speaker}_{file}"
                new_path = os.path.join(destination_folder, new_filename)

                shutil.copy2(original_path, new_path)  # Copy with metadata
                print(f"Copied: {original_path} -> {new_path}")

base_folder = r"D:\Documents\MASC\EMOV_DB\Original Folders"  # Replace with the actual path
destination_folder = r"D:\Documents\MASC\EMOV_DB\New_Folders"  # Replace with your target directory
organize_wav_files(base_folder, destination_folder)


Copied: D:\Documents\MASC\EMOV_DB\Original Folders\bea\amused\amused\amused_1-15.wav -> D:\Documents\MASC\EMOV_DB\New_Folders\bea_amused_1-15.wav
Copied: D:\Documents\MASC\EMOV_DB\Original Folders\bea\amused\amused\amused_1-15_0001.wav -> D:\Documents\MASC\EMOV_DB\New_Folders\bea_amused_1-15_0001.wav
Copied: D:\Documents\MASC\EMOV_DB\Original Folders\bea\amused\amused\amused_1-15_0002.wav -> D:\Documents\MASC\EMOV_DB\New_Folders\bea_amused_1-15_0002.wav
Copied: D:\Documents\MASC\EMOV_DB\Original Folders\bea\amused\amused\amused_1-15_0003.wav -> D:\Documents\MASC\EMOV_DB\New_Folders\bea_amused_1-15_0003.wav
Copied: D:\Documents\MASC\EMOV_DB\Original Folders\bea\amused\amused\amused_1-15_0004.wav -> D:\Documents\MASC\EMOV_DB\New_Folders\bea_amused_1-15_0004.wav
Copied: D:\Documents\MASC\EMOV_DB\Original Folders\bea\amused\amused\amused_1-15_0005.wav -> D:\Documents\MASC\EMOV_DB\New_Folders\bea_amused_1-15_0005.wav
Copied: D:\Documents\MASC\EMOV_DB\Original Folders\bea\amused\amused\amuse

In [3]:
import os
import pandas as pd

# Define the first emotion mapping (emotion_id)
emotion_id_map = {
    "neutral": 0,
    "amused": 1,
    "anger": 3,
    "disgust": 5,
    "sleepiness": 9
}

# Define the second emotion mapping (EMOV_Labels)
emov_labels_map = {
    "neutral": 0,
    "amused": 1,
    "anger": 2,
    "disgust": 3,
    "sleepiness": 4
}

def generate_metadata(flat_directory, output_csv):
    metadata = []

    for file in os.listdir(flat_directory):
        if file.endswith(".wav"):
            parts = file.split("_")  # Split filename using underscores
            if len(parts) >= 3:  # Ensure there are enough parts
                emotion = parts[1]  # Extract emotion (3rd part in 0-based index)
                print(emotion)
                # Map to emotion_id and EMOV_Labels
                emotion_id = emotion_id_map.get(emotion, -1)  # Default to -1 if not found
                emov_label = emov_labels_map.get(emotion, -1)  # Default to -1 if not found

                metadata.append({
                    "file_name": file,
                    "emotion": emotion,
                    "emotion_id": emotion_id,
                    "EMOV_Labels": emov_label
                })

    # Convert to DataFrame and save as CSV
    df = pd.DataFrame(metadata)
    df.to_csv(output_csv, index=False)
    print(f"Metadata saved to {output_csv}")

# Set paths
flat_directory = r"D:\Documents\MASC\EMOV_DB\New_Folders" # Directory where all WAV files are copied
output_csv = os.path.join(flat_directory, "metadata2.csv")  # Save metadata in same directory

# Generate metadata
generate_metadata(flat_directory, output_csv)


amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused
amused

In [3]:
import pandas as pd

# Load the final metadata CSV
metadata = pd.read_csv(r"D:\Documents\MASC\EMOV_DB\New_Folders\metadata.csv")

# Define the first emotion mapping (emotion_id)
emotion_id_map = {
    "neutral": 0,
    "amused": 1,
    "anger": 3,
    "disgust": 5,
    "sleepiness": 9
}

# Define the second emotion mapping (EMOV_Labels)
emov_labels_map = {
    "neutral": 0,
    "amused": 1,
    "anger": 2,
    "disgust": 3,
    "sleepiness": 4
}

# Ensure 'emotion' column is lowercase for case insensitivity
metadata["emotion"] = metadata["emotion"].str.lower()

# Map emotions to respective IDs
metadata["emotion_id"] = metadata["emotion"].map(emotion_id_map)
metadata["EMOV_Labels"] = metadata["emotion"].map(emov_labels_map)

# Save the updated metadata CSV
metadata.to_csv(r"D:\Documents\MASC\EMOV_DB\New_Folders\metadata.csv", index=False)

print(r"Emotion mappings applied successfully and saved as D:\Documents\MASC\EMOV_DB\New_Folders\metadata.csv")


Emotion mappings applied successfully and saved as D:\Documents\MASC\EMOV_DB\New_Folders\metadata.csv


In [4]:
import pandas as pd

# Load the metadata CSV
flat_directory = r"D:\Documents\MASC\EMOV_DB\New_Folders" # Directory where all WAV files are copied

metadata = pd.read_csv(os.path.join(flat_directory, "metadata2.csv"))

# Load the transcript CSV (adjust file name as needed)
transcript_data = pd.read_csv(os.path.join(flat_directory, "metadata.csv"))

# Merge based on 'file_name', keeping only the transcript column from transcript_data
merged_data = metadata.merge(transcript_data[['file_name', 'transcript']], on="file_name", how="left")

# Save the updated metadata CSV
merged_data.to_csv((os.path.join(flat_directory, "metadata3.csv")), index=False)

print("Transcript column merged successfully into metadata.csv and saved as updated_metadata.csv")


Transcript column merged successfully into metadata.csv and saved as updated_metadata.csv


In [5]:
import pandas as pd

# Load the updated metadata CSV
metadata = pd.read_csv(os.path.join(flat_directory, "metadata.csv"))

# Extract the first three letters of file_name and create "speaker_name" column
metadata["speaker_name"] = metadata["file_name"].str[:3]

# Map unique speaker names to integers
speaker_mapping = {name: idx for idx, name in enumerate(metadata["speaker_name"].unique())}
metadata["speaker_ID"] = metadata["speaker_name"].map(speaker_mapping)

# Save the updated metadata with speaker info
metadata.to_csv(os.path.join(flat_directory, "metadata.csv"), index=False)

print("Speaker names and IDs added successfully. Saved as final_metadata.csv")


Speaker names and IDs added successfully. Saved as final_metadata.csv


In [1]:
from datasets import load_dataset


dirname = r"D:\Documents\MASC\EMOV_DB\New_Folders"

dataset = load_dataset("audiofolder", data_dir=dirname)
dataset.push_to_hub("EMOV_WAV")

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 6654/6654 [00:00<00:00, 10771.48files/s]
Generating train split: 6653 examples [00:00, 9156.32 examples/s] 
Map: 100%|██████████| 215/215 [00:04<00:00, 52.14 examples/s] ?it/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  1.60ba/s]
Map: 100%|██████████| 215/215 [00:03<00:00, 54.95 examples/s]:41, 71.39s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  1.60ba/s]
Map: 100%|██████████| 215/215 [00:03<00:00, 58.75 examples/s]:44, 69.83s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  2.03ba/s]
Map: 100%|██████████| 215/215 [00:04<00:00, 53.08 examples/s]:56, 66.29s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  1.77ba/s]
Map: 100%|██████████| 215/215 [00:04<00:00, 45.79 examples/s]:38, 65.85s/it]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:01<00:00,  1.68ba/s]
Map: 100%|██████████

CommitInfo(commit_url='https://huggingface.co/datasets/cairocode/EMOV_WAV/commit/017910a21a86a9d1ea326a0e43401b0e9745cdb4', commit_message='Upload dataset', commit_description='', oid='017910a21a86a9d1ea326a0e43401b0e9745cdb4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cairocode/EMOV_WAV', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cairocode/EMOV_WAV'), pr_revision=None, pr_num=None)