# Dataset exploration

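**Prereqs:** `conda activate ml_py310`, then `pip install seaborn librosa ipywidgets`. Optionally `pip install python-dotenv` to load `DCASE_DATA` (the dataset root; defaults to `Data/Dcase`) from a `.env` file.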


In [None]:
from pathlib import Path
import os

# Optionally populate os.environ from a .env file; python-dotenv is not required.
try:
    from dotenv import load_dotenv
except ImportError:
    pass  # package missing: rely on whatever is already in os.environ
else:
    load_dotenv()

# Dataset root: value of DCASE_DATA when set, otherwise the relative default.
DATA_DIR = Path(os.environ.get("DCASE_DATA", "Data/Dcase"))


In [None]:
# Dataset root, read from the DCASE_DATA environment variable with a relative
# fallback. This repeats the assignment from the setup cell, so re-running this
# cell alone refreshes DATA_DIR (it needs `Path` and `os` to be imported already).
DATA_DIR = Path(os.getenv("DCASE_DATA", "Data/Dcase"))


In [None]:
from pathlib import Path

def print_directory_tree(start_path, show_files=True):
    """
    Print the directory tree rooted at `start_path`, similar to the `tree` command.

    The root path is printed first, followed by one line per entry using
    box-drawing connectors, and finally a summary line with the number of
    directories (and files, when `show_files` is true) found below the root.
    Within each directory, sub-directories are listed before files and both
    groups are ordered by name.

    Args:
        start_path (str or Path): The root directory path to visualize.
        show_files (bool): Whether to list (and count) files in directories.
    Returns:
        None
    """
    start_path = Path(start_path)

    # Counters shared with the recursive helper below.
    file_count = 0
    dir_count = 0

    def _walk(path, prefix=""):
        nonlocal file_count, dir_count
        try:
            children = list(path.iterdir())
        except OSError as exc:
            # An unreadable directory should not abort the whole listing.
            print(f"{prefix}└── [Error: {exc.strerror or exc}]")
            return

        # Query the file system once per entry instead of once per use.
        entries = [(child, child.is_dir()) for child in children]
        if not show_files:
            entries = [entry for entry in entries if entry[1]]

        # Sort: directories first, then files, each group by name.
        entries.sort(key=lambda entry: (not entry[1], entry[0].name))

        last_index = len(entries) - 1
        for index, (child, child_is_dir) in enumerate(entries):
            is_last = index == last_index
            connector = "└── " if is_last else "├── "
            print(f"{prefix}{connector}{child.name}")

            if child_is_dir:
                dir_count += 1
                # Keep the vertical guide only while siblings still follow.
                _walk(child, prefix + ("    " if is_last else "│   "))
            elif show_files:
                file_count += 1

    print(start_path)
    if not start_path.exists():
        print("  [Error: Path does not exist]")
        return
    if not start_path.is_dir():
        # iterdir() would raise NotADirectoryError on a regular file.
        print("  [Error: Path is not a directory]")
        return

    _walk(start_path)

    # Print summary
    if show_files:
        print(f"\n{dir_count} directories, {file_count} files")
    else:
        print(f"\n{dir_count} directories")

# Show only the folder hierarchy of the dataset; individual files are left out.
print_directory_tree(DATA_DIR, show_files=False)

In [None]:
import os
from collections import Counter


# Count the number of .wav files per class.
# The class is the first folder below DATA_DIR, so files stored in nested split
# folders (DATA_DIR/<class>/<split>/*.wav) are attributed to their class rather
# than to the split folder that directly contains them.
class_counts = Counter()
for root, _dirs, files in os.walk(DATA_DIR):
    rel_root = os.path.relpath(root, DATA_DIR)
    if rel_root == os.curdir:
        continue  # files lying directly in DATA_DIR belong to no class folder
    n_wav = sum(file.endswith('.wav') for file in files)
    if n_wav:
        class_counts[rel_root.split(os.sep)[0]] += n_wav

print(class_counts.most_common())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart with the file counts on the x-axis and the class names on the y-axis.
class_labels = list(class_counts.keys())
file_totals = list(class_counts.values())
ax = sns.barplot(x=file_totals, y=class_labels)
ax.set_title('Number of Audio Files Per Class')
ax.set_xlabel('Count')
ax.set_ylabel('Class')
plt.show()

# Listen to one example

In [None]:
from IPython.display import Audio
import random


# Pick one random recording of the chosen machine/split and play it inline.
machine = "ToyCar"
split = "train"
wav_dir = Path(DATA_DIR, machine, split)
wav_list = sorted(wav_dir.glob("*.wav"))
if not wav_list:
    # random.choice() on an empty list raises an IndexError that does not name
    # the folder; fail with a message that points at the actual problem.
    raise FileNotFoundError(f"No .wav files found in {wav_dir}")
sample_path = random.choice(wav_list)
class_name = machine

print(f"Class: {class_name}")
display(Audio(sample_path))

In [None]:
import librosa
import matplotlib.pyplot as plt

def plot_waveform(file_path):
    """
    Plot the raw samples of an audio file.

    The file is loaded with `sr=None` and amplitude is drawn against the
    sample index. The figure title is the file's base name.

    Args:
        file_path (str or Path): Audio file to load and plot.
    Returns:
        None
    """
    samples, _ = librosa.load(file_path, sr=None)  # sample rate is not needed here

    plt.figure(figsize=(14, 4))
    plt.plot(samples)
    plt.title(os.path.basename(file_path))
    plt.xlabel("Sample")
    plt.ylabel("Amplitude")
    plt.show()

# Waveform of the randomly chosen sample from the listening cell.
plot_waveform(sample_path)

In [None]:
def plot_mfcc(file_path):
    """
    Compute 13 MFCCs for an audio file and display them as a heat map over time.

    Args:
        file_path (str or Path): Audio file to load and analyse.
    Returns:
        None
    """
    # Imported explicitly because some librosa releases do not expose the
    # `display` submodule through a bare `import librosa`.
    import librosa.display

    signal, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)

    plt.figure(figsize=(10, 4))
    # Pass the sample rate the file was loaded with: without it specshow falls
    # back to its own default rate and the time axis is scaled incorrectly.
    librosa.display.specshow(mfccs, sr=sr, x_axis='time')
    plt.colorbar()
    plt.title('MFCC')
    plt.tight_layout()
    plt.show()

# MFCC heat map of the same sample.
plot_mfcc(sample_path)

# Stats on files

In [None]:
import os
import soundfile as sf
import pandas as pd


# One record per readable .wav file: its class, split and duration in seconds.
durations = []

# Walk through each class folder
for class_folder in sorted(os.listdir(DATA_DIR)):
    class_path = os.path.join(DATA_DIR, class_folder)
    if not os.path.isdir(class_path):
        continue

    for split in ['train', 'test']:
        split_path = os.path.join(class_path, split)
        # isdir (not exists) so a stray file named like a split is skipped
        # instead of crashing os.listdir.
        if not os.path.isdir(split_path):
            continue

        # Sorted so the record order does not depend on the file system.
        for file in sorted(os.listdir(split_path)):
            if not file.endswith('.wav'):
                continue
            file_path = os.path.join(split_path, file)
            try:
                # The file's metadata is enough to compute the duration;
                # there is no need to load every sample into memory.
                info = sf.info(file_path)
                duration = info.frames / info.samplerate
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error reading {file_path}: {e}")
                continue
            durations.append({
                'class': class_folder,
                'split': split,
                'duration': duration
            })

In [None]:
# One row per audio file with its class, split and duration in seconds.
df_durations = pd.DataFrame(durations)

# Mean, standard deviation and file count of the durations per (class, split).
duration_stats = df_durations.groupby(['class', 'split'])['duration'].agg(['mean', 'std', 'count'])
print(duration_stats)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of file durations: one group per class, one box per split.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df_durations, x='class', y='duration', hue='split', ax=ax)
ax.set_title('Audio File Duration Distribution per Class and Split')
ax.set_xlabel('Class')
ax.set_ylabel('Duration (seconds)')
ax.legend(title='Split')
plt.xticks(rotation=45)  # tilt the class names on the x-axis
fig.tight_layout()
plt.show()

In [None]:
# Same per-(class, split) statistics as a flat table: `class` and `split`
# become ordinary columns instead of index levels.
summary = (
    df_durations
    .groupby(['class', 'split'])['duration']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)
print(summary)

In [None]:
# For every (class, split) group, report how many distinct durations occur
# after rounding to two decimals, next to the number of files in the group.
print("\nUnique durations per class and split:")
groups_by_class_split = df_durations.groupby(['class', 'split'])
for (cls, split), group in groups_by_class_split:
    distinct_count = group['duration'].round(2).nunique()
    print(f"{cls} - {split}: {distinct_count} unique durations out of {len(group)} files")

In [None]:
import os
import librosa
import numpy as np

def extract_mfcc_features(dataset_path, n_mfcc=13, sr_target=None, verbose=True):
    """
    Traverse DCASE-style dataset structure and extract MFCC features.

    Every `.wav` file is loaded, its MFCCs are computed and averaged over
    time, giving one feature vector per file. Classes and files are visited
    in sorted order, so the row order of the result does not depend on the
    file system. Files that fail to load or process are skipped.

    Folder structure expected:
        dataset_path/
            class1/
                train/
                    *.wav
                test/
                    *.wav
            class2/
                train/
                    *.wav
                test/
                    *.wav
            ...

    Args:
        dataset_path (str or Path): Path to root of dataset
        n_mfcc (int): Number of MFCC coefficients to extract
        sr_target (int or None): Target sample rate. If None, uses native rate.
        verbose (bool): Whether to print progress and per-file errors

    Returns:
        features (np.ndarray): Array of shape (n_samples, n_mfcc) containing MFCC features;
            shape (0, n_mfcc) when no file could be processed
        labels (list): List of corresponding class names
        splits (list): List indicating 'train' or 'test' for each file
    """
    features = []
    labels = []
    splits = []

    # Class folders are the sub-directories directly below the dataset root.
    class_names = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    for class_name in class_names:
        class_dir = os.path.join(dataset_path, class_name)

        for split in ('train', 'test'):
            split_dir = os.path.join(class_dir, split)

            # isdir (not exists): a stray file named like a split is treated
            # as a missing folder instead of crashing os.listdir.
            if not os.path.isdir(split_dir):
                if verbose:
                    print(f"Missing {split} folder in {class_name}")
                continue

            if verbose:
                print(f"Processing {class_name}/{split}...")

            for file_name in sorted(os.listdir(split_dir)):
                if not file_name.endswith('.wav'):
                    continue
                file_path = os.path.join(split_dir, file_name)

                try:
                    signal, sr = librosa.load(file_path, sr=sr_target)
                    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
                    # Take mean over time to get single feature vector per file
                    file_vector = np.mean(mfccs, axis=1)
                except Exception as e:
                    # Best effort: one bad file must not abort the whole scan.
                    if verbose:
                        print(f"❌ Error processing {file_path}: {e}")
                else:
                    features.append(file_vector)
                    labels.append(class_name)
                    splits.append(split)

    if not features:
        # np.array([]) would have shape (0,); keep the documented 2-D shape.
        return np.empty((0, n_mfcc)), labels, splits
    return np.array(features), labels, splits

In [None]:

# Compute one time-averaged 13-coefficient MFCC vector per file of the dataset.
features, labels, splits = extract_mfcc_features(
    DATA_DIR,
    n_mfcc=13,
    sr_target=None,
)

print(f"Features shape: {features.shape}")
print(f"Labels count: {len(labels)}")

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Project the per-file MFCC vectors onto two dimensions with t-SNE
# (random_state fixed at 42).
tsne = TSNE(n_components=2, random_state=42)
features_2d = tsne.fit_transform(features)
tsne_x, tsne_y = features_2d[:, 0], features_2d[:, 1]

# Scatter plot: colour encodes the class label, marker style the split.
ax = sns.scatterplot(x=tsne_x, y=tsne_y, hue=labels, style=splits, palette='Set1')
ax.set_title('t-SNE of MFCCs by Class and Split')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the axes
plt.tight_layout()
plt.show()