In [1]:
import os
import io
import pandas as pd
import numpy as np
import soundfile as sf
import torchaudio
import torchaudio.functional as taF
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

CONFIG = {
    'min_duration': 0.5,
    'max_duration': 15.0,
    'target_sr': 16000,
    'random_seed': 42,
}

EMOTION_CLASSES = ['neutral', 'happy', 'sad', 'angry']

# Dataset paths
D1_VISC_PATH = "ViSEC"  # sau khi download từ HuggingFace
D2_PATH = "d2"
D3_PATH = "d3"

# Output paths
OUTPUT_DIR = "data"
TRAIN_PHASE1 = f"{OUTPUT_DIR}/train/phase1"
TRAIN_PHASE2 = f"{OUTPUT_DIR}/train/phase2"
VAL_DIR = f"{OUTPUT_DIR}/val"
TEST_DIR = f"{OUTPUT_DIR}/test"


In [2]:
print("="*60)
print("STEP 1: LOAD D1 (ViSEC Dataset)")
print("="*60)

from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="hustep-lab/ViSEC",
    repo_type="dataset",
    filename="data/train-00000-of-00001.parquet",
    local_dir="ViSEC"
)

df_d1 = pd.read_parquet(local_path)
print(f"Loaded D1 shape: {df_d1.shape}")
print(f"Columns: {df_d1.columns.tolist()}")
print(df_d1.head())


STEP 1: LOAD D1 (ViSEC Dataset)
Loaded D1 shape: (5280, 7)
Columns: ['speaker_id', 'path', 'duration', 'accent', 'emotion', 'emotion_id', 'gender']
   speaker_id                                               path  duration  \
0           0  {'bytes': b'RIFF\xa69\x01\x00WAVEfmt \x10\x00\...  2.508062   
1           1  {'bytes': b'RIFFF\xc8\x00\x00WAVEfmt \x10\x00\...  1.600000   
2           2  {'bytes': b'RIFFF\xb8\x00\x00WAVEfmt \x10\x00\...  1.472000   
3           3  {'bytes': b'RIFFF\xc8\x01\x00WAVEfmt \x10\x00\...  3.648000   
4           4  {'bytes': b'RIFFF\xc0\x01\x00WAVEfmt \x10\x00\...  3.584000   

  accent  emotion  emotion_id  gender  
0  south    happy           0  female  
1  south  neutral           1    male  
2  south    angry           3  female  
3  north    happy           0  female  
4  south  neutral           1    male  


In [4]:
print("\n" + "="*60)
print("STEP 2: PROCESS D1 - Decode & Validate Audio")
print("="*60)

# Create temp directory for D1 audio
d1_audio_dir = "temp_d1_audio"
os.makedirs(d1_audio_dir, exist_ok=True)

def decode_d1_audio(row, idx):
    """Decode D1 audio from bytes"""
    try:
        raw = row["path"]["bytes"]
        audio, sr = sf.read(io.BytesIO(raw))
        audio = audio.astype(np.float32)
        
        # Resample if needed
        if sr != 16000:
            audio_tensor = torchaudio.transforms.Resample(sr, 16000)(
                torch.from_numpy(audio).unsqueeze(0)
            )
            audio = audio_tensor.squeeze(0).numpy()
            sr = 16000
        
        # Calculate duration
        duration = len(audio) / sr
        
        # Save WAV
        filepath = os.path.join(d1_audio_dir, f"d1_{idx}.wav")
        sf.write(filepath, audio, sr)
        
        return filepath, duration, True
    except Exception as e:
        print(f"Error processing D1 sample {idx}: {e}")
        return None, None, False

print("Decoding D1 audio files...")
results = []
for idx in tqdm(range(len(df_d1))):
    filepath, duration, success = decode_d1_audio(df_d1.iloc[idx], idx)
    results.append({
        'audio_path': filepath,
        'duration': duration,
        'success': success
    })

df_d1['audio_path'] = [r['audio_path'] for r in results]
df_d1['duration'] = [r['duration'] for r in results]
df_d1['success'] = [r['success'] for r in results]

df_d1 = df_d1[df_d1['success'] == True].reset_index(drop=True)

print(f"D1 after decode: {len(df_d1)} files")

# Map emotion_id to emotion_name
emotion_map = {0: 'neutral', 1: 'happy', 2: 'sad', 3: 'angry'}
df_d1['emotion'] = df_d1['emotion_id'].map(emotion_map)



STEP 2: PROCESS D1 - Decode & Validate Audio
Decoding D1 audio files...


100%|██████████| 5280/5280 [00:11<00:00, 470.09it/s]

D1 after decode: 5280 files





In [5]:
print("\n" + "="*60)
print("STEP 3: FILTER VALID AUDIO (0.5–15s)")
print("="*60)

MIN_DUR = CONFIG['min_duration']
MAX_DUR = CONFIG['max_duration']

df_d1_valid = df_d1[
    (df_d1['duration'] >= MIN_DUR) & 
    (df_d1['duration'] <= MAX_DUR)
].reset_index(drop=True)

print(f"D1 -> Remaining {len(df_d1_valid)} files (from {len(df_d1)})")
print(f"Duration range: {df_d1_valid['duration'].min():.2f}s - {df_d1_valid['duration'].max():.2f}s")
print(f"Emotion distribution:\n{df_d1_valid['emotion'].value_counts()}")


STEP 3: FILTER VALID AUDIO (0.5–15s)
D1 -> Remaining 5277 files (from 5280)
Duration range: 1.00s - 13.50s
Emotion distribution:
emotion
happy      1506
angry      1466
neutral    1226
sad        1079
Name: count, dtype: int64


In [10]:
import torch

In [11]:
def load_folder_dataset(dataset_path, dataset_name, out_dir):
    """
    Load dataset from folder/emotion/*.wav
    -> decode, resample, save WAV chuẩn 16kHz
    """
    data = []
    os.makedirs(out_dir, exist_ok=True)

    for emotion in EMOTION_CLASSES:
        emotion_dir = os.path.join(dataset_path, emotion)
        if not os.path.isdir(emotion_dir):
            continue

        emotion_out_dir = os.path.join(out_dir, emotion)
        os.makedirs(emotion_out_dir, exist_ok=True)

        for filename in os.listdir(emotion_dir):
            if not filename.lower().endswith(".wav"):
                continue

            in_path = os.path.join(emotion_dir, filename)

            try:
                # 1) Load bằng soundfile (KHÔNG torchcodec)
                audio, sr = sf.read(in_path, dtype="float32")

                # 2) Mono
                if audio.ndim > 1:
                    audio = audio.mean(axis=1)

                # 3) Resample
                if sr != 16000:
                    audio = taF.resample(
                        torch.from_numpy(audio), sr, 16000
                    ).numpy()
                    sr = 16000

                # 4) Duration
                duration = len(audio) / sr

                # 5) Save WAV chuẩn
                out_path = os.path.join(
                    emotion_out_dir,
                    f"{dataset_name}_{filename}"
                )
                sf.write(out_path, audio, sr)

                data.append({
                    "audio_path": out_path,
                    "emotion": emotion,
                    "duration": duration,
                    "dataset": dataset_name,
                    "success": True
                })

            except Exception as e:
                print(f"Error loading {in_path}: {e}")

    return pd.DataFrame(data)
print("\nLoading D2...")
df_d2 = load_folder_dataset(D2_PATH, "D2", "temp_d2_audio")
print(f"D2 loaded: {len(df_d2)}")

print("\nLoading D3...")
df_d3 = load_folder_dataset(D3_PATH, "D3", "temp_d3_audio")
print(f"D3 loaded: {len(df_d3)}")


Loading D2...
D2 loaded: 56

Loading D3...
D3 loaded: 177


In [12]:
print("\n" + "="*60)
print("STEP 5: FILTER D2 & D3 BY DURATION")
print("="*60)

df_d2_valid = df_d2[
    (df_d2['duration'] >= MIN_DUR) & 
    (df_d2['duration'] <= MAX_DUR)
].reset_index(drop=True)

df_d3_valid = df_d3[
    (df_d3['duration'] >= MIN_DUR) & 
    (df_d3['duration'] <= MAX_DUR)
].reset_index(drop=True)

print(f"D2 -> Remaining {len(df_d2_valid)} files (from {len(df_d2)})")
print(f"D3 -> Remaining {len(df_d3_valid)} files (from {len(df_d3)})")



STEP 5: FILTER D2 & D3 BY DURATION
D2 -> Remaining 56 files (from 56)
D3 -> Remaining 175 files (from 177)


In [18]:
print("\n" + "="*60)
print("STEP 6: CREATE DATA SPLITS")
print("="*60)

# D1: Split into val (15%), temp (85%)
df_d1_train, df_d1_val = train_test_split(
    df_d1_valid,
    test_size=0.15,
    random_state=CONFIG['random_seed'],
    stratify=df_d1_valid['emotion']
)

# Combine D1_train + D2 + D3 for further split
df_combined_rest = pd.concat([df_d1_train, df_d2_valid, df_d3_valid], 
                              ignore_index=True)

# From combined, extract 20% for test (stratified)
df_combined_train, df_test = train_test_split(
    df_combined_rest,
    test_size=0.20,
    random_state=CONFIG['random_seed'],
    stratify=df_combined_rest['emotion']
)

# D1 remaining goes to phase1, rest (D2+D3) goes to phase2
df_phase1 = df_d1_train[~df_d1_train.index.isin(df_test.index)].copy()
df_phase2 = df_combined_train.copy()

print(f"\nSPLIT SUMMARY:")
print(f"Phase1 (D1 only):     {len(df_phase1)} files")
print(f"Phase2 (D1+D2+D3):    {len(df_phase2)} files")
print(f"Val (D1 only):        {len(df_d1_val)} files")
print(f"Test (D1+D2+D3):      {len(df_test)} files")
print(f"Total:                {len(df_phase1) + len(df_phase2) + len(df_d1_val) + len(df_test)} files")

print(f"\nTest distribution:\n{df_test['emotion'].value_counts()}")

# ENSURE AUDIO FILES ARE VALID 16kHz
print("\n" + "="*60)
print("STEP 7: RESAMPLE & VALIDATE ALL AUDIO TO 16kHz")
print("="*60)


def ensure_16k_wav(src_path, dst_path):
    """
    Read audio, convert to mono 16kHz wav, save to dst_path
    NO torchaudio.load -> NO torchcodec
    """
    try:
        # 1) Read audio
        audio, sr = sf.read(src_path, dtype="float32")

        # 2) Mono
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        # 3) Resample if needed
        if sr != 16000:
            audio = taF.resample(
                torch.from_numpy(audio), sr, 16000
            ).numpy()
            sr = 16000

        # 4) Save wav
        sf.write(dst_path, audio, sr)
        return True

    except Exception as e:
        print(f"Error resampling {src_path}: {e}")
        return False


STEP 6: CREATE DATA SPLITS

SPLIT SUMMARY:
Phase1 (D1 only):     3694 files
Phase2 (D1+D2+D3):    3772 files
Val (D1 only):        792 files
Test (D1+D2+D3):      944 files
Total:                9202 files

Test distribution:
emotion
happy      266
angry      262
neutral    220
sad        196
Name: count, dtype: int64

STEP 7: RESAMPLE & VALIDATE ALL AUDIO TO 16kHz


In [20]:
print("\n" + "="*60)
print("STEP 8: COPY FILES TO OUTPUT DIRECTORY")
print("="*60)

def copy_dataset_split(df, output_base, split_name):
    """Copy files from dataframe to organized folder structure"""
    print(f"\nCopying {split_name}...")

    # Tạo folder emotion
    for emotion in EMOTION_CLASSES:
        os.makedirs(os.path.join(output_base, emotion), exist_ok=True)

    # Counter riêng cho từng emotion
    emotion_counters = {emotion: 0 for emotion in EMOTION_CLASSES}

    success_count = 0
    fail_count = 0

    for _, row in tqdm(df.iterrows(), total=len(df)):
        src = row["audio_path"]
        emotion = row["emotion"]

        emotion_counters[emotion] += 1
        file_num = emotion_counters[emotion]

        # Tên file: 1.wav, 2.wav, ...
        filename = f"{file_num}.wav"
        dst = os.path.join(output_base, emotion, filename)

        if ensure_16k_wav(src, dst):
            success_count += 1
        else:
            fail_count += 1

    print(f"{split_name}: {success_count} success, {fail_count} failed")
    return success_count


# Create output structure
for phase_dir in [TRAIN_PHASE1, TRAIN_PHASE2, VAL_DIR, TEST_DIR]:
    os.makedirs(phase_dir, exist_ok=True)

# Copy each split
copy_dataset_split(df_phase1, TRAIN_PHASE1, "Phase1 (train)")
copy_dataset_split(df_phase2, TRAIN_PHASE2, "Phase2 (train)")
copy_dataset_split(df_d1_val, VAL_DIR, "Validation")
copy_dataset_split(df_test, TEST_DIR, "Test")


STEP 8: COPY FILES TO OUTPUT DIRECTORY

Copying Phase1 (train)...


100%|██████████| 3694/3694 [00:28<00:00, 131.14it/s]


Phase1 (train): 3694 success, 0 failed

Copying Phase2 (train)...


100%|██████████| 3772/3772 [00:16<00:00, 227.28it/s]


Phase2 (train): 3772 success, 0 failed

Copying Validation...


100%|██████████| 792/792 [00:09<00:00, 87.15it/s]


Validation: 792 success, 0 failed

Copying Test...


100%|██████████| 944/944 [00:04<00:00, 204.85it/s]

Test: 944 success, 0 failed





944

In [23]:


# ============================================================
# 9) VERIFY OUTPUT STRUCTURE
# ============================================================
print("\n" + "="*60)
print("STEP 9: VERIFY OUTPUT STRUCTURE")
print("="*60)

def verify_split(split_path, split_name):
    """Count files in each emotion folder"""
    print(f"\n{split_name}:")
    total = 0
    for emotion in EMOTION_CLASSES:
        emotion_dir = f"{split_path}/{emotion}"
        if os.path.exists(emotion_dir):
            count = len([f for f in os.listdir(emotion_dir) if f.endswith('.wav')])
            total += count
            print(f"  {emotion}: {count} files")
    print(f"  TOTAL: {total} files")
    return total

verify_split(TRAIN_PHASE1, "Phase1")
verify_split(TRAIN_PHASE2, "Phase2")
verify_split(VAL_DIR, "Validation")
verify_split(TEST_DIR, "Test")

# ============================================================
# 10) SUMMARY STATISTICS
# ============================================================
print("\n" + "="*60)
print("STEP 10: SUMMARY STATISTICS")
print("="*60)

summary_data = {
    'Phase1': (df_phase1, TRAIN_PHASE1),
    'Phase2': (df_phase2, TRAIN_PHASE2),
    'Val': (df_d1_val, VAL_DIR),
    'Test': (df_test, TEST_DIR)
}

for split_name, (df_split, path) in summary_data.items():
    print(f"\n{split_name}:")
    print(f"  Total files: {len(df_split)}")
    print(f"  Emotion distribution:")
    for emotion in EMOTION_CLASSES:
        count = len(df_split[df_split['emotion'] == emotion])
        pct = 100 * count / len(df_split) if len(df_split) > 0 else 0
        print(f"    {emotion}: {count} ({pct:.1f}%)")
    print(f"  Duration stats:")
    print(f"    Min: {df_split['duration'].min():.2f}s")
    print(f"    Max: {df_split['duration'].max():.2f}s")
    print(f"    Mean: {df_split['duration'].mean():.2f}s")

print("\n" + "="*60)
print("✓ DATA MERGING COMPLETE")
print("="*60)
print(f"\nOutput directory structure created at: {OUTPUT_DIR}/")
print("Ready for training!")

# Optional: Save metadata
metadata = {
    'config': CONFIG,
    'split_stats': {
        'phase1': len(df_phase1),
        'phase2': len(df_phase2),
        'val': len(df_d1_val),
        'test': len(df_test),
    },
    'emotion_classes': EMOTION_CLASSES
}

import json
with open(f"{OUTPUT_DIR}/metadata.json", 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"\nMetadata saved to {OUTPUT_DIR}/metadata.json")



STEP 9: VERIFY OUTPUT STRUCTURE

Phase1:
  neutral: 855 files
  happy: 1060 files
  sad: 749 files
  angry: 1030 files
  TOTAL: 3694 files

Phase2:
  neutral: 877 files
  happy: 1063 files
  sad: 783 files
  angry: 1049 files
  TOTAL: 3772 files

Validation:
  neutral: 184 files
  happy: 226 files
  sad: 162 files
  angry: 220 files
  TOTAL: 792 files

Test:
  neutral: 220 files
  happy: 266 files
  sad: 196 files
  angry: 262 files
  TOTAL: 944 files

STEP 10: SUMMARY STATISTICS

Phase1:
  Total files: 3694
  Emotion distribution:
    neutral: 855 (23.1%)
    happy: 1060 (28.7%)
    sad: 749 (20.3%)
    angry: 1030 (27.9%)
  Duration stats:
    Min: 1.00s
    Max: 12.86s
    Mean: 2.16s

Phase2:
  Total files: 3772
  Emotion distribution:
    neutral: 877 (23.3%)
    happy: 1063 (28.2%)
    sad: 783 (20.8%)
    angry: 1049 (27.8%)
  Duration stats:
    Min: 0.58s
    Max: 15.00s
    Mean: 2.47s

Val:
  Total files: 792
  Emotion distribution:
    neutral: 184 (23.2%)
    happy: 226 (