# Optional enhancements

In [18]:
from pydub import AudioSegment
import os
import pandas as pd

# Pitch shift

In [19]:
def apply_pitch_shift_file(input_path: str, output_path: str, semitones: float):
    """
    Write a pitch-shifted, mono WAV copy of an audio file.

    The shift is produced by resampling: the raw samples are re-labelled with
    a frame rate scaled by 2 ** (semitones / 12), then converted back to the
    source frame rate. No separate time-stretch step is applied, so the clip's
    duration is expected to change together with its pitch.

    Args:
        input_path (str): Path to the input WAV/MP3 file.
        output_path (str): Destination path; the result is always exported as WAV.
        semitones (float): Shift in semitones (positive = higher, negative = lower).
    """
    mono = AudioSegment.from_file(input_path).set_channels(1)
    original_rate = mono.frame_rate

    # Equal-tempered ratio: +12 semitones doubles the declared frame rate.
    rate_ratio = 2.0 ** (semitones / 12.0)
    relabelled_rate = int(original_rate * rate_ratio)

    # Same bytes, different declared rate; resampling back restores the
    # original frame rate for the exported file.
    relabelled = mono._spawn(mono.raw_data, overrides={'frame_rate': relabelled_rate})
    resampled = relabelled.set_frame_rate(original_rate)
    resampled.export(output_path, format="wav")


In [20]:
# Demo: render the sample clip two semitones up and two semitones down.
input_file = "processed_audio/en/England_English/common_voice_en_42699890.wav"
output_pitch_up = input_file.replace(".wav", "_pitchup.wav")
output_pitch_down = input_file.replace(".wav", "_pitchdown.wav")

for shifted_path, shift in ((output_pitch_up, 2), (output_pitch_down, -2)):
    apply_pitch_shift_file(input_file, shifted_path, semitones=shift)

# Speed

In [21]:
def change_audio_speed(input_path, output_path, speed=1.1):
    """
    Write a sped-up or slowed-down WAV copy of an audio file using pydub.

    The raw samples are re-labelled with a frame rate of
    ``int(frame_rate * speed)`` and then converted back to the source frame
    rate. Because the samples are simply replayed at a different rate, the
    pitch is expected to move together with the tempo.

    Any exception is caught and reported with ``print`` rather than raised,
    so a failing file does not stop the caller.

    Parameters:
        input_path (str): Path to the input WAV file.
        output_path (str): Path to save the speed-modified WAV.
        speed (float): Speed multiplier (e.g., 1.1 = 10% faster, 0.9 = 10% slower)
    """
    try:
        clip = AudioSegment.from_file(input_path)
        native_rate = clip.frame_rate
        scaled_rate = int(native_rate * speed)

        # Re-label the same bytes with the scaled rate, then resample back.
        retimed = clip._spawn(
            clip.raw_data,
            overrides={"frame_rate": scaled_rate},
        ).set_frame_rate(native_rate)

        retimed.export(output_path, format="wav")
        print(f"Saved: {output_path}")
    except Exception as e:
        # Best-effort: report the failure and carry on.
        print(f"Failed to change speed for {input_path}: {e}")


In [22]:
# Demo: write one faster and one slower copy of the sample clip.
input_file = "processed_audio/en/England_English/common_voice_en_42699890.wav"
output_file_fast = input_file.replace(".wav", "_speedup.wav")
output_file_slow = input_file.replace(".wav", "_slowdown.wav")

# The speed argument is a multiplier of the original playback rate.
change_audio_speed(input_file, output_file_fast, speed=1.5)   # 50% faster
change_audio_speed(input_file, output_file_slow, speed=0.7)   # 30% slower


Saved: processed_audio/en/England_English/common_voice_en_42699890_speedup.wav
Saved: processed_audio/en/England_English/common_voice_en_42699890_slowdown.wav


# Background noise

In [23]:
def add_background_noise_file(input_path: str, output_path: str, noise_path: str, snr_db: float = 10.0):
    """
    Overlay background noise on an audio file and save the mix as WAV.

    Both clips are converted to mono and the noise is resampled to the
    signal's frame rate. The noise is looped or trimmed to the signal's
    length, then its level is shifted so that its dBFS sits ``snr_db`` below
    the signal's dBFS before the two are overlaid.

    Args:
        input_path (str): Original clean audio.
        output_path (str): Destination for noisy version.
        noise_path (str): Path to a noise clip (e.g., cafe, traffic).
        snr_db (float): Desired signal-to-noise ratio in dB.

    Raises:
        ValueError: If the noise clip has zero length, so it cannot be looped
            to cover the signal.
    """
    silent_dbfs = float("-inf")  # pydub reports -inf dBFS for an all-zero segment

    audio = AudioSegment.from_file(input_path).set_channels(1)
    noise = AudioSegment.from_file(noise_path).set_channels(1).set_frame_rate(audio.frame_rate)

    # Segment length is in milliseconds; a zero-length noise clip would
    # otherwise fail below with an unhelpful ZeroDivisionError.
    if len(noise) == 0:
        raise ValueError(f"Noise clip is empty and cannot be looped: {noise_path}")

    # Loop noise if needed, then cut it to exactly the signal's length.
    if len(noise) < len(audio):
        repeats = len(audio) // len(noise) + 1
        noise = noise * repeats
    noise = noise[:len(audio)]

    # Adjust volume of noise to the desired SNR. A silent noise segment has
    # no level to scale (the required gain would be infinite), so it is
    # overlaid unchanged.
    if noise.dBFS != silent_dbfs:
        signal_power = audio.dBFS
        noise_power = signal_power - snr_db
        noise = noise - (noise.dBFS - noise_power)

    mixed = audio.overlay(noise)
    mixed.export(output_path, format="wav")


In [24]:
# Demo: mix the noise clip into the sample clip at a 10 dB signal-to-noise ratio.
input_file = "processed_audio/en/England_English/common_voice_en_42699890.wav"
noise_file = "data/mixkit-classic-alarm-995.wav"
output_noisy = input_file.replace(".wav", "_noisy.wav")

add_background_noise_file(
    input_path=input_file,
    output_path=output_noisy,
    noise_path=noise_file,
    snr_db=10,
)

In [31]:
from mutagen.wave import WAVE

# Print mutagen's one-line summary of the source clip (channels, bitrate,
# sample rate, duration).
audio = WAVE(input_file)
wav_summary = audio.pprint()
print(wav_summary)


1 channel RIFF @ 512000 bps, 16000 Hz, 4.79 seconds (audio/wav)


In [25]:
# Count how many clips the manifest holds for each accent label.
df_all = pd.read_csv("final_manifest.csv")
dialect_counts = (
    df_all["accents"]
    .value_counts()
    .rename_axis("dialect")
    .reset_index(name="sample_count")
)
print(dialect_counts)


                                             dialect  sample_count
0                                Deutschland Deutsch            94
1                                 Français de France            77
2                           Österreichisches Deutsch            57
3                                    England English            36
4            Français de France,Français de Belgique            24
5  Français de France,Accent du Sud Ouest, accent...            20
6                                   Schweizerdeutsch            14
7  Southern African (South Africa, Zimbabwe, Nami...             9
8                                 Australian English             9


In [17]:
# Per-accent clip count plus total and mean duration, with second-based
# columns rounded to one decimal, ordered from least to most total audio.
duration_summary = (
    df_all.groupby("accents")["duration_ms"]
    .agg(clips="count", total_ms="sum", avg_ms="mean")
    .reset_index()
    .assign(
        total_sec=lambda frame: (frame["total_ms"] / 1000).round(1),
        avg_sec=lambda frame: (frame["avg_ms"] / 1000).round(1),
    )
    .sort_values("total_sec")
)
duration_summary


Unnamed: 0,accents,clips,total_ms,avg_ms,total_sec,avg_sec
7,"Southern African (South Africa, Zimbabwe, Nami...",9,44388,4932.0,44.4,4.9
0,Australian English,9,51876,5764.0,51.9,5.8
6,Schweizerdeutsch,14,82656,5904.0,82.7,5.9
4,"Français de France,Accent du Sud Ouest, accent...",20,92196,4609.8,92.2,4.6
5,"Français de France,Français de Belgique",24,124236,5176.5,124.2,5.2
2,England English,36,222408,6178.0,222.4,6.2
3,Français de France,77,295632,3839.376623,295.6,3.8
8,Österreichisches Deutsch,57,296424,5200.421053,296.4,5.2
1,Deutschland Deutsch,94,297900,3169.148936,297.9,3.2


In [None]:
# We won't balance the classes: the number of samples is too low, so balancing would require heavy undersampling or oversampling.