In [18]:
import itertools
import math
import multiprocessing
import os
import pickle
import tempfile

import numpy as np
import parselmouth
import soundfile as sf
from IPython.display import Audio
from parselmouth.praat import call
from pymcd.mcd import Calculate_MCD
from tqdm.auto import tqdm

## Generate Synthetic Voices

### Search Space

Reference: https://voicefoundation.org/health-science/voice-disorders/anatomy-physiology-of-voice-production/understanding-voice-production/

In [56]:
# Target number of candidate voices for the grid search.
num_candidates = 1000

# Three parameters are swept, so each axis needs ceil(cbrt(num_candidates)) steps.
# The cube root is rounded first: floating-point error can push a perfect cube
# slightly above its integer root, and a plain ceil would then add a spurious step.
num_steps = math.ceil(round(num_candidates ** (1 / 3), 9))

# (low, high) bounds of each swept parameter; `high` itself is excluded from the grid.
formant_shift_ratio_bounds = (1.0, 1.3)
new_pitch_median_bounds = (180, 220)
pitch_range_factor_bounds = (1.0, 1.3)


def _parameter_grid(bounds, steps):
    """Return `steps` evenly spaced values in the half-open interval [low, high).

    `np.linspace(..., endpoint=False)` is used instead of `np.arange` with a
    fractional step, because `arange` can yield one extra element when the
    step is not exactly representable in floating point.

    Args:
        bounds: `(low, high)` pair delimiting the interval.
        steps (int): Number of grid points to generate.

    Returns:
        list[float]: The grid values as plain Python floats.
    """
    low, high = bounds
    return np.linspace(low, high, steps, endpoint=False).tolist()


# Explicit assignments replace the former eval/exec loop that rebound these
# names from tuples to lists by string lookup.
formant_shift_ratio = _parameter_grid(formant_shift_ratio_bounds, num_steps)
new_pitch_median = _parameter_grid(new_pitch_median_bounds, num_steps)
pitch_range_factor = _parameter_grid(pitch_range_factor_bounds, num_steps)

In [57]:
# Parameters held constant during the search. Each is a one-element list so it
# can be fed to itertools.product alongside the swept parameter grids.
pitch_min, pitch_max, duration_factor = [75], [600], [1]

In [58]:
# Cartesian product of every parameter axis. The tuple order matches the
# positional order of change_gender's parameters after the audio and sampling rate.
parameter_axes = (
    pitch_min,
    pitch_max,
    formant_shift_ratio,
    new_pitch_median,
    pitch_range_factor,
    duration_factor,
)
search_space = [combination for combination in itertools.product(*parameter_axes)]

In [59]:
# Number of parameter combinations that will be synthesised and scored.
len(search_space)

1000

### Running with Multiprocessing

In [60]:
def change_gender(
    sound_input: np.ndarray,
    sampling_rate: int,
    pitch_min: float,
    pitch_max: float,
    formant_shift_ratio: float,
    new_pitch_median: float,
    pitch_range_factor: float,
    duration_factor: float,
) -> tuple:
    """
    Changes the gender of the input audio using Praat's 'Change gender' algorithm.

    The audio is written to a temporary WAV file, loaded into Praat through
    parselmouth, processed, and the temporary file is deleted again before
    returning (also when processing fails).

    Args:
        sound_input (np.ndarray): The input audio data as a NumPy array.
        sampling_rate (int): The sampling rate of the input audio.
        pitch_min (float): Minimum pitch (Hz) below which pitch candidates will not be considered.
        pitch_max (float): Maximum pitch (Hz) above which pitch candidates will be ignored.
        formant_shift_ratio (float): Ratio determining the frequencies of formants in the newly created audio.
            A ratio of 1.0 indicates no frequency shift, while 1.1 approximates female formant characteristics.
            A ratio of 1/1.1 approximates male formant characteristics.
        new_pitch_median (float): Median pitch (Hz) of the new audio. The pitch values in the new audio
            are calculated by multiplying them by new_pitch_median / old_pitch_median.
            A value of 0.0 keeps the original median.
        pitch_range_factor (float): Scaling factor for the new pitch values around the new pitch median.
            A factor of 1.0 implies no additional pitch modification (except for the median adjustment).
            A factor of 0.0 monotonizes the new sound to the new pitch median.
        duration_factor (float): Factor by which the sound will be lengthened.
            Values less than 1.0 result in a shorter sound, while values larger than 3.0 are not supported.

    Returns:
        tuple: ``(tuned_audio, formant_shift_ratio, new_pitch_median, pitch_range_factor)`` where
        ``tuned_audio`` is the transposed ``values`` array of the processed Praat sound and the
        remaining items echo the corresponding arguments so results can be matched to their
        parameters after parallel processing.

    Raises:
        AssertionError: If pitch_min is greater than pitch_max or if duration_factor is larger than 3.0.
    """
    assert (
        pitch_min <= pitch_max
    ), "pitch_min should be less than or equal to pitch_max"
    assert duration_factor <= 3.0, "duration_factor cannot be larger than 3.0"

    # Reserve a unique path, then release the handle immediately: the file is
    # written and read by name, so keeping a buffered handle open would risk
    # Praat reading data that has not been flushed yet.
    tmp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_file.close()

    try:
        # Save the input audio to the temporary file
        sf.write(tmp_file.name, sound_input, sampling_rate)

        # Load the source audio
        sound = parselmouth.Sound(tmp_file.name)

        # Tune the audio
        tuned_sound = call(
            sound,
            "Change gender",
            pitch_min,
            pitch_max,
            formant_shift_ratio,
            new_pitch_median,
            pitch_range_factor,
            duration_factor,
        )
    finally:
        # delete=False means nothing removes the file automatically; without
        # this every call would leave a WAV file behind in the temp directory.
        os.unlink(tmp_file.name)

    return (
        np.array(tuned_sound.values.T),
        formant_shift_ratio,
        new_pitch_median,
        pitch_range_factor,
    )

In [61]:
def change_gender_wrapper(args):
    """Adapter that lets `Pool.imap` call `change_gender` with a single argument.

    Args:
        args: Sequence of the form ``(sound_input, sampling_frequency, *params)``;
            everything after the first two items is forwarded positionally.

    Returns:
        Whatever `change_gender` returns for the unpacked arguments.
    """
    samples, rate, *tuning_params = args
    return change_gender(samples, rate, *tuning_params)

In [62]:
# Read the male source recording, then bring it to the 16 kHz rate used for the search.
sound = parselmouth.Sound("0001_m.wav")
sound = sound.resample(new_frequency=16000)

In [63]:
# Convert the source recording once; every task tuple references the same
# array instead of holding its own freshly copied waveform.
source_samples = np.array(sound.values.T)
source_rate = int(sound.sampling_frequency)

# Prepare the arguments for the change_gender function
args = [(source_samples, source_rate, *params) for params in search_space]

results = []
# The context manager terminates the workers if anything below raises, so a
# failing task no longer leaves an orphaned pool behind.
with multiprocessing.Pool(
    processes=max(1, multiprocessing.cpu_count() - 2)
) as pool:
    # Use tqdm to track the progress of multiprocessing
    with tqdm(total=len(args), desc="Processing") as pbar:
        # imap yields results in submission order, keeping them aligned with search_space
        for result in pool.imap(change_gender_wrapper, args):
            results.append(result)
            pbar.update()

    # Normal completion: shut the workers down gracefully
    pool.close()
    pool.join()

print("Processing completed.")

Processing: 100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 45.69it/s]


Processing completed.


In [64]:
# Persist the tuned candidates so the scoring stage can be re-run without re-synthesising them.
with open("tuned_voices.pkl", "wb") as pickle_out:
    pickle.dump(results, pickle_out)

## Measure the Fitness

In [65]:
# Mel-cepstral distortion (MCD) calculator used to score each tuned candidate.
# MCD_mode accepts "plain", "dtw" or "dtw_sl"; "dtw" is used here.
mcd_toolbox = Calculate_MCD(MCD_mode="dtw")

In [66]:
# Read back the candidates written by the synthesis stage.
# Only unpickle files produced by this notebook: unpickling untrusted data can execute code.
with open("tuned_voices.pkl", "rb") as pickle_in:
    tuned_voices = pickle.load(pickle_in)

In [67]:
# Load the female voice recording and store a 16 kHz copy to serve as the MCD reference.
# A dedicated name is used so the global `sound` (the male source recording) is
# not overwritten; otherwise re-running the tuning cell after this one would
# silently tune the female voice instead.
reference_sound = parselmouth.Sound("0001_fm.wav").resample(new_frequency=16000)
reference_sound.save("0001_fm_16khz.wav", "WAV")

In [68]:
def calculate_mcd_distance(args):
    """Score one candidate against the reference using the shared `mcd_toolbox`.

    Args:
        args: Triple ``(ref_file, target_file, params)``; the first two items are
            passed to `mcd_toolbox.calculate_mcd`, and `params` is carried
            through untouched.

    Returns:
        tuple: ``(mcd, params)`` — the value returned by `calculate_mcd`, paired
        with the parameters that were passed in.
    """
    reference_path, candidate_path, candidate_params = args
    return mcd_toolbox.calculate_mcd(reference_path, candidate_path), candidate_params

In [69]:
# Define the reference file
ref_file = "0001_fm_16khz.wav"

results = []
# All candidate WAV files live in one temporary directory that is removed when
# the block exits, instead of leaving one delete=False temp file per candidate
# behind on disk.
with tempfile.TemporaryDirectory() as candidates_dir:
    # Prepare the arguments for MCD calculation
    args = []
    for index, (audio, *params) in enumerate(tuned_voices):
        target_file = os.path.join(candidates_dir, f"candidate_{index:04d}.wav")
        # 16000 Hz matches the rate the candidates and the reference were resampled to
        sf.write(target_file, audio, 16000)
        args.append((ref_file, target_file, params))

    # The context manager terminates the workers if scoring raises
    with multiprocessing.Pool(
        processes=max(1, multiprocessing.cpu_count() - 2)
    ) as pool:
        # Use tqdm to track the progress of multiprocessing
        with tqdm(total=len(args), desc="Processing") as pbar:
            # imap yields results in submission order
            for result in pool.imap(calculate_mcd_distance, args):
                results.append(result)
                pbar.update()

        # Normal completion: shut the workers down gracefully
        pool.close()
        pool.join()

# NOTE: the paths stored in `args` no longer exist once the directory is removed.

Processing: 100%|███████████████████████████████████████████████████████████████████████████| 1000/1000 [07:26<00:00,  2.24it/s]


In [70]:
# Linear scan for the lowest MCD; the first occurrence wins on ties.
min_mcd, best_params = float("inf"), None
for candidate_mcd, candidate_params in results:
    if not candidate_mcd < min_mcd:
        continue
    min_mcd, best_params = candidate_mcd, candidate_params

# Report the winning score and the parameters that produced it
print("Minimum MCD: {}".format(min_mcd))
print("Best parameters: {}".format(best_params))

Minimum MCD: 12.668206483284255
Best parameters: [1.1800000000000002, 192.0, 1.06]


In [71]:
# Reload the male source recording at the output rate.
# NOTE(review): 41000 Hz is a non-standard rate; 44100 Hz was presumably intended — confirm.
# ("male_voice.mp3" was used as an alternative input during development.)
sound = parselmouth.Sound("0001_m.wav")
sound = sound.resample(new_frequency=41000)

# Re-synthesise with the best-scoring parameters; 75 / 600 / 1 are the same
# pitch_min / pitch_max / duration_factor values used during the search.
tuned_audio, *params = change_gender(
    np.array(sound.values.T),
    41000,
    75,
    600,
    *best_params,
    1,
)

In [72]:
# Save the result, then read it back from disk so the player renders exactly what was written.
sf.write("tuned_voice.wav", tuned_audio, 41000)
tuned_voice = parselmouth.Sound("tuned_voice.wav")
Audio(tuned_voice.values, rate=tuned_voice.sampling_frequency)