In [1]:
import itertools
import multiprocessing
import os
import tempfile
from typing import Tuple

import numpy as np
import parselmouth
import soundfile as sf
from parselmouth.praat import call
from pymcd.mcd import Calculate_MCD

In [2]:
# Instance of the MCD class.
# Calculate_MCD offers three modes, "plain", "dtw" and "dtw_sl", one per MCD metric
# variant; "dtw" is the mode selected for this notebook.
mcd_toolbox = Calculate_MCD(MCD_mode="dtw")

## Generate Synthetic Voices

### Search Space

Reference: https://voicefoundation.org/health-science/voice-disorders/anatomy-physiology-of-voice-production/understanding-voice-production/

In [3]:
# Candidate values for every argument of change_gender. Each name holds a list so
# that the lists can be combined into a Cartesian product.
#
# The fractional grids are derived from integer ranges rather than
# np.arange(start, stop, 0.01): a float step accumulates rounding error, whereas
# k / 100 yields exactly the float closest to each two-decimal value. The entries
# are plain Python floats, so they print and pickle without NumPy scalar wrappers.
pitch_min = [75]  # Hz, lower bound of the pitch search
pitch_max = [600]  # Hz, upper bound of the pitch search
formant_shift_ratio = [hundredths / 100 for hundredths in range(100, 150)]  # 1.00 .. 1.49
new_pitch_median = [float(hz) for hz in range(180, 220)]  # 180.0 .. 219.0 Hz
pitch_range_factor = [hundredths / 100 for hundredths in range(100, 150)]  # 1.00 .. 1.49
duration_factor = [1]  # keep the original duration

In [4]:
# Every combination of the parameter grids. Each tuple is ordered as
# (pitch_min, pitch_max, formant_shift_ratio, new_pitch_median,
#  pitch_range_factor, duration_factor).
search_space = [
    *itertools.product(
        pitch_min,
        pitch_max,
        formant_shift_ratio,
        new_pitch_median,
        pitch_range_factor,
        duration_factor,
    )
]

### Running with Multiprocessing

In [5]:
def change_gender(
    sound_input: np.ndarray,
    sampling_rate: int,
    pitch_min: float,
    pitch_max: float,
    formant_shift_ratio: float,
    new_pitch_median: float,
    pitch_range_factor: float,
    duration_factor: float,
) -> Tuple[np.ndarray, float, float, float]:
    """
    Changes the gender of the input audio using Praat's 'Change gender' algorithm.

    The audio is written to a temporary WAV file, loaded back as a
    ``parselmouth.Sound`` and passed to Praat. The temporary file lives in a
    temporary directory that is removed before the function returns.

    All parameters are required; this function defines no Python-level defaults.

    Args:
        sound_input (np.ndarray): The input audio data as a NumPy array.
        sampling_rate (int): The sampling rate of the input audio.
        pitch_min (float): Minimum pitch (Hz) below which pitch candidates will not be considered.
        pitch_max (float): Maximum pitch (Hz) above which pitch candidates will be ignored.
        formant_shift_ratio (float): Ratio determining the frequencies of formants in the newly created audio.
            A ratio of 1.0 indicates no frequency shift, while 1.1 approximates female formant characteristics.
            A ratio of 1/1.1 approximates male formant characteristics.
        new_pitch_median (float): Median pitch (Hz) of the new audio. The pitch values in the new audio
            are calculated by multiplying them by new_pitch_median / old_pitch_median.
            A value of 0.0 keeps the original median.
        pitch_range_factor (float): Scaling factor for the new pitch values around the new pitch median.
            A factor of 1.0 implies no additional pitch modification (except for the median adjustment).
            A factor of 0.0 monotonizes the new sound to the new pitch median.
        duration_factor (float): Factor by which the sound will be lengthened.
            Values less than 1.0 result in a shorter sound, while values larger than 3.0 are not supported.

    Returns:
        Tuple[np.ndarray, float, float, float]: A 4-tuple of
            ``(tuned_audio, formant_shift_ratio, new_pitch_median, pitch_range_factor)``.
            ``tuned_audio`` is the transposed ``values`` array of the processed sound; the
            three parameters are echoed back unchanged so that each result can be matched
            to the settings that produced it.

    Raises:
        AssertionError: If pitch_min is greater than pitch_max or if duration_factor is larger than 3.0.
    """
    assert (
        pitch_min <= pitch_max
    ), "pitch_min should be less than or equal to pitch_max"
    assert duration_factor <= 3.0, "duration_factor cannot be larger than 3.0"

    # The temporary directory (and the WAV file inside it) is deleted when the
    # `with` block exits, even if writing, loading or Praat raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, "input.wav")

        # Writing by path lets soundfile open and close the file itself, so the
        # data is fully on disk before it is read back.
        sf.write(wav_path, sound_input, sampling_rate)

        # Load the source audio
        sound = parselmouth.Sound(wav_path)

        # Tune the audio
        tuned_sound = call(
            sound,
            "Change gender",
            pitch_min,
            pitch_max,
            formant_shift_ratio,
            new_pitch_median,
            pitch_range_factor,
            duration_factor,
        )

    return (
        np.array(tuned_sound.values.T),
        formant_shift_ratio,
        new_pitch_median,
        pitch_range_factor,
    )

In [6]:
def change_gender_wrapper(args):
    """
    Single-argument adapter around change_gender for use with pool.map.

    Args:
        args: An iterable whose first two items are the audio array and its
            sampling frequency; every remaining item is forwarded, in order,
            as a further positional argument of change_gender.

    Returns:
        Whatever change_gender returns for those arguments.
    """
    audio, rate, *tuning = args
    return change_gender(audio, rate, *tuning)

In [7]:
# Load the male voice recording
# Load the male voice recording and resample it to 16 kHz
sound = parselmouth.Sound("0001_m.wav").resample(new_frequency=16000)

In [8]:
# Create a pool of worker processes
# Convert the recording once; every task receives the same samples and rate.
sound_samples = np.array(sound.values.T)
sound_sampling_rate = int(sound.sampling_frequency)

# Prepare the arguments for the change_gender function
# (only the first 40 combinations of the search space are processed here)
args = [
    (sound_samples, sound_sampling_rate, *params) for params in search_space[:40]
]

# Leave two cores free for the rest of the system, but always use at least one worker.
n_workers = max(1, multiprocessing.cpu_count() - 2)

# NOTE(review): with the "spawn" start method, workers may be unable to import
# functions defined in notebook cells; confirm on the target platform.
#
# The context manager shuts the pool down even if a task raises. pool.map
# blocks until every result is available, so no work is pending on exit.
with multiprocessing.Pool(processes=n_workers) as pool:
    # Use multiprocessing to apply the change_gender function to each combination
    results = pool.map(change_gender_wrapper, args)

In [9]:
# First result: (tuned audio array, formant_shift_ratio, new_pitch_median, pitch_range_factor)
results[0]

(array([[-1.74877368e-08],
        [-1.82540954e-07],
        [-4.36872653e-07],
        ...,
        [ 0.00000000e+00],
        [ 0.00000000e+00],
        [ 0.00000000e+00]]),
 1.0,
 180.0,
 1.0)