In [1]:
import parselmouth
from parselmouth.praat import call
from IPython.display import Audio
from scipy.signal import savgol_filter
import librosa
import numpy as np
import soundfile as sf

In [5]:
# Load the source recording as a parselmouth Sound object.
# NOTE(review): the path begins with "~"; confirm that parselmouth/Praat expands it
# to the home directory on this platform, or expand it explicitly before loading.
sound = parselmouth.Sound("~/data/dnd/Session_1/c/c-clean-30.wav")

In [6]:
# Play the unmodified recording inline, as a reference for the edits below.
Audio(data=sound.values, rate=sound.sampling_frequency)

In [4]:
# Build a Praat Manipulation object from the sound.
# Per Praat's "Sound: To Manipulation..." form the three numbers are the time step
# (seconds), the minimum pitch (Hz) and the maximum pitch (Hz).
manipulation = call(sound, "To Manipulation", 0.01, 75, 600)
# Show the Python type that parselmouth hands back for this Praat object.
type(manipulation)

parselmouth.Data

In [5]:
# Pull the pitch tier out of the manipulation so its points can be edited.
pitch_tier = call(manipulation, "Extract pitch tier")

In [6]:
# Show the Python type of the extracted tier.
type(pitch_tier)

parselmouth.Data

In [8]:
# Multiply every pitch point between sound.xmin and sound.xmax by 2 (one octave up),
# then write the edited tier back into the manipulation.
# NOTE(review): the result of "Multiply frequencies" is not assigned, so the command
# presumably edits pitch_tier in place; re-running this cell would double it again.
call(pitch_tier, "Multiply frequencies", sound.xmin, sound.xmax, 2)
call([pitch_tier, manipulation], "Replace pitch tier")

In [9]:
# Resynthesise audio from the manipulation with Praat's overlap-add method.
sound_octave_up = call(manipulation, "Get resynthesis (overlap-add)")

In [10]:
# Play the resynthesised (octave-up) sound inline.
Audio(data=sound_octave_up.values, rate=sound_octave_up.sampling_frequency)

In [7]:
# Run pitch analysis on the sound with parselmouth's default settings.
# NOTE(review): no later visible cell reads this object, and the name `pitch` is
# reassigned to a plain number inside the point-shifting loop further down.
pitch = sound.to_pitch()

In [8]:
# Number of points currently stored in the pitch tier (used as the loop bound below).
pitch_points = call(pitch_tier, "Get number of points")

In [9]:
# Lower every pitch point by 20 Hz: read its time and value, delete it, and add a
# replacement at the same time. Point indices run from 1 to pitch_points inclusive.
# NOTE(review): this edits the same pitch_tier object that the octave-up cell
# modified, so the result depends on which cells ran before it. The loop also
# overwrites the module-level `pitch` and `time` names.
for point in range(1, pitch_points + 1):
    time = call(pitch_tier, "Get time from index", point)
    pitch = call(pitch_tier, "Get value at index", point)
    # presumably the re-added point lands at the same index because its time is unchanged
    call(pitch_tier, "Remove point", point)
    call(pitch_tier, "Add point", time, pitch - 20)

In [10]:
# Write the shifted tier back into the manipulation object.
call([pitch_tier, manipulation], "Replace pitch tier")

In [11]:
# Resynthesise audio from the manipulation with Praat's overlap-add method.
shifted_sound = call(manipulation, "Get resynthesis (overlap-add)")

In [12]:
# Play the pitch-shifted resynthesis inline.
Audio(data=shifted_sound.values, rate=shifted_sound.sampling_frequency)

In [13]:
def shift_pitch(audio_path, shift_amount=20, time_step=0.01, pitch_floor=75, pitch_ceiling=600):
    """Shift every pitch point of a recording by a fixed number of Hz using Praat.

    The sound is turned into a Praat Manipulation object, each point of its pitch
    tier is moved by ``shift_amount`` Hz, and the result is resynthesised with
    Praat's overlap-add method.

    Args:
        audio_path: Path of the audio file to load.
        shift_amount: Offset in Hz added to every pitch point (negative lowers pitch).
        time_step: Time step in seconds passed to Praat's "To Manipulation".
        pitch_floor: Minimum pitch in Hz passed to Praat's "To Manipulation".
        pitch_ceiling: Maximum pitch in Hz passed to Praat's "To Manipulation".

    Returns:
        The resynthesised sound object returned by Praat.

    Raises:
        ValueError: If the shift would move any pitch point to 0 Hz or below.
    """
    sound = parselmouth.Sound(audio_path)

    # The Manipulation object runs its own pitch analysis, so no separate
    # sound.to_pitch() pass is needed.
    manipulation = call(sound, "To Manipulation", time_step, pitch_floor, pitch_ceiling)
    pitch_tier = call(manipulation, "Extract pitch tier")

    # Read all points first so the shift can be validated before anything is edited.
    n_points = int(call(pitch_tier, "Get number of points"))
    shifted_points = []
    for index in range(1, n_points + 1):
        point_time = call(pitch_tier, "Get time from index", index)
        point_hz = call(pitch_tier, "Get value at index", index)
        shifted_points.append((point_time, point_hz + shift_amount))

    if any(shifted_hz <= 0 for _, shifted_hz in shifted_points):
        raise ValueError(
            f"shift_amount={shift_amount} Hz would move at least one pitch point to 0 Hz or below"
        )

    # Replace each point in turn; the replacement is added at the time of the
    # point that was just removed.
    for index, (point_time, shifted_hz) in enumerate(shifted_points, start=1):
        call(pitch_tier, "Remove point", index)
        call(pitch_tier, "Add point", point_time, shifted_hz)

    # Put the edited tier back and resynthesise.
    call([pitch_tier, manipulation], "Replace pitch tier")
    return call(manipulation, "Get resynthesis (overlap-add)")

# Usage example:
# shifted = shift_pitch("input.wav")
# shifted.save("output_shifted.wav")

In [2]:
def shift_pitch_f0(audtio_path, shift_amount=120):
    """Shift a recording's pitch so that its mean F0 moves by ``shift_amount`` Hz.

    The mean F0 of the voiced frames is estimated with pYIN, the requested Hz
    offset is converted into a single semitone interval relative to that mean,
    and the whole signal is shifted by that interval.

    Args:
        audtio_path: Path of the audio file to load (parameter spelling kept
            as-is so existing keyword callers keep working).
        shift_amount: Desired change of the mean F0 in Hz (negative lowers pitch).

    Returns:
        Tuple ``(shifted, sr)``: the pitch-shifted signal and its sample rate.

    Raises:
        ValueError: If no voiced frame is found, or if ``shift_amount`` would move
            the mean F0 to 0 Hz or below.
    """
    y, sr = librosa.load(audtio_path)

    # Estimate the F0 contour.
    f0, voiced_flag, _ = librosa.pyin(
        y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')
    )

    # Keep only voiced, finite estimates; the mean of an empty selection is NaN.
    voiced_f0 = np.asarray(f0, dtype=float)[np.asarray(voiced_flag, dtype=bool)]
    voiced_f0 = voiced_f0[np.isfinite(voiced_f0)]
    if voiced_f0.size == 0:
        raise ValueError("no voiced frames found; cannot estimate the mean f0")
    mean_f0 = voiced_f0.mean()

    # Convert the Hz offset into a frequency ratio; it must be positive for log2.
    ratio = 1 + shift_amount / mean_f0
    if ratio <= 0:
        raise ValueError(
            f"shift_amount={shift_amount} Hz would move the mean f0 "
            f"({mean_f0:.1f} Hz) to 0 Hz or below"
        )
    n_steps = 12 * np.log2(ratio)

    # Apply one global pitch shift.
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=float(n_steps))
    return shifted, sr

In [7]:
# Lower the recording by 20 Hz relative to its mean F0 using the librosa-based helper.
# NOTE(review): hard-coded absolute, user-specific path; it also differs from the
# "~/data/dnd/..." path used when loading the file with parselmouth earlier on.
shifted_audio, sr = shift_pitch_f0("/Users/ojas/projects/data/Session_1/c/c-clean-30.wav", shift_amount=-20)

In [8]:
# Play the librosa pitch-shifted signal inline.
Audio(data=shifted_audio, rate=sr)

In [4]:
def f0_limited(audio_path, fmin_hz=65, fmax_hz=400, smoothing_window=51, polyorder=3):
    """Extract an F0 contour, clip it to a frequency range and smooth it.

    pYIN marks unvoiced frames with NaN. Feeding those NaNs straight into a
    Savitzky-Golay filter would wipe out every frame within half a window of an
    unvoiced frame, so the gaps are bridged by linear interpolation before
    smoothing and set back to NaN afterwards.

    Args:
        audio_path: Path of the audio file to load.
        fmin_hz: Lower clipping bound for F0 in Hz.
        fmax_hz: Upper clipping bound for F0 in Hz.
        smoothing_window: Savitzky-Golay window length in frames. It is reduced
            to the largest odd length that fits when the contour is shorter.
        polyorder: Polynomial order of the Savitzky-Golay filter.

    Returns:
        Tuple ``(y, sr, f0_smoothed, voiced_flag)``: the signal, its sample rate,
        the clipped and smoothed F0 contour (NaN where pYIN found no pitch) and
        pYIN's per-frame voiced flag.
    """
    y, sr = librosa.load(audio_path)

    # Estimate the F0 contour.
    f0, voiced_flag, _ = librosa.pyin(
        y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7')
    )

    # Clip to the requested range; NaN entries stay NaN.
    f0_clipped = np.clip(np.asarray(f0, dtype=float), fmin_hz, fmax_hz)
    has_pitch = np.isfinite(f0_clipped)

    f0_smoothed = np.full(f0_clipped.shape, np.nan)
    if not has_pitch.any():
        # Nothing to smooth: the contour is entirely unvoiced (or empty).
        return y, sr, f0_smoothed, voiced_flag

    # Bridge unvoiced gaps so the filter only ever sees finite values.
    frame_index = np.arange(f0_clipped.size)
    contour = np.interp(frame_index, frame_index[has_pitch], f0_clipped[has_pitch])

    # The window must be odd, no longer than the contour and longer than polyorder.
    window = min(int(smoothing_window), contour.size)
    if window % 2 == 0:
        window -= 1
    if window > polyorder:
        contour = savgol_filter(contour, window, polyorder)
        # The polynomial fit can overshoot slightly; keep the result inside the range.
        contour = np.clip(contour, fmin_hz, fmax_hz)

    # Report values only where pYIN actually produced an estimate.
    f0_smoothed[has_pitch] = contour[has_pitch]
    return y, sr, f0_smoothed, voiced_flag

In [None]:
def limit_f0_variability(audio_path, max_deviation_hz=50, smoothing_window=51, polyorder=3):
    """Limit F0 excursions in speech to a band around the mean F0.

    The F0 contour from ``f0_limited`` is clipped to ``mean_f0 +/- max_deviation_hz``.
    Every analysis frame whose F0 falls outside that band is pitch-shifted by the
    ratio needed to bring it back, and the frames are recombined with a
    Hann-weighted, normalised overlap-add.

    Args:
        audio_path: Path to audio file.
        max_deviation_hz: Maximum allowed deviation from mean F0 in Hz.
        smoothing_window: Window size for smoothing (must be odd).
        polyorder: Order of polynomial for smoothing.

    Returns:
        Tuple ``(shifted_audio, sr)``: the processed signal (same length as the
        input) and its sample rate. The input is returned unchanged (as a copy)
        when it is shorter than one frame or contains no voiced frames.
    """
    # Framing constants; these equal librosa.pyin's default frame_length and hop_length.
    frame_length = 2048
    hop_length = 512

    # f0_limited also returns the voiced flag; only the first three values are used here.
    y, sr, f0 = f0_limited(
        audio_path, smoothing_window=smoothing_window, polyorder=polyorder
    )[:3]
    y = np.asarray(y)
    f0 = np.asarray(f0, dtype=float)

    # Voiced frames are those with a finite, positive F0 (unvoiced frames are NaN).
    voiced_mask = np.nan_to_num(f0, nan=0.0, posinf=0.0, neginf=0.0) > 0
    if y.shape[0] < frame_length or not voiced_mask.any():
        return y.copy(), sr

    # Allowed band around the mean F0 of the voiced frames.
    mean_f0 = f0[voiced_mask].mean()
    f0_target = np.clip(f0, mean_f0 - max_deviation_hz, mean_f0 + max_deviation_hz)

    # Per-frame frequency ratio; 1.0 means "leave this frame alone".
    shift_ratio = np.ones_like(f0)
    shift_ratio[voiced_mask] = f0_target[voiced_mask] / f0[voiced_mask]

    # pYIN frames are centred on t * hop_length by default, whereas the frames cut
    # here start at j * hop_length, so their centres sit this many hops later.
    centre_offset = frame_length // (2 * hop_length)

    window = np.hanning(frame_length)
    accumulated = np.zeros(y.shape[0], dtype=float)
    weight = np.zeros(y.shape[0], dtype=float)

    for j, start in enumerate(range(0, y.shape[0] - frame_length + 1, hop_length)):
        stop = start + frame_length
        frame = y[start:stop]

        ratio_index = j + centre_offset
        ratio = shift_ratio[ratio_index] if ratio_index < shift_ratio.shape[0] else 1.0

        # Only shift frames that need it; unvoiced and in-band frames pass through
        # untouched, which also avoids needless phase-vocoder artefacts.
        if np.isfinite(ratio) and ratio > 0 and not np.isclose(ratio, 1.0):
            frame = librosa.effects.pitch_shift(
                frame, sr=sr, n_steps=float(12 * np.log2(ratio))
            )

        accumulated[start:stop] += window * frame
        weight[start:stop] += window

    # Normalise by the summed window; samples that no frame covers with
    # non-zero weight keep their original value.
    shifted_audio = y.astype(float, copy=True)
    covered = weight > 1e-8
    shifted_audio[covered] = accumulated[covered] / weight[covered]

    return shifted_audio.astype(y.dtype, copy=False), sr