In [1]:
import sys
sys.path.append("..")

from glob import glob
import matplotlib.pyplot as plt
import ipywidgets as ipw
from IPython.display import Audio
import numpy as np 
import pickle

from imitative_agent import ImitativeAgent
from lib.dataset_wrapper import Dataset
from lib.notebooks import show_ema
from external import lpcynet

current path: /mnt/c/Users/vpaul/OneDrive - CentraleSupelec/Inner_Speech/agent/imitative_agent


In [2]:
###############
# Test LPCNET #
###############
# Round-trip sanity check: analyse a reference wav file into LPCNet features,
# resynthesize audio from those features and save it to disk for listening.
from scipy.io import wavfile

# Read input wav file
sampling_rate, pcm = wavfile.read("../external/lpcynet/item_0000.wav")
# Verify wav file compatibility with LPCNet requirements:
# - 16kHz sampling rate
# - 16-bit integer PCM format
# The two requirements are checked separately so that a failure says which one
# was violated and what value was actually found.
assert sampling_rate == 16000, f"LPCNet expects 16kHz audio, got {sampling_rate}Hz"
assert pcm.dtype == "int16", f"LPCNet expects 16-bit integer PCM, got {pcm.dtype}"

# Extract LPCNet features from audio
# Returns a float32 numpy array of shape (frame_number, features_dimension)
# features_dimension = 20, with the first 18 numbers representing the cepstrum
# and the last 2 representing respectively the period and the correlation parameters
lpcnet_features = lpcynet.analyze_frames(pcm)

# Resynthesize audio from LPCNet features
resynthesized_pcm = lpcynet.synthesize_frames(lpcnet_features)

# Save resynthesized audio to wav file
# Reuse the validated input sampling rate (16kHz) instead of repeating the literal
wavfile.write("resynth3.wav", sampling_rate, resynthesized_pcm)

In [3]:
from glob import glob
from scipy.io.wavfile import write
from imitative_agent import ImitativeAgent
from lib.dataset_wrapper import Dataset
from lib.notebooks import show_ema
from external import lpcynet
# Collect the agent directories of this experiment in a stable (sorted) order
agents_pattern = "../out/imitative_agent1_0jerk/*/"  # Path to imitative agent directory
agents_path = sorted(glob(agents_pattern))
# Fail with an explicit message instead of a bare IndexError when nothing matches
# (e.g. wrong working directory or agents not trained yet).
if not agents_path:
    raise FileNotFoundError(f"No agent directory matches {agents_pattern!r}")
agents_path = [agents_path[0]]  # Select agent to save repetitions for
print(agents_path)

['../out/imitative_agent1_0jerk/8b2217cf868c4fefa341392f258fd81a-0/']


In [5]:
#############################################
# Save repetition results for whole dataset #
#############################################
# For every item of the dataset, let the agent repeat the item's cepstrum,
# resynthesize the repeated and the estimated sounds with LPCNet, and save
# both waveforms as wav files.
import os

# Get path to first agent
agent_path = agents_path[0]

# Load the agent from saved checkpoint
agent = ImitativeAgent.reload(agent_path)
# Not used in this cell; kept in case later cells rely on it.
synth_dataset = agent.synthesizer.dataset

# Set dataset parameters
# (these values used to be read from the agent's synthesizer config and then
# immediately overwritten; only the values actually used are kept)
dataset_name = "pb2007"  # Using PB2007 can be changed to ["pb2007", "pb2007_speedx2","msak0", "fsew0"]
sound_type = "cepstrum"  # Using cepstrum features for sound representation
art_type = "art_params"  # Using articulatory parameters
dataset = Dataset(dataset_name)

# Load sound features without cutting silences
items_cepstrum = dataset.get_items_data(sound_type, cut_silences=False)
items_source = dataset.get_items_data("source", cut_silences=False)  # Get source features for LPCNet
# NOTE(review): the LPCNet test cell states that LPCNet works at 16kHz; confirm
# that the dataset's "wav_sampling_rate" is 16000, otherwise the saved files
# carry a wrong sampling rate.
sampling_rate = dataset.features_config["wav_sampling_rate"]

# Output directories; created up front so that writing does not fail on a
# fresh checkout where they do not exist yet.
repeated_dir = f'../datasets/imitative_agent_0jerk/repeated/{dataset_name}'
estimated_dir = f'../datasets/imitative_agent_0jerk/estimated/{dataset_name}'
for output_dir in (repeated_dir, estimated_dir):
    os.makedirs(output_dir, exist_ok=True)

# Get list of items to process
items_name = dataset.get_items_list()

# Process each item in dataset
for item in items_name:
    # The item's name is the second element of each entry
    # (presumably entries are (dataset_name, item_name) pairs — TODO confirm)
    item_name = item[1]

    # Get cepstrum and source features for current item
    item_cepstrum = items_cepstrum[item_name]
    item_source = items_source[item_name]

    # Get agent's repetition of the item
    # This uses inverse model to estimate articulatory params
    repetition = agent.repeat(item_cepstrum)
    repeated_cepstrum = repetition["sound_repeated"]  # Sound from synthesizer
    estimated_cepstrum = repetition["sound_estimated"]  # Sound from direct model
    estimated_art = repetition["art_estimated"]  # Estimated articulatory parameters

    # Combine cepstrum with source features for LPCNet synthesis
    # (feature-wise concatenation: both arrays must have the same number of frames)
    repeated_sound = np.concatenate((repeated_cepstrum, item_source), axis=1)
    estimated_sound = np.concatenate((estimated_cepstrum, item_source), axis=1)

    # Synthesize waveforms using LPCNet
    repeated_wave = lpcynet.synthesize_frames(repeated_sound)
    estimated_wave = lpcynet.synthesize_frames(estimated_sound)

    # Save repeated sound (inverse model → synthesizer path)
    write(f'{repeated_dir}/{item_name}.wav', sampling_rate, repeated_wave)

    # Save estimated sound (inverse model → direct model path)
    write(f'{estimated_dir}/{item_name}.wav', sampling_rate, estimated_wave)

    # One progress line per item; replaces the "Original sound:" / "Repetition"
    # headers that were printed although no audio was displayed.
    print(f"Saved repeated and estimated sounds for item {item_name}")

{'dataset': {'batch_size': 8, 'datasplits_size': [64, 16, 20], 'names': ['pb2007'], 'num_workers': 6, 'shuffle_between_epochs': True, 'sound_type': 'cepstrum'}, 'model': {'direct_model': {'activation': 'relu', 'batch_norm': True, 'dropout_p': 0.25, 'hidden_layers': [256, 256, 256, 256]}, 'inverse_model': {'bidirectional': True, 'dropout_p': 0.25, 'hidden_size': 32, 'num_layers': 2}}, 'synthesizer': {'name': 'ea587b76c95fecef01cfd16c7f5f289d-0/'}, 'training': {'jerk_loss_ceil': 0, 'jerk_loss_weight': 0, 'learning_rate': 0.001, 'max_epochs': 500, 'patience': 25}}
{'direct_model': {'activation': 'relu', 'batch_norm': True, 'dropout_p': 0.25, 'hidden_layers': [256, 256, 256, 256]}, 'inverse_model': {'bidirectional': True, 'dropout_p': 0.25, 'hidden_size': 32, 'num_layers': 2}}


KeyboardInterrupt: 