Skip to content

Commit

Permalink
NVIDIA#1369 Updated DeepLearningExamples so that all projects can use latest librosa.
Browse files Browse the repository at this point in the history

	modified:   CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py
	modified:   CUDA-Optimized/FastSpeech/generate.py
	modified:   CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py
	modified:   CUDA-Optimized/FastSpeech/tacotron2/layers.py
	modified:   Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_offline_demo.ipynb
	modified:   Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_online_demo.ipynb
	modified:   PyTorch/SpeechRecognition/Jasper/requirements.txt
	modified:   PyTorch/SpeechRecognition/QuartzNet/requirements.txt
	modified:   PyTorch/SpeechRecognition/wav2vec2/requirements.txt
	modified:   PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py
	modified:   PyTorch/SpeechSynthesis/FastPitch/requirements.txt
	modified:   PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt
	modified:   PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/client/speech_ai_demo/utils/jasper/speech_utils.py
	modified:   PyTorch/SpeechSynthesis/Tacotron2/trtis_cpp/src/trt/requirements.txt
  • Loading branch information
xvdp committed Jan 16, 2024
1 parent 9dd9fcb commit 400f37f
Show file tree
Hide file tree
Showing 14 changed files with 17 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def __getitem__(self, idx):

# Audio processing
wav, _ = librosa.effects.trim(wav, frame_length=self.win_len, hop_length=self.hop_len)

if self.mels_path:
mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
else:
Expand Down
4 changes: 2 additions & 2 deletions CUDA-Optimized/FastSpeech/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import time

import fire
- import librosa
+ import soundfile
import torch

from fastspeech.data_load import PadDataLoader
Expand Down Expand Up @@ -158,7 +158,7 @@ def generate(hparam='infer.yaml',
wav = wav[:wav_len]

path = os.path.join(results_path, text[:MAX_FILESIZE] + ".wav")
- librosa.output.write_wav(path, wav, hp.sr)
+ soundfile.write(path, wav, hp.sr)

except StopIteration:
tprint("Generation has been done.")
Expand Down
2 changes: 1 addition & 1 deletion CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
# Compute the squared window at the desired length
win_sq = get_window(window, win_length, fftbins=True)
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
- win_sq = librosa_util.pad_center(win_sq, n_fft)
+ win_sq = librosa_util.pad_center(win_sq, size=n_fft)

# Fill the envelope
for i in range(n_frames):
Expand Down
1 change: 0 additions & 1 deletion CUDA-Optimized/FastSpeech/tacotron2/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
"""https://github.com/NVIDIA/tacotron2"""

import torch
- from librosa.filters import mel as librosa_mel_fn


class LinearNorm(torch.nn.Module):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,10 +511,10 @@
" \"\"\"\n",
" samples = self._convert_samples_to_float32(samples)\n",
" if target_sr is not None and target_sr != sample_rate:\n",
- " samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
+ " samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
" sample_rate = target_sr\n",
" if trim:\n",
- " samples, _ = librosa.effects.trim(samples, trim_db)\n",
+ " samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
" self._samples = samples\n",
" self._sample_rate = sample_rate\n",
" if self._samples.ndim >= 2:\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -640,10 +640,10 @@
" \"\"\"\n",
" samples = self._convert_samples_to_float32(samples)\n",
" if target_sr is not None and target_sr != sample_rate:\n",
- " samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
+ " samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
" sample_rate = target_sr\n",
" if trim:\n",
- " samples, _ = librosa.effects.trim(samples, trim_db)\n",
+ " samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
" self._samples = samples\n",
" self._sample_rate = sample_rate\n",
" if self._samples.ndim >= 2:\n",
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechRecognition/Jasper/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
inflect==5.3.0
ipdb
- librosa==0.9.0
+ librosa>=0.9.0
pandas==1.5.2
pyyaml>=5.4
soundfile
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechRecognition/QuartzNet/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
inflect==5.3.0
ipdb
- librosa==0.9.0
+ librosa>=0.9.0
pandas==1.5.2
pyyaml>=5.4
soundfile
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechRecognition/wav2vec2/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
editdistance==0.6.0
- librosa==0.10.1
+ librosa>=0.10.1
omegaconf==2.0.6 # optional for handling certain Fairseq ckpts
pyarrow==6.0.1
soundfile==0.12.1
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
global mel_basis, hann_window
fmax_key = f'{fmax}_{y.device}'
if fmax_key not in mel_basis:
- mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
mel_basis[fmax_key] = torch.from_numpy(mel).float().to(y.device)
hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/FastPitch/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
inflect
- librosa==0.9.0
+ librosa>=0.9.0
matplotlib
numpy
pynvml==11.0.0
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
inflect
- librosa==0.9.0
+ librosa>=0.9.0
numpy
pandas
pynvml==11.0.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -383,10 +383,10 @@ def __init__(self, samples, sample_rate, target_sr=16000, trim=False,
"""
samples = self._convert_samples_to_float32(samples)
if target_sr is not None and target_sr != sample_rate:
- samples = librosa.core.resample(samples, sample_rate, target_sr)
+ samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
sample_rate = target_sr
if trim:
- samples, _ = librosa.effects.trim(samples, trim_db)
+ samples, _ = librosa.effects.trim(samples, top_db=trim_db)
self._samples = samples
self._sample_rate = sample_rate
if self._samples.ndim >= 2:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
torch==1.3.0
onnx==1.5.0
scipy==1.3.1
- librosa==0.7.0
+ librosa

0 comments on commit 400f37f

Please sign in to comment.