In [1]:
import os
import torch
import torchaudio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Scan both dataset folders and record, for every file, its sample rate,
# channel count and number of samples per channel.
wav_paths = ['./bvc/one_sentence/one_sentence', './bvc/multiple_sentences/multiple_sentences']
channels, rates, samples = [], [], []
for wav_path in wav_paths:
    # os.listdir returns every directory entry in arbitrary order, so the row
    # order of the resulting frame is not guaranteed to be stable across runs.
    for wav in os.listdir(wav_path):
        # torchaudio.load returns (tensor, sample_rate); the tensor is indexed
        # below as shape[0] -> channels, shape[1] -> samples.
        sample, rate = torchaudio.load(os.path.join(wav_path, wav))
        rates.append(int(rate))
        channels.append(int(sample.shape[0]))
        samples.append(int(sample.shape[1]))
# NOTE(review): the loop variables (`wav_path`, `wav`, `sample`, `rate`) stay
# defined in the kernel namespace after the loop and hold the values of the
# last file read; later cells should not rely on them.
df = pd.DataFrame({
    'rates': rates,
    'channels': channels,
    'samples': samples
    })
df

Unnamed: 0,rates,channels,samples
0,48000,2,87891
1,48000,2,160811
2,48000,2,76330
3,48000,2,98474
4,48000,2,78471
...,...,...,...
2444,48000,2,57039
2445,48000,2,53383
2446,48000,2,97172
2447,48000,2,78103


In [17]:
# How many files share each sample rate, and each channel count.
(df['rates'].value_counts(), df['channels'].value_counts())

(48000    2396
 44100      53
 Name: rates, dtype: int64,
 2    2449
 Name: channels, dtype: int64)

In [13]:
# Mean sample count divided by mean sample rate (i.e. seconds), first over all
# files and then restricted to the files recorded at 48000 Hz.
(
    df['samples'].mean() / df['rates'].mean(),
    df.loc[df['rates'] == 48000, 'samples'].mean() / df.loc[df['rates'] == 48000, 'rates'].mean(),
)

(1.8239356838322456, 1.7979328655397886)

In [5]:
def resample(sig, sr, new_sr):
    """Resample a waveform from `sr` to `new_sr`.

    Args:
        sig: Waveform tensor with time on the last axis (e.g. ``(channels, time)``).
        sr: Sample rate of `sig` in Hz.
        new_sr: Desired sample rate in Hz.

    Returns:
        Tuple ``(signal, sample_rate)``. When `sr` already equals `new_sr`
        the input tensor is returned unchanged.
    """
    if sr == new_sr:
        return sig, sr
    # Use the `sr` argument as the source rate (not a notebook-level global),
    # so the function works on a fresh kernel and for any input file.
    # Resample operates on the last (time) axis, so every channel is handled
    # in one call regardless of how many channels there are.
    resampler = torchaudio.transforms.Resample(sr, new_sr)
    return resampler(sig), new_sr

In [6]:
def resize(sig, sr, max_ms):
    """Truncate or zero-pad `sig` so that it lasts exactly `max_ms` milliseconds.

    Args:
        sig: 2-D tensor whose shape is unpacked as ``(num_ch, sig_len)``.
        sr: Sample rate of `sig` in Hz.
        max_ms: Target duration in milliseconds.

    Returns:
        Tuple ``(signal, sr)`` where the signal has ``sr * max_ms // 1000``
        samples on its last axis. Longer inputs are cut at the end; shorter
        inputs are padded with zeros, split at a random position between the
        beginning and the end.
    """
    num_ch, sig_len = sig.shape
    # Multiply before dividing so rates that are not a multiple of 1000
    # (e.g. 44100 Hz) do not lose samples to integer truncation.
    max_len = int(sr * max_ms // 1000)

    if sig_len > max_len:
        return sig[:, :max_len], sr
    if sig_len == max_len:
        # Already the right length: nothing to pad or cut.
        return sig, sr

    missing = max_len - sig_len
    pad_begin_len = np.random.randint(0, missing)
    pad_end_len = missing - pad_begin_len
    # new_zeros keeps the padding on the same dtype/device as the signal.
    pad_begin = sig.new_zeros((num_ch, pad_begin_len))
    pad_end = sig.new_zeros((num_ch, pad_end_len))
    return torch.cat((pad_begin, sig, pad_end), 1), sr

In [7]:
def time_shift(sig, sr, shift_limit):
    """Circularly shift a waveform along time by a random amount.

    Args:
        sig: 2-D tensor; ``sig.shape[1]`` is used as the signal length.
        sr: Sample rate, passed through unchanged.
        shift_limit: Upper bound of the shift as a fraction of the signal
            length; the actual shift is drawn uniformly from
            ``[0, shift_limit * sig_len)``.

    Returns:
        Tuple ``(shifted_signal, sr)``.
    """
    sig_len = sig.shape[1]
    shift_amt = int(np.random.random() * shift_limit * sig_len)
    # Roll along the time axis only. Without `dims`, torch flattens the tensor
    # first, so samples would wrap from one channel into the other.
    return sig.roll(shift_amt, dims=1), sr

In [8]:
def mel_spectrogram(sig, sr, n_mels=64, n_fft=1024, hop_len=None):
    """Convert a waveform into a decibel-scaled mel spectrogram.

    Args:
        sig: Waveform tensor handed to ``MelSpectrogram``.
        sr: Sample rate of `sig` in Hz.
        n_mels: Number of mel filter banks.
        n_fft: FFT size.
        hop_len: Hop length between frames; ``None`` is passed through to
            torchaudio as-is.

    Returns:
        The spectrogram in dB, clipped to a range of 80 dB below the peak;
        expected layout is [channel, n_mels, time].
    """
    top_db = 80
    to_mel = torchaudio.transforms.MelSpectrogram(
        sr,
        n_fft=n_fft,
        hop_length=hop_len,
        n_mels=n_mels,
    )
    to_db = torchaudio.transforms.AmplitudeToDB(top_db=top_db)
    return to_db(to_mel(sig))