In [2]:
import numpy as np
import tensorflow as tf
from glob import glob
import IPython
import librosa
import matplotlib.pyplot as plt
import torchaudio
from os import listdir
from os.path import isfile, join
import re

from utils import f0, extract_formants

# Generate spectrograms

In [3]:
hop=256               #hop size (window size = 4*hop)
sr=16000             #sampling rate
n_mels= hop
n_stft= (513-1)*2
min_level_db=-100     #reference values to normalize data
ref_level_db=20


shape=128           #length of time axis of split specrograms         
spec_split=1

In [4]:
import torch
from torchaudio.transforms import Spectrogram

torch.set_default_tensor_type('torch.FloatTensor')

specobj = Spectrogram(n_fft=4*hop, win_length=4*hop, hop_length=hop, pad=0, power=2, normalized=False)
specfunc = specobj.forward

def specfunc(waveform):
  specgram = specfunc(waveform)
  return specgram

def normalize(S):
  return np.clip((((S - min_level_db) / -min_level_db)*2.)-1., -1, 1)

def prep(wv, hop=192):
  S = np.array(torch.squeeze(specfunc(torch.Tensor(wv).view(1,-1))).detach().cpu())
  S = librosa.power_to_db(S)-ref_level_db
  return normalize(S)


In [9]:
#Generate spectrograms from waveform array
def tospec(data):
  spectro = []
  for awv in data:
    spec = prep(awv)
    spectro.append(spec)
  return np.array(spectro, dtype=np.float32)
    

## Waveform array from path of folder containing wav files
# def audio_array(path, gender):
#   ls = glob(f'{path}/*.wav')
#   adata = []
#   ascr = []
#   aftr = []

#   for i in range(len(ls)):
#     x, sr = tf.audio.decode_wav(tf.io.read_file(ls[i]), 1)
#     x = np.array(x, dtype=np.float32)

#     time = 1
#     length = len(x)/sr

#     while time < length:
#       sample = x[(time-1)*sr:time*sr]
#       src = f0(sample, gender)
#       ftr = extract_formants(sample, gender)
#       adata.append(sample)
#       ascr.append(src)
#       ftr.append(ftr)
#       time += 1

#   return np.array(adata), np.array(ascr), np.array(aftr)

def single_split(audio, from_min, to_max):
    t1 = from_min * 1000
    t2 = to_max * 1000
    split_audio = audio[t1:t2]
    return split_audio


def audio_array(path, gender, split_interval=1):
    adata = []
    ascr = []
    aftr = []

    for f in listdir(path):
        if isfile(join(path, f)) and re.match(r'(.)*\.wav\b', f):
            audio = AudioSegment.from_wav(f)
            total_sec = math.floor(audio.duration_seconds)
            
            for i in range(0, total_sec, split_interval):
                oneSecAudio = single_split(audio, i, i+split_interval)
                sound = parselmouth.Sound(oneSecAudio) # Transform the file into a parselmouth object sound
                adata.append(oneSecAudio) 
                ascr.append(f0(sound, gender))
                aftr.append(extract_formants(sound, gender))

    return np.array(adata), np.array(ascr), np.array(aftr) 

In [8]:
gender = ['male', 'female']

for g in gender:
  audio_directory = './Dataset/wav/' + g
  array_file = './Dataset/array/' + g

  awv, source, formant = audio_array(audio_directory, g)
  aspec = tospec(awv)

  print(aspec.shape)

  np.save(array_file + '_spec', aspec)
  np.save(array_file + '_f0', source)
  np.save(array_file + '_frt', formant)

PraatError: To analyse this Sound, “minimum pitch” must not be less than 132300 Hz.
Sound "untitled": pitch analysis not performed.

# Compare results

In [None]:
from utils import print_stats, plot_waveform, plot_specgram_from_wave, plot_specgram
import torchaudio

SAMPLE_WAV_SPEECH_PATH = "./Dataset/wav/female/arctic_a0001.wav"


waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)
print(np.shape(waveform))
plt.clf()
print_stats(waveform, sample_rate=sample_rate)
plot_waveform(waveform, sample_rate)
plot_specgram_from_wave(waveform, sample_rate)

In [None]:
import librosa
import soundfile as sf


def to_audio(x):
  audio_signal = librosa.core.spectrum.griffinlim(abs_spectrogram)
  return audio_signal

""" Spectrogram transformation without normalisation """
sig, fs = librosa.core.load("./Dataset/wav/male/arctic_a0001.wav", sr=16000)
abs_spectrogram = np.abs(librosa.core.spectrum.stft(sig))
plot_specgram(abs_spectrogram, 16000)
audio_signal = librosa.core.spectrum.griffinlim(abs_spectrogram)
sf.write('test_normal.wav', data=audio_signal, samplerate=16000)

audio_signal = awv[0]
sf.write('test_sec_cut.wav', data=audio_signal, samplerate=16000)



plot_specgram(aspec[0], sr)
audio_signal = librosa.core.spectrum.griffinlim(aspec[0])
sf.write('test_normalised.wav', data=audio_signal, samplerate=16000)