In [46]:
import numpy as np
import tensorflow as tf
from glob import glob
import IPython
import time
import librosa
import os
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import torchaudio

from os import listdir
from os.path import isfile, join
import re
from pydub import AudioSegment
import math

import parselmouth 
from parselmouth import praat

from utils import f0, extract_formants

In [47]:
# Spectrogram / preprocessing configuration shared by the cells below.
hop=256               #hop size (window size = 4*hop)
sr=16000             #sampling rate
min_level_db=-100     #dB floor used by normalize() to rescale values into [-1, 1]
ref_level_db=20       #dB offset subtracted in prep() before normalization


shape=128           #length of time axis of split spectrograms
spec_split=1          #NOTE(review): not referenced in the visible cells - confirm it is still needed

In [48]:
import torch
from torchaudio.transforms import MelScale, Spectrogram

# Make float32 the default floating-point dtype for newly created tensors.
# torch.set_default_tensor_type() is deprecated in recent PyTorch releases;
# set_default_dtype() is the supported replacement.
torch.set_default_dtype(torch.float32)

# Power spectrogram (power=2) whose FFT size and window length are both 4*hop
# samples, advanced by `hop` samples per frame.
specobj = Spectrogram(n_fft=4*hop, win_length=4*hop, hop_length=hop, pad=0, power=2, normalized=False)
# Bound forward method, kept under this name because melspecfunc() calls it.
specfunc = specobj.forward

def melspecfunc(waveform):
  """Return the spectrogram of `waveform` as computed by `specfunc`.

  Despite the name, no mel projection is applied: the mel-scale step is
  disabled, so the output of `specfunc` is returned unchanged.
  """
  return specfunc(waveform)

def normalize(S):
  """Rescale dB values in `S` to the range [-1, 1].

  `min_level_db` maps to -1 and 0 dB maps to +1; anything outside that
  span is clipped.
  """
  unit_scaled = (S - min_level_db) / -min_level_db
  return np.clip(unit_scaled * 2. - 1., -1, 1)

def prep(wv, hop=192):
  """Turn a waveform into a normalized dB spectrogram.

  Args:
    wv: waveform samples; flattened into a single (1, n) tensor.
    hop: unused. The hop length actually applied is the one baked into the
      module-level spectrogram transform.

  Returns:
    NumPy array of spectrogram values in dB, shifted by `ref_level_db`
    and rescaled by `normalize`.
  """
  waveform = torch.Tensor(wv).view(1, -1)
  power_spec = torch.squeeze(melspecfunc(waveform)).detach().cpu()
  db_spec = librosa.power_to_db(np.array(power_spec)) - ref_level_db
  return normalize(db_spec)

In [49]:
## Split audios into chunks of appropriate duration 
class SplitWavAudio():
    '''
    Split one WAV file into consecutive chunks of a fixed duration.

    Chunks are written to a sibling folder named ``<folder>_cut``; each chunk
    file is named ``<start_second>_<original filename>``.
    '''
    def __init__(self, folder, filename):
        '''
        Load ``folder/filename`` as a pydub AudioSegment.

        folder:   directory containing the source file
        filename: name of the WAV file inside ``folder``
        '''
        self.folder = folder
        self.new_folder = folder + '_cut'
        self.filename = filename
        self.filepath = folder + '/' + filename

        self.audio = AudioSegment.from_wav(self.filepath)

    def get_duration(self):
        '''Return the duration of the loaded audio in seconds.'''
        return self.audio.duration_seconds

    def single_split(self, from_min, to_max, split_filename):
        '''
        Export the part of the audio between two offsets as a WAV file.

        from_min:       start offset, in seconds (despite the name)
        to_max:         end offset, in seconds (despite the name)
        split_filename: name of the file written inside ``self.new_folder``
        '''
        # Create the output folder on demand so the export cannot fail just
        # because the caller forgot to create it.
        os.makedirs(self.new_folder, exist_ok=True)
        # Offsets are converted to milliseconds before slicing the segment.
        start_ms = from_min * 1000
        end_ms = to_max * 1000
        chunk = self.audio[start_ms:end_ms]
        chunk.export(self.new_folder + '/' + split_filename, format="wav")

    def multiple_split(self, split_interval):
        '''
        Split the audio into chunks of ``split_interval`` seconds.

        Chunk start offsets run from 0 up to (but excluding) the duration
        rounded down to whole seconds, so a trailing fraction of a second
        never starts a chunk of its own.
        '''
        total_sec = math.floor(self.get_duration())
        for i in range(0, total_sec, split_interval):
            split_fn = str(i) + '_' + self.filename
            self.single_split(i, i+split_interval, split_fn)
            print(str(i) + ' Done')
        # Report completion once every chunk has been written. Previously this
        # was tied to `i == total_sec - split_interval`, which is never true
        # when the duration is not an exact multiple of the interval.
        if total_sec > 0:
            print('All splited successfully')


def folder_split(folder, split_interval=1):
    '''
    Split every ``.wav`` file directly inside ``folder`` into chunks.

    folder:         directory to scan (sub-directories are not visited)
    split_interval: chunk length in seconds, passed to
                    ``SplitWavAudio.multiple_split``
    '''
    for f in listdir(folder):
        # Require the name to END with ".wav". The previous unanchored regex
        # also accepted names such as "take1.wav.bak", which are not WAV files.
        if not f.endswith('.wav'):
            continue
        if not isfile(join(folder, f)):
            continue
        split_wav = SplitWavAudio(folder, f)
        split_wav.multiple_split(split_interval)

In [50]:
gender = ['male', 'female']

# Cut each gender's recordings into 1-second chunks. The presence of the
# "<gender>_cut" folder is used as the marker that splitting was already done.
for g in gender:
    repository = f'Dataset/wav/{g}'
    chunks_repository = f'{repository}_cut'
    if os.path.exists(chunks_repository):
        continue
    os.makedirs(chunks_repository)
    folder_split(repository, split_interval=1)

In [51]:
def spec_array(path, gender, array_file):
    """Extract features from every ``.wav`` file in ``path`` and save them.

    For each file three things are collected: the normalized spectrogram
    from ``prep``, the result of ``f0(sound, gender)``, and the formant data
    from ``extract_formants(sound, gender)``. The formant entries are passed
    through ``pad_formants`` before saving.

    Args:
        path: directory containing the audio chunks.
        gender: forwarded unchanged to ``f0`` and ``extract_formants``.
        array_file: path prefix for the outputs; ``_spec``, ``_f0`` and
            ``_frt`` are appended and ``np.save`` adds the ``.npy`` suffix.

    Prints the shape of the stacked spectrogram array.
    """
    adata = []  # spectrograms
    ascr = []   # f0 results
    aftr = []   # formant results

    # Running maximum of the point count reported by extract_formants, with a
    # floor of 400. It must be initialised once, outside the loop: resetting
    # it per file discards the maximum seen so far and leaves it unbound when
    # the folder contains no audio.
    nb_points = 400

    for f in listdir(path):
        audio_path = join(path, f)
        # Only real ".wav" names; an unanchored pattern would also accept
        # names like "x.wav.bak".
        if not (isfile(audio_path) and f.endswith('.wav')):
            continue

        # decode_wav also returns the sample rate; it is not needed here and
        # is discarded rather than bound to a local that shadows the global sr.
        awv, _ = tf.audio.decode_wav(tf.io.read_file(audio_path), 1)
        awv = np.array(awv, dtype=np.float32)
        adata.append(prep(awv))

        sound = parselmouth.Sound(audio_path)  # parselmouth Sound object for the pitch/formant helpers
        ascr.append(f0(sound, gender))
        # NOTE(review): nb is presumably the number of points in frt - confirm in utils.
        frt, nb = extract_formants(sound, gender)
        aftr.append(frt)

        nb_points = max(nb_points, nb)

    aftr = pad_formants(aftr, nb_points)

    adata, ascr, aftr = np.array(adata), np.array(ascr), np.array(aftr)

    np.save(array_file + '_spec', adata)
    np.save(array_file + '_f0', ascr)
    np.save(array_file + '_frt', aftr)

    print(adata.shape)


def reduce_formants(formant, nb_points):
    """Truncate each formant track to its first ``nb_points`` entries.

    NaN values in the kept part are replaced by 0.0. The replacement is done
    without copying, so when a track is a NumPy array the NaNs in its first
    ``nb_points`` entries are also overwritten in the caller's array.

    Returns the truncated tracks stacked into one NumPy array.
    """
    return np.array([
        np.nan_to_num(track[:nb_points], copy=False, nan=0.0)
        for track in formant
    ])

def pad_formants(formant, nb_points):
    """Zero-pad every formant track to a common length.

    Args:
        formant: sequence of tracks (array-likes) of possibly different lengths.
        nb_points: minimum length of each padded track.

    Returns:
        NumPy array of the padded tracks with NaN values replaced by 0.0.

    The target length is ``nb_points``, or the longest track if that is
    larger, so the pad width can never be negative. Previously the target was
    hard-coded to 438 and ``nb_points`` was ignored, which raised ValueError
    for any track longer than 438.
    """
    longest = max((len(arr) for arr in formant), default=0)
    target = max(nb_points, longest)

    padded_frt = []
    for arr in formant:
        padded = np.pad(arr, (0, target - len(arr)), 'constant')
        # np.pad returned a fresh array, so cleaning it in place is safe.
        padded = np.nan_to_num(padded, copy=False, nan=0.0)
        padded_frt.append(padded)
    return np.array(padded_frt)

In [4]:
import numpy as np
  
  
arr = [1, 3, 9, 5, 4]
  
# append five trailing values using 'constant' mode (the fill value defaults to 0)
pad_arr = np.pad(arr, (0,5), 'constant')
  
print(pad_arr)

[1 3 9 5 4 0 0 0 0 0]


In [52]:
gender = ['male', 'female']

# Build the feature arrays for one gender at a time.
g = gender[0]
audio_directory = './Dataset/wav/' + g + '_cut'
array_file = './Dataset/array/' + g
# np.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(array_file), exist_ok=True)
# Pass the selected label `g`, not the whole `gender` list: spec_array
# forwards this argument to the per-file pitch/formant helpers.
# NOTE(review): confirm against utils.f0 / utils.extract_formants.
spec_array(audio_directory, g, array_file)

# To process both genders:
# for g in gender:
#     audio_directory = './Dataset/wav/' + g + '_cut'
#     array_file = './Dataset/array/' + g
#     spec_array(audio_directory, g, array_file)

  adata, ascr, aftr = np.array(adata), np.array(ascr), np.array(aftr)


(2489, 513, 63)


In [53]:
audio_dir = './Dataset/array/'

# Reload the saved male feature arrays. allow_pickle=True lets np.load read
# object arrays; only use it on files produced by this notebook.
# NOTE(review): the name `f0` rebinds the `f0` function imported from utils,
# so spec_array() cannot be re-run after this cell without re-importing it.
f0, frt, spec = (
    np.load(audio_dir + array_name, allow_pickle=True)
    for array_name in ('male_f0.npy', 'male_frt.npy', 'male_spec.npy')
)



In [54]:
# Shapes of the spectrogram and f0 arrays, one per line.
print(spec.shape, f0.shape, sep='\n')

default = f0[0].shape

# To list f0 entries whose shape differs from the first one:
# for e in f0:
#   if e.shape != default:
#     print(e.shape)

# Shape of the formant array and of its first entry, one per line.
print(frt.shape, frt[0].shape, sep='\n')
# for e in frt:
#   print(e.shape)

(2489, 513, 63)
(2489, 197)
(2489,)
(91,)
