In [1]:
import util
import soundFiles
import librosa
from IPython.display import Audio
import mir_eval.sonify
import numpy as np
import soundfile

# Load trained SIREN model

Because the state dictionaries are stored instead of the full pytorch models, it is necessary
to create a Siren model with the specified number of layers, layer size, and w0 parameter in order to load 
the model's state dictionary from the file located at the given path.  

Below paths are provided for state dictionaries for models trained on 1 second segments. 
Pruned models and models used for stitching can be found in the state_dicts directory

In [2]:
# Outcomment below for the path and setup for loading a classical model
path = "state_dicts/classic/1sec/original/clipClassicPSNR.pt"
model = util.load_model_from_path(path,5,78,50)

#path = "state_dicts/classic/1sec/original/clipClassicViSQOL.pt" # (Best ViSQOL, not the model used primarily throghout the testing)
#model = util.load_model_from_path(path,17,43,49) # (Best ViSQOL, not the model used primarily throghout the testing)

# Outcomment below for the path and setup for loading the rock model
#path = "state_dicts/rock/1sec/original/clipRockPSNR.pt"
#model = util.load_model_from_path(path, 6,70,42)

#path = "state_dicts/rock/1sec/original/clipRockPSNR.pt" # (Best PSNR, not the model used primarily throghout the testing)
#model = util.load_model_from_path(path,17,43,49) # (Best PSNR, not the model used primarily throghout the testing)

# Outcomment below for the path and setup for loading the pop model
#path = "state_dicts/pop/Original/Pop.pt"
#model = util.load_model_from_path(path, 21,38,26)

# Outcomment below for the path and setup for loading the speech model
#path = "state_dicts/Speech/Original/Speech.pt"
#model = util.load_model_from_path(path, 8,35,47)

# Load audio waveform(s)

In [3]:
# The duration is changed to 10 for the reconstructed, stitched reconstruction samples.
duration = 1 

# The audioType must be "audio" for testing with Classical, Rock and Pop, and must be changed to "speech" for testing speech samples.
audioType = 'audio'
sr_dict = {'speech':16000, 'audio':48000}

# A dictionary leading to the paths of the baseline audio samples and codecs is used
# Available names are specified in the soundFiles script. 
# For example, "clipClassic" is the original audio of the classical sample, 
# "clipClassicMP3" is the MP3 compressed version of that and "clipClassic1secMP3" is the MP3 compressed version of 1 sec of the original audio.

pathAudio = soundFiles.soundfile("clipClassic")
pathAudioComp = soundFiles.soundfile("clipSpeech1secMP3") 

origWaveform,sr = librosa.load(pathAudio, mono=True, sr=sr_dict[audioType], duration=duration)
compWaveform,sr = librosa.load(pathAudioComp, mono=True, sr=sr_dict[audioType], duration=duration)

# For generating the waveform from a model and preparing it for audio playback
model_waveform = util.playAudioFromModel(model,origWaveform,duration = 1, sr = sr)

# For audio playback, change the data parameter below to the desired waveform
Audio(data=model_waveform, rate=sr)

# Compare model with waveform
For calculating ViSQOL/ViSQOLAudio and PSNR metrics between the reconstructed waveform from a trained SIREN model and an audio waveform

In [4]:
util.compareModelWithWaveform(model,origWaveform, duration = duration, sr=sr,form=audioType)

(2.2806992987665944, 33.25667858123779)

# Compare audio waveforms
For calculating ViSQOL/ViSQOLAudio and PSNR metrics between the two audio waveforms

In [5]:
util.compareWaveformWithWaveform(origWaveform, compWaveform, sr, mode=audioType)

(1.0, 20.098495483398438)

# Apply quantization to trained SIREN models
For applying 16/14/12-bit uniform quantization and half-precision quantization to a model. The function also compares it with an original waveform by calculating ViSQOL/ViSQOLAudio and PSNR metrics between the reconstructed waveform from the quantized models and an audio waveform

In [6]:
util.quantizeAndCompare(model, origWaveform, sr, duration, mode=audioType)

Uniform 16 bit: ViSQOLScore: 2.2744950475047827, PSNR: 32.77721881866455, bitrate: 388.796875
Uniform 14 bit: ViSQOLScore: 2.2624014886775075, PSNR: 28.130452632904053, bitrate: 340.197265625
Uniform 12 bit: ViSQOLScore: 2.2021506988470034, PSNR: 20.31379461288452, bitrate: 291.59765625
HALF Precision: ViSQOLScore: 2.2529701340460453, PSNR: 28.770418167114258, bitrate: 388.796875


Alternatively, uniform quantization can be applied directly to a model with arbitrary bit depth

In [7]:
quant_bit_depth = 11
quanModel = util.modelUniformQuantization(model, quant_bit_depth)
quant_waveform = util.playAudioFromModel(quanModel,origWaveform,sr, duration)
Audio(data=quant_waveform, rate=sr)

# Apply quantization to audio waveforms
For applying uniform quantization to an audio waveform. The function also compares it with the original waveform by calculating ViSQOL/ViSQOLAudio and PSNR metrics between the quantized waveform and the original audio waveform

In [8]:
quant_bit_depth = 16
util.quantizeAudioSampleAndCompare(origWaveform, mode = audioType, sample_rate = sr_dict[audioType], bits = quant_bit_depth)

quant_bit_depth = 8
util.quantizeAudioSampleAndCompare(origWaveform, mode = audioType, sample_rate = sr_dict[audioType], bits = quant_bit_depth)


Uniform Quantization 16 bit: ViSQOLScore: 4.7320990111117, PSNR: 26.944007188966985
Uniform Quantization 8 bit: ViSQOLScore: 4.297257667764697, PSNR: 26.649175105262287
