In [1]:
from glob import glob
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
import soundfile as sf
import pandas as pd
from matplotlib import cm
import math
import matplotlib.colors as colors

# import cv2
from PIL import Image as PilImage
from IPython.display import display
from datasets import Dataset, load_dataset, Image, Audio, Features, DatasetInfo
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# cmap = colors.LinearSegmentedColormap.from_list("my_cmap", ["black", "purple", "blue", "green", "yellow", "orange", "red", "white"])
cmap = cm.get_cmap('hsv')
norm = colors.Normalize(vmin=-80, vmax=0, clip=True)
mapper = cm.ScalarMappable(norm=norm, cmap=cmap)

In [3]:
audio_file_path = "./datasets/ERC-50/audio/3-130330-A-22.wav"

y, sr = librosa.core.load(audio_file_path, sr=22050)
print(y.shape)
melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=256, hop_length=431)
melspec = librosa.power_to_db(melspec, ref=np.max)

melspec = (melspec + 80) / 80 * 255
im = PilImage.fromarray(melspec.astype(np.uint8))
im.show()
im.save('image_file_path.png')

print(melspec.shape)
print(melspec)


(110250,)
(256, 256)
[[143.91809  134.58466   96.3939   ...   0.         0.         0.      ]
 [145.38023  134.00594  122.83912  ...   0.         0.         0.      ]
 [153.53192  153.0042   138.32568  ...   0.         0.         0.      ]
 ...
 [ 71.707054  64.18032   38.123325 ...   0.         0.         0.      ]
 [ 54.99279   41.94863    0.       ...   0.         0.         0.      ]
 [ 54.136528  40.550453   0.       ...   0.         0.         0.      ]]


In [4]:
# Initialize an empty 3D image array
image = np.zeros((melspec.shape[0], melspec.shape[1], 4), dtype=np.uint8)

# Iterate over the elements of the 2D array and assign the corresponding color
for i in range(melspec.shape[0]):
    for j in range(melspec.shape[1]):
        image[i, j, :] = mapper.to_rgba(melspec[i, j], bytes=True)

im = PilImage.fromarray(image)
im.show()

In [5]:
def get_value_from_cm(color, cmap, colrange):
    color = np.array(color)/255. 
    r = np.linspace(colrange[0], colrange[1], 256) 
    norm = colors.Normalize(colrange[0], colrange[1])
    mapvals = cmap(norm(r))[:, :4]
    distance = np.sum((mapvals - color) ** 2, axis=1)
    return r[np.argmin(distance)]

spec = np.zeros((im.height, im.width))
for i in range(im.height):
    for j in range(im.width):
        spec[i][j] = get_value_from_cm(im.getpixel((j, i)), cmap, colrange=[-80, 0])

print(spec.shape)
print(spec)



(256, 256)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
out_spec = librosa.db_to_power(spec)
y = librosa.feature.inverse.mel_to_audio(out_spec, sr=22050, hop_length=431)
sf.write(f"out.wav", y, sr)