In [141]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from PIL import Image
import resampy

In [142]:
def audio_to_spectrogram(audio_file, output_dir):
    """Render a mel spectrogram of ``audio_file`` as a PNG in ``output_dir``.

    The audio is loaded at its native sampling rate, turned into a mel power
    spectrogram, converted to dB relative to its own peak and drawn with
    ``librosa.display.specshow``.

    Args:
        audio_file: Path of the audio file to read.
        output_dir: Directory the PNG is written into. It is not created here.

    Returns:
        Path of the PNG file that was written.
    """
    # sr=None keeps the file's original sampling rate instead of resampling.
    audio, sr = librosa.load(audio_file, sr=None)

    S = librosa.feature.melspectrogram(y=audio, sr=sr)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # splitext strips only the final extension, so "clip.v2.mp3" becomes
    # "clip.v2.png" rather than collapsing to "clip.png".
    stem = os.path.splitext(os.path.basename(audio_file))[0]
    spectrogram_file = os.path.join(output_dir, stem + ".png")

    fig = plt.figure(figsize=(8, 8))  # Adjust the figure size as needed
    try:
        librosa.display.specshow(S_dB)
        fig.savefig(spectrogram_file, bbox_inches="tight", pad_inches=0)
    finally:
        # Always release the figure, even if drawing or saving fails, so
        # figures do not pile up when this is called in a loop.
        plt.close(fig)

    return spectrogram_file

In [143]:
# based on https://github.com/pietz/language-recognition/blob/master/Language%20Classifier.ipynb
def mp3_to_img(path, height=192, width=192):
    """Convert an audio file into a fixed-size mel spectrogram in dB.

    The audio is loaded with librosa, resampled to 16 kHz, and transformed
    into a mel spectrogram whose hop length is derived from the clip length
    so that the clip yields somewhat more than ``width`` frames. The centre
    ``width`` frames are returned.

    Args:
        path: Path of the audio file to read.
        height: Number of mel bands (rows of the returned array).
        width: Number of time frames (columns of the returned array).

    Returns:
        A tuple ``(img_db, fname)`` where ``img_db`` is the cropped dB-scaled
        mel spectrogram and ``fname`` is the input's base name without its
        final extension.

    Raises:
        ValueError: If the clip has too few samples to derive a positive
            hop length for the requested ``width``.
    """
    target_sr = 16000

    signal, sr = librosa.load(path)
    signal = resampy.resample(signal, sr, target_sr, res_type="kaiser_fast")

    # Size the hop for ~10% more frames than needed; the centre crop below
    # then discards roughly 5% from the start and from the end.
    hl = int(signal.shape[0] // (width * 1.1))
    if hl < 1:
        raise ValueError(
            f"{path}: audio too short ({signal.shape[0]} samples) "
            f"for a spectrogram of width {width}"
        )

    # Pass the real sampling rate: without it the mel filterbank is built
    # for librosa's default rate rather than the 16 kHz signal.
    spec = librosa.feature.melspectrogram(
        y=signal, sr=target_sr, n_mels=height, hop_length=hl
    )

    img_db = librosa.power_to_db(spec, ref=np.max)

    start = (img_db.shape[1] - width) // 2
    # splitext strips only the final extension, keeping dots inside the name.
    fname = os.path.splitext(os.path.basename(path))[0]

    return img_db[:, start : start + width], fname


# https://stackoverflow.com/questions/56719138/how-can-i-save-a-librosa-spectrogram-plot-as-a-specific-sized-image/57204349#57204349
def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale ``X`` so its smallest value maps to ``min`` and its largest to ``max``.

    Args:
        X: Array providing ``.min()`` / ``.max()`` and element-wise arithmetic.
        min: Value the smallest element of ``X`` is mapped to.
        max: Value the largest element of ``X`` is mapped to.

    Returns:
        The rescaled array. When every element of ``X`` is equal, all
        outputs are ``min``.
    """
    # The parameter names shadow the builtins, so only the array methods
    # are used inside this function.
    lo = X.min()
    span = X.max() - lo
    # A constant input has zero span; dividing by it would produce NaN
    # (0/0). Treat the span as 1 so the result is all zeros before scaling.
    if span == 0:
        span = 1
    X_std = (X - lo) / span
    X_scaled = X_std * (max - min) + min
    return X_scaled


# https://stackoverflow.com/questions/56719138/how-can-i-save-a-librosa-spectrogram-plot-as-a-specific-sized-image/57204349#57204349
def save_spec(spec, out):
    """Write ``spec`` to ``out`` as an 8-bit image via PIL.

    Args:
        spec: 2-D array of spectrogram values; rescaled to 0..255 here.
        out: Destination passed to ``Image.save``.
    """
    # Stretch the values over the full 8-bit range and truncate to uint8.
    pixels = scale_minmax(spec, 0, 255)
    pixels = pixels.astype(np.uint8)

    # Reverse the row order so the first rows of `spec` end up at the
    # bottom of the picture.
    bottom_up = np.flip(pixels, axis=0)

    # Invert intensities: larger values become darker pixels.
    inverted = 255 - bottom_up

    picture = Image.fromarray(inverted)
    picture.save(out)

In [144]:
def make_spects(input_dir, output_dir, num_files=100, shuffle=True):
    """Convert up to ``num_files`` mp3 files from ``input_dir`` into spectrogram JPEGs.

    Args:
        input_dir: Directory containing ``.mp3`` files.
        output_dir: Directory the ``.jpg`` spectrograms are written to;
            created if missing.
        num_files: Maximum number of mp3 files to convert.
        shuffle: If true, pick the files in random order (uses the global
            NumPy random state); otherwise take them in sorted name order.

    Returns:
        None. If ``input_dir`` is not an existing directory a message is
        printed and nothing is created.
    """
    # isdir rather than exists: a regular file at this path cannot be listed.
    if not os.path.isdir(input_dir):
        print("Input directory does not exist.")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Keep only mp3s BEFORE truncating so that num_files counts convertible
    # files, and sort because os.listdir returns entries in arbitrary order.
    files = sorted(name for name in os.listdir(input_dir) if name.endswith(".mp3"))
    if shuffle:
        np.random.shuffle(files)

    files = files[:num_files]

    for file in tqdm(files):
        audio_file = os.path.join(input_dir, file)
        img, fname = mp3_to_img(path=audio_file, height=192, width=192)
        save_spec(img, os.path.join(output_dir, fname) + ".jpg")


# Each entry is substituted into the input and output directory names below.
langs = [
    "ar",
    "de",
    "en",
    "es",
    "fr",
    "hi",
    "it",
    "ja",
    "ru",
    "zh-CN",
]

# Convert every listed folder in turn; make_spects prints a notice and
# returns when an input folder is absent.
for lang in langs:
    input_dir = f"data/mp3/{lang}_train_0/"
    output_dir = f"data/spectrogram/{lang}_train/"
    print(f"Converting mp3 files in `{input_dir}` to spectrograms in `{output_dir}`")
    make_spects(
        input_dir=input_dir,
        output_dir=output_dir,
        num_files=2000,
        shuffle=False,
    )

Converting mp3 files in `data/mp3/en_train_0/` to spectrograms in `data/spectrogram/en_train/`


100%|██████████| 1000/1000 [01:11<00:00, 13.90it/s]


Converting mp3 files in `data/mp3/zh-CN_train_0/` to spectrograms in `data/spectrogram/zh-CN_train/`


100%|██████████| 1000/1000 [01:07<00:00, 14.90it/s]
