In [2]:
import os
import shutil
import torch
import torchaudio
import torchvision.transforms.v2 as transforms
import numpy as np

from torchaudio.transforms import *
from fastai.vision.all import Image

In [19]:
# Where the original GTZAN audio lives and where the spectrogram images go.
GTZAN_ORIGINAL_DIR = "/content/drive/MyDrive/Colab Notebooks/GSN/genres_original"
GTZAN_SPECTROGRAMS_DIR = "gtzan_spectrograms"

# Transforms applied in order to each waveform to produce a spectrogram.
SPECTROGRAM_TRANSFORMS = [
    MelSpectrogram(
        sample_rate=22050,
        n_fft=1024,
        f_min=0,
        f_max=10000,
        n_mels=128,
    ),
    AmplitudeToDB(),
]

# Placeholder for augmentation transforms; currently empty.
AUGMENTATION_TRANSFORMS = []

# Number of equal-length pieces each audio clip is cut into.
AUDIO_SLICES = 3

In [20]:
def create_spectrogram_image(mel_spec_transformer, wave: torch.Tensor) -> np.ndarray:
  """Convert an audio tensor into an 8-bit grayscale spectrogram array.

  The waveform is passed through `mel_spec_transformer`, singleton
  dimensions are squeezed away, and the values are min-max scaled to the
  0..255 range before being truncated to `uint8`.

  Args:
    mel_spec_transformer: Callable mapping the waveform tensor to a
      spectrogram tensor.
    wave: Audio samples to transform.

  Returns:
    A `uint8` NumPy array with the shape of the squeezed spectrogram. If
    every spectrogram value is identical (for example, a silent slice),
    an all-zero array is returned.
  """
  spectrogram = mel_spec_transformer(wave).squeeze().numpy()

  lowest = spectrogram.min()
  value_range = spectrogram.max() - lowest
  if value_range == 0:
    # A constant spectrogram would make the normalization compute 0/0 = NaN,
    # and casting NaN to uint8 is undefined; emit a blank image instead.
    return np.zeros(spectrogram.shape, dtype='uint8')

  spectrogram = (spectrogram - lowest) / value_range * 255
  return spectrogram.astype('uint8')

In [26]:
def save_spectrogram_as_image(spectrogram: np.ndarray, filename: str):
  """Build an image from the spectrogram array and save it to `filename`."""
  image = Image.fromarray(spectrogram)
  image.save(filename)

In [22]:
def load_audio_and_cut_length(filename: str, max_seconds: float = 30) -> torch.Tensor:
  """Load an audio file and keep at most its first `max_seconds` seconds.

  Args:
    filename: Path of the audio file to load with `torchaudio.load`.
    max_seconds: Maximum duration to keep, in seconds. Defaults to 30,
      the cut length this function always used before the parameter existed.

  Returns:
    The loaded waveform truncated along its second dimension to at most
    `sample_rate * max_seconds` samples. Shorter recordings are returned
    unpadded.
  """
  wave, sample_rate = torchaudio.load(filename)
  # int() keeps the slice bound integral when max_seconds is fractional.
  max_samples = int(sample_rate * max_seconds)
  return wave[:, :max_samples]

In [23]:
def slice_audio(wave: torch.Tensor, num_slices: int) -> list[torch.Tensor]:
  """Cut a waveform into `num_slices` consecutive pieces of equal length.

  Args:
    wave: Tensor whose second dimension is the sample axis.
    num_slices: Number of pieces to produce. Must be 1, 3, 5, 6 or 10.

  Returns:
    A list of `num_slices` tensors, each `wave.shape[1] // num_slices`
    samples long. With `num_slices == 1` the list holds `wave` itself.
    Samples left over after the integer division are dropped.

  Raises:
    ValueError: If `num_slices` is not one of the supported values.
  """
  if num_slices == 1:
    return [wave]

  # These counts divide 661500 samples (30 s at 22050 Hz) without remainder.
  possible_even_slices = [3, 5, 6, 10]

  if num_slices not in possible_even_slices:
    # Build the message from the real whitelist so the two cannot drift apart.
    allowed = ", ".join(str(count) for count in [1] + possible_even_slices)
    raise ValueError(f"num_slices must be one of: {allowed}")

  hop = wave.shape[1] // num_slices
  return [wave[:, i * hop : (i + 1) * hop] for i in range(num_slices)]

In [24]:
def create_file_name(filename: str, output_dir: str, apply_idx: bool, idx: int) -> str:
    """Build the output PNG path for a `genre.number.ext` audio file name.

    Args:
        filename: Source file name made of exactly three dot-separated parts.
        output_dir: Directory the resulting path is placed in.
        apply_idx: Whether to append `_<idx>` to the song number.
        idx: Slice index used when `apply_idx` is true.

    Returns:
        `output_dir` joined with `genre.number[_idx].png`.

    Raises:
        ValueError: If `filename` does not split into exactly three parts.
    """
    genre, song_number, _ = filename.split(".")
    suffix = f"_{idx}" if apply_idx else ""
    png_name = f"{genre}.{song_number}{suffix}.png"
    return os.path.join(output_dir, png_name)

In [25]:
# Remove the corrupted file from the original dataset so it is never loaded.
corrupted_file = os.path.join(GTZAN_ORIGINAL_DIR, "jazz", "jazz.00054.wav")
if os.path.isfile(corrupted_file):
  os.remove(corrupted_file)

# Start from an empty output directory so stale images from a previous run
# (for example with a different AUDIO_SLICES value) do not end up in the archive.
if os.path.isdir(GTZAN_SPECTROGRAMS_DIR):
  shutil.rmtree(GTZAN_SPECTROGRAMS_DIR)

os.mkdir(GTZAN_SPECTROGRAMS_DIR)

mel_spec_transformer = transforms.Compose(SPECTROGRAM_TRANSFORMS)
# The slice index is only added to file names when a clip yields several images.
apply_idx_to_file = AUDIO_SLICES > 1

for genre in os.listdir(GTZAN_ORIGINAL_DIR):
  genre_original_dir = os.path.join(GTZAN_ORIGINAL_DIR, genre)
  if not os.path.isdir(genre_original_dir):
    # Stray files in the dataset root are not genres; listing them would crash.
    continue

  spectrogram_genre_dir = os.path.join(GTZAN_SPECTROGRAMS_DIR, genre)
  os.mkdir(spectrogram_genre_dir)

  print(genre_original_dir + " -> " + spectrogram_genre_dir)

  for file in os.listdir(genre_original_dir):
    audio_path = os.path.join(genre_original_dir, file)
    if not (os.path.isfile(audio_path) and file.lower().endswith(".wav")):
      # Skip sub-directories and non-wav entries (e.g. checkpoint folders or
      # hidden files) that cannot be decoded or named like a track.
      continue

    wave = load_audio_and_cut_length(audio_path)
    waves = slice_audio(wave, AUDIO_SLICES)

    for idx, w in enumerate(waves):
      spectrogram = create_spectrogram_image(mel_spec_transformer, w)
      save_spectrogram_as_image(spectrogram, create_file_name(file, spectrogram_genre_dir, apply_idx_to_file, idx))

# Pack every generated image into <GTZAN_SPECTROGRAMS_DIR>.zip; the archive
# path is the cell's displayed result.
shutil.make_archive(GTZAN_SPECTROGRAMS_DIR, 'zip', GTZAN_SPECTROGRAMS_DIR)

/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/classical -> gtzan_spectrograms/classical
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/rock -> gtzan_spectrograms/rock
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/jazz -> gtzan_spectrograms/jazz
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/metal -> gtzan_spectrograms/metal
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/country -> gtzan_spectrograms/country
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/reggae -> gtzan_spectrograms/reggae
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/hiphop -> gtzan_spectrograms/hiphop
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/blues -> gtzan_spectrograms/blues
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/pop -> gtzan_spectrograms/pop
/content/drive/MyDrive/Colab Notebooks/GSN/genres_original/disco -> gtzan_spectrograms/disco


'/content/gtzan_spectrograms.zip'

In [27]:
# Clone the MusicNet repository into the current working directory.
! git clone https://github.com/xpaf/MusicNet

Cloning into 'MusicNet'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 7 (delta 0), reused 7 (delta 0), pack-reused 0[K
Receiving objects: 100% (7/7), done.
