In [1]:
import os

import numpy as np
import pandas as pd

from utils import DS_PATH

%matplotlib inline

# Lift pandas' display truncation so frames and long cell values are rendered in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# Track metadata (track_id, split, genre) lives in a CSV under the dataset directory.
TRACKS_PATH = os.path.join(DS_PATH, 'tracks_filtered_small.csv')
tracks = pd.read_csv(TRACKS_PATH)
# Preview the first ten rows.
tracks.iloc[:10]

Unnamed: 0,track_id,split,genre
0,2,training,Hip-Hop
1,5,training,Hip-Hop
2,10,training,Pop
3,140,training,Folk
4,141,training,Folk
5,148,validation,Experimental
6,182,test,Rock
7,190,training,Folk
8,193,training,Folk
9,194,training,Folk


In [3]:
AUDIO_PATH = os.path.join(os.getcwd(), 'fma_small')

# Collect the numeric id of every audio file stored as AUDIO_PATH/<subdir>/<zero-padded id>.mp3.
viable_tracks_ids = []
for dir_el in os.listdir(AUDIO_PATH):
    dir_path = os.path.join(AUDIO_PATH, dir_el)
    # Loose files next to the audio folders (e.g. 'checksums', 'README.txt') hold no tracks.
    if not os.path.isdir(dir_path):
        continue
    for track_filename in os.listdir(dir_path):
        stem, extension = os.path.splitext(track_filename)
        # Ignore anything that is not "<digits>.mp3" (hidden files, partial downloads, ...).
        if extension.lower() != '.mp3' or not stem.isdecimal():
            continue
        # int() accepts leading zeros, so '000002' -> 2 and '000000' -> 0 without manual stripping.
        viable_tracks_ids.append(int(stem))
viable_tracks_ids

[255,
 704,
 2,
 5,
 10,
 140,
 141,
 148,
 182,
 190,
 193,
 194,
 197,
 200,
 203,
 204,
 207,
 210,
 211,
 212,
 213,
 256,
 368,
 424,
 459,
 534,
 540,
 546,
 574,
 602,
 615,
 620,
 621,
 625,
 666,
 667,
 676,
 690,
 694,
 695,
 705,
 706,
 707,
 708,
 709,
 714,
 715,
 716,
 718,
 777,
 814,
 821,
 822,
 825,
 853,
 890,
 892,
 897,
 993,
 995,
 997,
 998,
 1278,
 1686,
 1039,
 1040,
 1066,
 1069,
 1073,
 1075,
 1082,
 1083,
 1087,
 1102,
 1193,
 1195,
 1196,
 1197,
 1249,
 1259,
 1270,
 1276,
 1277,
 1417,
 1427,
 1443,
 1482,
 1510,
 1544,
 1642,
 1644,
 1649,
 1661,
 1663,
 1666,
 1673,
 1680,
 1681,
 1682,
 1683,
 1684,
 1685,
 1687,
 1688,
 1689,
 1701,
 1702,
 1703,
 1704,
 1706,
 1720,
 1732,
 1733,
 1735,
 1736,
 1883,
 1891,
 1893,
 1924,
 1925,
 1929,
 1930,
 2012,
 2096,
 2097,
 2099,
 3707,
 3263,
 3264,
 3265,
 3266,
 3270,
 3271,
 3272,
 3273,
 3274,
 3492,
 3532,
 3533,
 3534,
 3535,
 3537,
 3538,
 3573,
 3598,
 3624,
 3708,
 3720,
 3721,
 3722,
 3724,
 3725,
 37

In [4]:
# Index the metadata by track id so rows can be selected with .loc[track_id].
tracks = tracks.set_index('track_id')

In [5]:
# Keep only the metadata rows whose audio file was found on disk.
# Set intersection is commutative, so comparing A ∩ B with B ∩ A verifies nothing;
# that tautological assert is therefore not repeated here.
viable_tracks_ids = list(set(viable_tracks_ids).intersection(tracks.index))
print(tracks.shape)  # shape before filtering
tracks = tracks.loc[viable_tracks_ids]
tracks.shape  # shape after filtering

(8000, 2)


(8000, 2)

In [6]:
# Rows now follow the order of `viable_tracks_ids` (built from a set), not the CSV order.
tracks.iloc[:10]

Unnamed: 0_level_0,split,genre
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,training,Hip-Hop
5,training,Hip-Hop
10,training,Pop
32800,training,Folk
98339,training,International
98346,test,International
98348,test,International
98349,test,International
65619,training,Rock
131166,training,Rock


In [7]:
# Every remaining track must have an audio file. Index.isin does the membership test in one
# vectorised pass instead of scanning the `viable_tracks_ids` list once per row.
assert tracks.index.isin(viable_tracks_ids).all()

In [8]:
# Move track_id back into a regular column and return to a 0..n-1 positional index.
tracks = tracks.reset_index()
tracks.iloc[:1]

Unnamed: 0,track_id,split,genre
0,2,training,Hip-Hop


In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace genre names with integer codes; fit_transform learns the sorted unique genres
# and encodes the column in a single call.
encoder = LabelEncoder()
tracks['genre'] = encoder.fit_transform(tracks['genre'])
# Spot-check a few encoded values.
tracks['genre'].sample(5)

671     6
954     2
905     5
5060    2
4120    3
Name: genre, dtype: int64

In [10]:
# Decode the first five codes back to genre names to confirm the encoding round-trips.
encoder.inverse_transform(tracks['genre'].iloc[:5])

array(['Hip-Hop', 'Hip-Hop', 'Pop', 'Folk', 'International'], dtype=object)

In [11]:
# Default `split_duration` of create_spectrograms_for_set, which forwards it to
# generate_track_melspectrograms; presumably the length in seconds of each segment — TODO confirm.
SPLIT_DURATION = 3.0
# NOTE(review): not referenced by any other visible cell; verify whether it is still needed.
REPOPULATE = True

In [12]:
import multiprocessing as mp
from tqdm.notebook import tqdm
from spectrograms import load_audio_file, generate_track_melspectrograms, pickle_spectrograms_set
tqdm.pandas()


def _pad_spectrogram(spectrogram: np.ndarray, target_shape: tuple[int, ...]) -> np.ndarray:
    """Grow ``spectrogram`` to ``target_shape`` by padding at the end of each axis.

    The existing values keep their positions; the new cells are filled with the
    spectrogram's own minimum, i.e. the quietest value it already contains.
    """
    pad_widths = [(0, target - current) for current, target in zip(spectrogram.shape, target_shape)]
    fill_value = spectrogram.min() if spectrogram.size else 0
    return np.pad(spectrogram, pad_widths, mode='constant', constant_values=fill_value)


def create_spectrograms_for_set(set_tracks_ids: pd.Index, split_duration: float = SPLIT_DURATION, max_shape: tuple[int, int] = (0, 0)) -> tuple[np.ndarray, np.ndarray, tuple[int, int]]:
    """Build spectrogram samples and genre labels for the selected rows of ``tracks``.

    Each track's audio is loaded in this process with ``load_audio_file`` and handed to a
    worker pool running ``generate_track_melspectrograms``. Every spectrogram returned for
    a track becomes one sample labelled with that row's ``genre`` value. Tracks whose audio
    or spectrograms are not returned as ``np.ndarray`` are skipped. Samples keep the order
    of ``set_tracks_ids``.

    Args:
        set_tracks_ids: Index labels of the module-level ``tracks`` frame to process.
        split_duration: Forwarded unchanged to ``generate_track_melspectrograms``
            (presumably the segment length in seconds — TODO confirm).
        max_shape: Minimum common sample shape; it grows element-wise to cover every
            spectrogram seen. Spectrograms are assumed to be 2-D, as the annotation says.

    Returns:
        ``(X, y, max_shape)``: the float16 sample array, the int8 label array and the
        final common shape.
    """
    from collections import deque

    X = []
    y = []
    set_tracks = tracks.loc[set_tracks_ids, 'track_id']

    # cpu_count() may return None or 1; a pool needs at least one worker.
    n_workers = max(1, (os.cpu_count() or 2) - 1)
    # Bound the number of in-flight tracks so decoded audio does not pile up in memory.
    max_pending = 2 * n_workers
    pending = deque()

    def collect_oldest() -> None:
        """Wait for the oldest submitted track and append its samples and labels."""
        nonlocal max_shape
        idx, async_result = pending.popleft()
        track_spectrograms = async_result.get()
        if not isinstance(track_spectrograms, np.ndarray):
            return
        for track_spectrogram in track_spectrograms:
            # Element-wise maximum: a plain tuple comparison is lexicographic and could pick
            # a shape that is smaller than some sample along the second axis.
            max_shape = tuple(max(known, seen) for known, seen in zip(max_shape, track_spectrogram.shape))
            X.append(track_spectrogram)
        y.extend([tracks.loc[idx, 'genre']] * len(track_spectrograms))

    with mp.Pool(n_workers) as pool:
        for idx, track_id in tqdm(set_tracks.items(), total=len(set_tracks)):
            audio, sampling_rate = load_audio_file(track_id)
            if not isinstance(audio, np.ndarray):
                print(f'{track_id=} cannot be loaded')
                continue

            # apply_async returns immediately, so workers compute spectrograms while the
            # next file is being loaded (pool.apply would block on every single track).
            pending.append((idx, pool.apply_async(generate_track_melspectrograms, args=(audio, sampling_rate, split_duration,))))
            if len(pending) >= max_pending:
                collect_oldest()

        # Results are collected first-in-first-out, which preserves the track order.
        while pending:
            collect_oldest()

    # Bring every sample to the common shape. np.resize would refill the array in flattened
    # order and shift rows against each other, so the samples are padded instead.
    for position, track_spectrogram in enumerate(X):
        if track_spectrogram.shape != max_shape:
            X[position] = _pad_spectrogram(track_spectrogram, max_shape)

    return np.array(X, dtype=np.float16), np.array(y, dtype=np.int8), max_shape

In [13]:
# Build the spectrogram samples (X) and integer genre codes (y) for every row of `tracks`.
# This call loads the audio of each track, so it is the long-running step of the notebook.
X, y, max_shape = create_spectrograms_for_set(tracks.index)
X.shape, y.shape

  0%|          | 0/8000 [00:00<?, ?it/s]

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  return librosa.load(file_path, sr=SAMPLING_RATE, mono=True)
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).
  return librosa.load(file_path, sr=SAMPLING_RATE, mono=True)
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing r

track_id=99134 cannot be loaded


  return librosa.load(file_path, sr=SAMPLING_RATE, mono=True)


track_id=133297 cannot be loaded


  return librosa.load(file_path, sr=SAMPLING_RATE, mono=True)


track_id=108925 cannot be loaded


[src/libmpg123/layer3.c:INT123_do_layer3():1841] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (3360) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1771] error: part2_3_length (3328) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!


((75252, 128, 130), (75252,))

In [14]:
# First five metadata rows, for comparison with the first labels in `y`.
tracks.head(5)

Unnamed: 0,track_id,split,genre
0,2,training,3
1,5,training,3
2,10,training,6
3,32800,training,2
4,98339,training,5


In [15]:
# Genre names of the first five tracks, to compare with the decoded labels of `y` below.
encoder.inverse_transform(tracks['genre'].iloc[:5])

array(['Hip-Hop', 'Hip-Hop', 'Pop', 'Folk', 'International'], dtype=object)

In [16]:
# Every 10th label: a quick look at how the per-segment labels are ordered.
y[::10]

array([3, 3, 6, ..., 5, 5, 5], dtype=int8)

In [17]:
# Decode the first 50 labels to check that they come in consecutive runs, one run per track.
encoder.inverse_transform(y[:50])

array(['Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop',
       'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop',
       'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop', 'Hip-Hop',
       'Hip-Hop', 'Pop', 'Pop', 'Pop', 'Pop', 'Pop', 'Pop', 'Pop', 'Pop',
       'Pop', 'Folk', 'Folk', 'Folk', 'Folk', 'Folk', 'Folk', 'Folk',
       'Folk', 'Folk', 'Folk', 'International', 'International',
       'International', 'International', 'International', 'International',
       'International', 'International', 'International', 'International',
       'International', 'International'], dtype=object)

In [18]:
# From here on `y` holds genre names instead of integer codes.
# NOTE(review): `y` is rebound in place, so re-running this cell alone would pass the
# already-decoded names to inverse_transform.
y = encoder.inverse_transform(y)
y[:50:10]

array(['Hip-Hop', 'Hip-Hop', 'Pop', 'Folk', 'International'], dtype=object)

In [19]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test membership is identical on every run.
SPLIT_SEED = 42

# 70% train; the held-out 30% is then divided 67/33 into validation (~20%) and test (~10%).
# NOTE(review): the split is made per spectrogram segment, so segments cut from the same
# track can land in different subsets, and the `split` column of `tracks` is not used —
# confirm that this is intended.
train_tracks, val_tracks, train_labels, val_labels = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED)
val_tracks, test_tracks, val_labels, test_labels = train_test_split(val_tracks, val_labels, test_size=0.33, stratify=val_labels, random_state=SPLIT_SEED)
train_tracks.shape, train_labels.shape, val_tracks.shape, val_labels.shape, test_tracks.shape, test_labels.shape

((52676, 128, 130),
 (52676,),
 (15125, 128, 130),
 (15125,),
 (7451, 128, 130),
 (7451,))

In [20]:
from utils import draw_pie

# One pie chart per subset to confirm that stratification kept the genre proportions.
for set_labels in (train_labels, val_labels, test_labels):
    genre_frame = pd.DataFrame({'genre': set_labels})
    draw_pie(genre_frame, 'genre')

8 values left


8 values left


8 values left


In [21]:
# One-hot encode the training labels. get_dummies only creates columns for genres that occur
# in its input, so the result is aligned to encoder.classes_ to guarantee the same columns,
# in the same order, for every subset (a genre missing from this subset becomes all zeros).
train_labels = pd.get_dummies(train_labels, prefix='', prefix_sep='').reindex(columns=encoder.classes_, fill_value=0)
train_labels.head(10)

Unnamed: 0,Electronic,Experimental,Folk,Hip-Hop,Instrumental,International,Pop,Rock
0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,1,0
6,0,0,0,1,0,0,0,0
7,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,1,0
9,0,0,0,0,0,1,0,0


In [22]:
# Genres without a single positive example in the training labels (expected: none).
# Comparing against encoder.classes_ also reports a genre whose column is absent altogether,
# which a check over the existing columns alone can never detect.
pd.Index(encoder.classes_).difference(train_labels.columns[(train_labels != 0).any()])

Index([], dtype='object')

In [23]:
# Hand the training samples and their one-hot labels to pickle_spectrograms_set under the name
# 'train_small'; where and how they are stored is decided by that helper (not shown here).
pickle_spectrograms_set(train_tracks, train_labels, 'train_small')

In [24]:
# Drop the names of the training arrays now that they have been handed off for pickling.
del train_tracks, train_labels

In [25]:
# One-hot encode the validation labels. get_dummies only creates columns for genres that occur
# in its input, so the result is aligned to encoder.classes_ to guarantee the same columns,
# in the same order, for every subset (a genre missing from this subset becomes all zeros).
val_labels = pd.get_dummies(val_labels, prefix='', prefix_sep='').reindex(columns=encoder.classes_, fill_value=0)
val_labels.head(10)

Unnamed: 0,Electronic,Experimental,Folk,Hip-Hop,Instrumental,International,Pop,Rock
0,0,0,0,0,0,1,0,0
1,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1,0
5,0,0,0,1,0,0,0,0
6,0,0,0,0,0,1,0,0
7,1,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0
9,0,0,0,0,0,0,0,1


In [26]:
# Genres without a single positive example in the validation labels (expected: none).
# Comparing against encoder.classes_ also reports a genre whose column is absent altogether,
# which a check over the existing columns alone can never detect.
pd.Index(encoder.classes_).difference(val_labels.columns[(val_labels != 0).any()])

Index([], dtype='object')

In [27]:
# Hand the validation samples and their one-hot labels to pickle_spectrograms_set under the name
# 'validation_small'; where and how they are stored is decided by that helper (not shown here).
pickle_spectrograms_set(val_tracks, val_labels, 'validation_small')

In [28]:
# Drop the names of the validation arrays now that they have been handed off for pickling.
del val_tracks, val_labels

In [29]:
# One-hot encode the test labels. get_dummies only creates columns for genres that occur
# in its input, so the result is aligned to encoder.classes_ to guarantee the same columns,
# in the same order, for every subset (a genre missing from this subset becomes all zeros).
test_labels = pd.get_dummies(test_labels, prefix='', prefix_sep='').reindex(columns=encoder.classes_, fill_value=0)
test_labels.head(10)

Unnamed: 0,Electronic,Experimental,Folk,Hip-Hop,Instrumental,International,Pop,Rock
0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0
5,0,0,1,0,0,0,0,0
6,1,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0


In [30]:
# Genres without a single positive example in the test labels (expected: none).
# Comparing against encoder.classes_ also reports a genre whose column is absent altogether,
# which a check over the existing columns alone can never detect.
pd.Index(encoder.classes_).difference(test_labels.columns[(test_labels != 0).any()])

Index([], dtype='object')

In [31]:
# Hand the test samples and their one-hot labels to pickle_spectrograms_set under the name
# 'test_small'; where and how they are stored is decided by that helper (not shown here).
pickle_spectrograms_set(test_tracks, test_labels, 'test_small')

In [None]:
# Drop the names of the test arrays now that they have been handed off for pickling.
del test_tracks, test_labels