In [1]:
import pandas as pd
from glob import glob
import os
import compress_json

In [2]:
from dotenv import load_dotenv

def _require_env(name: str) -> str:
    """Return the value of environment variable `name`, failing loudly when it is unset."""
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f"Environment variable {name} is not set (expected in ../.env)")
    return value


# Pull the chunking settings from the project-level .env file into the environment.
load_dotenv("../.env")

# Fail fast on missing settings: previously a missing SAMPLE_RATE was silently stored
# as None in the dataset, and a missing CHUNK_* value surfaced as an opaque TypeError.
# NOTE(review): sample_rate is kept as the raw string read from the environment so the
# saved "sample_rate" column is unchanged; cast to int if consumers expect a number.
sample_rate = _require_env("SAMPLE_RATE")
chunk_duration = int(_require_env("CHUNK_DURATION"))
chunk_stride_factor = float(_require_env("CHUNK_STRIDE_FACTOR"))

dataset_name = "gtzan"
# Destination of the compiled chunk table, relative to the notebook's working directory.
output_dataset_path = os.path.join("compiled", dataset_name, "dataset.parquet")

## 1. Locate all files eligible for modelling

In [3]:

# Glob patterns, one per audio library; each genre folder holds a description.json.
patterns = [
    os.path.join("/mnt/d/sonus_library/gtzan", "*/description.json"),
    #os.path.join("/mnt/e/sonus_library/youtube", "*/description.json"),
    #os.path.join("/mnt/e/sonus_library/raw_beatport", "*/description.json"),
]

# glob() returns matches in a filesystem-dependent order. Sort them so the row order
# of the DataFrame built from these files (and therefore the seeded train/test split)
# is identical on every machine.
files = []
for pattern in patterns:
    files.extend(sorted(glob(pattern)))

In [4]:
# Display the matched description files (expected: one per genre folder — TODO confirm).
files

['/mnt/d/sonus_library/gtzan/blues/description.json',
 '/mnt/d/sonus_library/gtzan/classical/description.json',
 '/mnt/d/sonus_library/gtzan/country/description.json',
 '/mnt/d/sonus_library/gtzan/disco/description.json',
 '/mnt/d/sonus_library/gtzan/hiphop/description.json',
 '/mnt/d/sonus_library/gtzan/jazz/description.json',
 '/mnt/d/sonus_library/gtzan/metal/description.json',
 '/mnt/d/sonus_library/gtzan/pop/description.json',
 '/mnt/d/sonus_library/gtzan/reggae/description.json',
 '/mnt/d/sonus_library/gtzan/rock/description.json']

In [5]:
def load_description(file_path:str):
    """Load one description file and enrich each of its records in place.

    Every record gains two keys:
      * "main_genre" - the first entry of the record's "genres" list
      * "file_path"  - the record's "filename" joined onto the folder that
                       contains the description file

    Returns the loaded collection itself (the records are mutated, not copied).
    """
    parent_dir, _ = os.path.split(file_path)
    records = compress_json.load(file_path)
    for record in records:
        audio_name = record["filename"]
        record["main_genre"] = record["genres"][0]
        record["file_path"] = os.path.join(parent_dir, audio_name)
    return records

In [6]:
# Load every description file and flatten the per-file records into a single list,
# so that each row of the DataFrame corresponds to one record.
descriptions = [record for path in files for record in load_description(path)]
df = pd.DataFrame(descriptions)
df.head()

Unnamed: 0,genres,stems,moods,filename,bpm,key,duration,origin,main_genre,file_path
0,[blues],[],[],blues.00000.wav,,,30,gtzan,blues,/mnt/d/sonus_library/gtzan/blues/blues.00000.wav
1,[blues],[],[],blues.00001.wav,,,30,gtzan,blues,/mnt/d/sonus_library/gtzan/blues/blues.00001.wav
2,[blues],[],[],blues.00002.wav,,,30,gtzan,blues,/mnt/d/sonus_library/gtzan/blues/blues.00002.wav
3,[blues],[],[],blues.00003.wav,,,30,gtzan,blues,/mnt/d/sonus_library/gtzan/blues/blues.00003.wav
4,[blues],[],[],blues.00004.wav,,,30,gtzan,blues,/mnt/d/sonus_library/gtzan/blues/blues.00004.wav


## 2. Split tracks into train/test sets

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on main_genre so both subsets keep the
# same genre proportions; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['main_genre'], random_state=42)
# .assign returns new frames instead of writing into the frames returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
df_train = df_train.assign(mode="train")
df_test = df_test.assign(mode="test")
# Recombine into one frame; the "mode" column now records each row's subset.
df = pd.concat([df_train, df_test])

In [8]:
# Sanity-check the split sizes: test_size=0.2 should give roughly a 4:1 train/test ratio.
df["mode"].value_counts()

mode
train    799
test     200
Name: count, dtype: int64

## 3. Split audio into chunks

In [9]:
def get_chunk_endpoints(track_duration:int, chunk_duration:int, stride_factor:float=1, allow_partial:bool=False) ->list[tuple]:
    """Split a track into fixed-length ``(start, end)`` chunk intervals.

    Args:
        track_duration: length of the track; truncated to an int.
        chunk_duration: length of every chunk, same unit; truncated to an int.
        stride_factor: step between consecutive chunk starts, as a fraction of
            ``chunk_duration`` (1 = back-to-back chunks, < 1 = overlapping).
            The resulting stride is truncated to an int.
        allow_partial: when False (default) only chunks that fit entirely inside
            the track are returned. When True, the legacy behaviour is kept: a
            chunk starts at every stride below ``track_duration``, so trailing
            chunks may end past the end of the track.

    Returns:
        A list of ``(start, end)`` tuples with ``end - start == chunk_duration``.
        Empty when no chunk fits (e.g. the track is shorter than one chunk).

    Raises:
        ValueError: if the integer stride is not positive (e.g. a stride factor
            so small that ``int(chunk_duration * stride_factor)`` is 0).
    """
    track_duration = int(track_duration)
    chunk_duration = int(chunk_duration)
    stride = int(chunk_duration * stride_factor)
    if stride <= 0:
        raise ValueError(
            f"stride must be a positive integer, got {stride} "
            f"(chunk_duration={chunk_duration}, stride_factor={stride_factor})"
        )

    # Exclusive upper bound for chunk starts. Without partial chunks the last valid
    # start is track_duration - chunk_duration, so that end <= track_duration.
    if allow_partial:
        start_limit = track_duration
    else:
        start_limit = track_duration - chunk_duration + 1

    return [(start, start + chunk_duration) for start in range(0, start_limit, stride)]


# Assign a unique identifier to each track (one group per distinct file path).
df["track_id"] = df.groupby(by=["file_path"]).ngroup()
assert not df["track_id"].isna().any()

# Split the tracks into chunks (with or without overlap, depending on the stride factor).
# This must run before "duration" is overwritten below, since it reads the track length.
df["endpoints"] = df.apply(
    lambda row: get_chunk_endpoints(row["duration"], chunk_duration, chunk_stride_factor),
    axis=1,
)
# From here on every row describes a chunk, so "duration" becomes the chunk length.
df["duration"] = chunk_duration
df["sample_rate"] = sample_rate

# One row per chunk.
df = df.explode("endpoints")
# explode() turns an empty endpoint list into a single row with NaN endpoints;
# drop those rows so tracks that yield no chunk never receive an h5_index.
df = df.dropna(subset=["endpoints"])
# reset_index() without drop keeps the pre-explode row label as an "index" column.
df = df.reset_index()
df["h5_index"] = df.index

In [10]:
# Preview the exploded frame: one row per chunk, with "endpoints" holding the
# (start, end) tuple and "h5_index" mirroring the row position.
df.head()

Unnamed: 0,index,genres,stems,moods,filename,bpm,key,duration,origin,main_genre,file_path,mode,track_id,endpoints,sample_rate,h5_index
0,540,[jazz],[],[],jazz.00040.wav,,,5,gtzan,jazz,/mnt/d/sonus_library/gtzan/jazz/jazz.00040.wav,train,540,"(0, 5)",16000,0
1,540,[jazz],[],[],jazz.00040.wav,,,5,gtzan,jazz,/mnt/d/sonus_library/gtzan/jazz/jazz.00040.wav,train,540,"(2, 7)",16000,1
2,540,[jazz],[],[],jazz.00040.wav,,,5,gtzan,jazz,/mnt/d/sonus_library/gtzan/jazz/jazz.00040.wav,train,540,"(4, 9)",16000,2
3,540,[jazz],[],[],jazz.00040.wav,,,5,gtzan,jazz,/mnt/d/sonus_library/gtzan/jazz/jazz.00040.wav,train,540,"(6, 11)",16000,3
4,540,[jazz],[],[],jazz.00040.wav,,,5,gtzan,jazz,/mnt/d/sonus_library/gtzan/jazz/jazz.00040.wav,train,540,"(8, 13)",16000,4


In [11]:
# Report every h5_index value that does not occur exactly once
# (prints nothing when all values are unique).
h5_index_counts = df["h5_index"].value_counts()
for h5_index in h5_index_counts[h5_index_counts != 1].index:
    print(h5_index)

## 4. Save dataset & model config

In [12]:
# Make sure the destination folder exists, then write the chunk table as parquet.
dataset_dir = os.path.split(output_dataset_path)[0]
os.makedirs(dataset_dir, exist_ok=True)
df.to_parquet(output_dataset_path)