In [2]:
import os
from pathlib import Path
import requests
import tarfile
from tqdm import tqdm
import json
import numpy as np
import shutil

# Select which dataset you want to download, define seed and split ratio

In [2]:
dataset = "valid" # train, valid or test
np.random.seed(42)
split_ratio = 0.7

In [3]:
path = Path("dataset/nsynth-" + dataset)
if path.is_dir() and (path / "examples.json").is_file():
    print("Dataset already downloaded/extracted. Attempting to create train-test split.")
else:
    dlpath = Path(str(path.absolute()) + ".jsonwav.tar.gz")
    if not dlpath.is_file():
        print("Downloading dataset ({}).".format(dataset))
        url = "http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-{}.jsonwav.tar.gz".format(dataset)
        print(url)
        r = requests.get(url, stream=True)
        total_size = int(r.headers.get('content-length', 0))
        block_size = 1024
        t = tqdm(total=total_size, unit='iB', unit_scale=True)
        with open(str(path.absolute()) + ".jsonwav.tar.gz", 'wb') as f:
            for data in r.iter_content(block_size):
                t.update(len(data))
                f.write(data)
        t.close()
        if total_size != 0 and t.n != total_size:
            print("Something went wrong with the download. If you download it manually into the dataset folder things will still work.")
    print("Extracting...")
    tar = tarfile.open(dlpath, 'r:gz')
    tar.extractall("dataset/")
    print("Creating train-test split. You may now delete the .tar.gz archives.")
    
with open(path / "examples.json") as f:
    jsondict = json.load(f)
    
keys = np.array(list(jsondict.keys()))
splits = [i.split('-') for i in keys]
instrument_str = [i[0] for i in splits]

# Get unique instruments in dataset
unique_inst = list(set(instrument_str))

# Make list of numpy arrays with all strings belonging to one instrument
instruments = []
for inst in unique_inst:
    instruments.append(keys[np.array(instrument_str) == inst])

# Create tr/te lists with random permutation (according to seed)
train = []
test = []
for inst in instruments:
    r = np.random.permutation(len(inst))
    i = int(len(inst) * split_ratio)
    train.extend(inst[r[:i]])
    test.extend(inst[r[i:]])

# Dict comprehension to get train-test split according to 
traind = {k:jsondict[k] for k in train}
testd = {k:jsondict[k] for k in test}

# Write new json files
with open(path / "train.json", "w") as f:
    json.dump(traind, f, indent=2)
with open(path / "test.json", "w") as f:
    json.dump(testd, f, indent=2)
    
# Create directories and copy .wav files
os.makedirs(path / "train", exist_ok=True)
os.makedirs(path / "test", exist_ok=True)
trainw = [x + ".wav" for x in train]
testw = [x + ".wav" for x in test]
for file in trainw:
    shutil.move(path / "audio" / file, path / "train" / file)
for file in testw:
    shutil.move(path / "audio" / file, path / "test" / file)
    
print("Successfully split the dataset. The folder 'audio' should now be empty. To reverse, just move all files from train/test folders back into audio and remove train/test(.json).")

Downloading dataset (valid).
http://download.magenta.tensorflow.org/datasets/nsynth/nsynth-valid.jsonwav.tar.gz


100%|██████████| 1.07G/1.07G [04:21<00:00, 4.09MiB/s]


Extracting...
Creating train-test split. You may now delete the .tar.gz archives.
Successfully split the dataset. The folder 'audio' should now be empty. To reverse, just move all files from train/test folders back into audio and remove train/test(.json).


In [3]:
dataset = "valid" # train, valid or test
np.random.seed(42)
split_ratio = 0.7
path = Path("dataset/nsynth-" + dataset)


In [4]:
with open(path / "examples.json") as f:
    jsondict = json.load(f)
    
keys = np.array(list(jsondict.keys()))
splits = [i.split('-') for i in keys]
instrument_str = [i[0] for i in splits]

# Get unique instruments in dataset
unique_inst = list(set(instrument_str))

# Make list of numpy arrays with all strings belonging to one instrument
instruments = []
for inst in unique_inst:
    instruments.append(keys[np.array(instrument_str) == inst])

# Create tr/te lists with random permutation (according to seed)
train = []
test = []
for inst in instruments:
    r = np.random.permutation(len(inst))
    i = int(len(inst) * split_ratio)
    train.extend(inst[r[:i]])
    test.extend(inst[r[i:]])

# Dict comprehension to get train-test split according to 
traind = {k:jsondict[k] for k in train}
testd = {k:jsondict[k] for k in test}

In [7]:
list(traind.keys())

['guitar_acoustic_030-039-075',
 'guitar_acoustic_030-026-127',
 'guitar_acoustic_030-023-050',
 'guitar_acoustic_030-028-050',
 'guitar_acoustic_030-094-100',
 'guitar_acoustic_030-072-025',
 'guitar_acoustic_030-086-127',
 'guitar_acoustic_030-050-025',
 'guitar_acoustic_030-043-050',
 'guitar_acoustic_030-097-025',
 'guitar_acoustic_030-041-075',
 'guitar_acoustic_030-075-127',
 'guitar_acoustic_030-045-127',
 'guitar_acoustic_030-050-127',
 'guitar_acoustic_030-064-127',
 'guitar_acoustic_030-107-025',
 'guitar_acoustic_030-075-025',
 'guitar_acoustic_030-034-100',
 'guitar_acoustic_030-103-025',
 'guitar_acoustic_030-069-127',
 'guitar_acoustic_030-102-075',
 'guitar_acoustic_030-043-025',
 'guitar_acoustic_030-057-075',
 'guitar_acoustic_030-055-050',
 'guitar_acoustic_030-091-025',
 'guitar_acoustic_030-080-100',
 'guitar_acoustic_030-093-050',
 'guitar_acoustic_030-058-075',
 'guitar_acoustic_030-094-050',
 'guitar_acoustic_030-038-025',
 'guitar_acoustic_030-083-025',
 'guitar