### Imports

In [1]:
import os
import glob
import pandas as pd
import shutil

#### Get list of files in breakbeat folder

In [2]:
# os.listdir returns entries in arbitrary order, so sort them to make
# file_list[0] and the row order of everything built from this list
# reproducible across runs and machines.
file_list = sorted(os.listdir('data/breakbeat'))

Split the first file name on `__` to see the fields it encodes: genre, track ID and spectrogram ID.

In [3]:
# Worked example on the first file: the name consists of '__'-separated
# fields, with the file extension attached to the third one.
split_file_name = file_list[0].split('__')
genre, track_id = split_file_name[0], split_file_name[1]
# Keep only the text before the first '.' (drops the extension).
spectro_id = split_file_name[2].partition('.')[0]

In [4]:
# Build one record per spectrogram by parsing each file name of the form
# '<genre>__<track_id>__<spectro_id>.<ext>'.
list_of_files = []
for f in file_list:
    split_f = f.split('__')
    if len(split_f) < 3:
        # Entries without three '__'-separated fields (e.g. hidden files or
        # checkpoint folders) are not spectrograms; indexing them below would
        # raise IndexError, so skip them.
        continue

    genre = split_f[0]
    track_id = split_f[1]
    # Keep only the text before the first '.' (drops the extension).
    spectro_id = split_f[2].partition('.')[0]
    file_name = f

    track_dict = {
        'file_name': file_name,
        'track_id': track_id,
        'genre': genre,
        'spectro_id': spectro_id}

    list_of_files.append(track_dict)

In [5]:
# Display the first parsed record to check the extracted fields.
list_of_files[0]

{'file_name': 'breakbeat__100035__000.png',
 'genre': 'breakbeat',
 'spectro_id': '000',
 'track_id': '100035'}

#### Move data to DataFrame

In [6]:
# Name the columns explicitly so the frame has the expected schema even when
# list_of_files is empty; without them an empty list yields a frame with no
# columns and any later lookup of 'track_id' raises KeyError.
df = pd.DataFrame(
    list_of_files,
    columns=['file_name', 'track_id', 'genre', 'spectro_id'],
)

In [7]:
# Show five randomly chosen rows as a sanity check (no seed is set, so the
# rows shown differ between runs).
df.sample(5)

Unnamed: 0,file_name,genre,spectro_id,track_id
17794,breakbeat__409478__022.png,breakbeat,22,409478
1724,breakbeat__15013__002.png,breakbeat,2,15013
5547,breakbeat__203475__006.png,breakbeat,6,203475
230,breakbeat__108357__012.png,breakbeat,12,108357
21953,breakbeat__98266__016.png,breakbeat,16,98266


#### Calculate the number of spectrograms for each unique track

In [8]:
# Count the spectrogram files belonging to each track and expose the result
# as a two-column frame: track_id, num_spectro.
num_files = (
    df.groupby('track_id')['file_name']
    .count()
    .reset_index(name='num_spectro')
)

In [9]:
# Preview the first rows of the per-track spectrogram counts.
num_files.head()

Unnamed: 0,track_id,num_spectro
0,100035,17
1,10052,23
2,10073,23
3,10077,23
4,102675,17


#### Work out the number of files needed in the train, validation and holdout folders

In [10]:
# Total number of spectrogram images, summed over all tracks.
total_num_files = num_files['num_spectro'].sum()

In [11]:
# Share of all spectrogram images assigned to each split.
train_pct = 0.65
validation_pct = 0.25
holdout_pct = 0.1

# The three shares must cover the whole dataset; fail fast on a typo rather
# than silently producing a differently sized split. A small tolerance absorbs
# floating-point rounding in the sum.
assert abs(train_pct + validation_pct + holdout_pct - 1.0) < 1e-9, \
    'train_pct + validation_pct + holdout_pct must equal 1'

In [12]:
# Image budgets for the train and validation splits. int() truncates, so each
# budget is rounded down to a whole number of images.
train_image_threshold = int(train_pct * total_num_files)
validation_image_threshold = int(validation_pct * total_num_files)

In [13]:
# Map each track_id to its spectrogram count by pairing the two num_files
# columns row by row.
tracks_dict = dict(zip(num_files['track_id'], num_files['num_spectro']))

#### Loop through tracks_dict and make a note of track IDs that will go into the train dataset

In [14]:
# Walk the tracks in dict order while keeping a running image total. A track
# (with all of its spectrograms) joins the training set only while the running
# total stays within the training image budget.
count = 0
train_ids = []
for track, n_spectro in tracks_dict.items():
    count += n_spectro
    if count > train_image_threshold:
        continue
    train_ids.append((track, n_spectro))

Then remove these track IDs from the dict...

In [15]:
# Drop every track already assigned to the training set so that only
# unassigned tracks remain in tracks_dict.
for assigned_track, n_spectro in train_ids:
    tracks_dict.pop(assigned_track, None)

#### Do the same for the validation set

In [16]:
# Walk the tracks currently in tracks_dict while keeping a running image
# total. A track joins the validation set only while the running total stays
# within the validation image budget.
count = 0
validation_ids = []
for track, n_spectro in tracks_dict.items():
    count += n_spectro
    if count > validation_image_threshold:
        continue
    validation_ids.append((track, n_spectro))

In [17]:
# Drop every track assigned to the validation set from tracks_dict.
for assigned_track, n_spectro in validation_ids:
    tracks_dict.pop(assigned_track, None)

#### Then move the rest into the holdout set

In [18]:
# Every track still present in tracks_dict goes to the holdout set, stored as
# (track_id, num_spectro) pairs like the other two lists.
holdout_ids = list(tracks_dict.items())

Number of tracks in each set...

In [19]:
# Report the number of tracks in each split, one per line:
# train, validation, holdout.
print(len(train_ids), len(validation_ids), len(holdout_ids), sep='\n')

647
250
98


#### Create directories for train/breakbeat, validation/breakbeat and holdout/breakbeat if they don't exist

In [20]:
# Create the per-split output folders. exist_ok=True lets the cell be re-run
# safely and avoids the check-then-create race of testing os.path.exists first.
for split_dir in ('data/train/breakbeat',
                  'data/validation/breakbeat',
                  'data/holdout/breakbeat'):
    os.makedirs(split_dir, exist_ok=True)

#### Move train files from data/breakbeat to data/train/breakbeat

In [21]:
# Select the files of all training tracks with one vectorised membership test
# instead of rescanning the whole DataFrame once per track, then move each
# file from the source folder into the train folder.
train_track_ids = [t[0] for t in train_ids]
files_to_move = list(df.loc[df['track_id'].isin(train_track_ids), 'file_name'])
for f in files_to_move:
    src = 'data/breakbeat/{}'.format(f)
    dst = 'data/train/breakbeat/{}'.format(f)
    shutil.move(src, dst)

#### Move validation files from data/breakbeat to data/validation/breakbeat

In [22]:
# Select the files of all validation tracks with one vectorised membership
# test instead of rescanning the whole DataFrame once per track, then move
# each file from the source folder into the validation folder.
validation_track_ids = [t[0] for t in validation_ids]
files_to_move = list(df.loc[df['track_id'].isin(validation_track_ids), 'file_name'])
for f in files_to_move:
    src = 'data/breakbeat/{}'.format(f)
    dst = 'data/validation/breakbeat/{}'.format(f)
    shutil.move(src, dst)

#### And finally the holdout files...

In [23]:
# Select the files of all holdout tracks with one vectorised membership test
# instead of rescanning the whole DataFrame once per track, then move each
# file from the source folder into the holdout folder.
holdout_track_ids = [t[0] for t in holdout_ids]
files_to_move = list(df.loc[df['track_id'].isin(holdout_track_ids), 'file_name'])
for f in files_to_move:
    src = 'data/breakbeat/{}'.format(f)
    dst = 'data/holdout/breakbeat/{}'.format(f)
    shutil.move(src, dst)