### Imports

In [6]:
import os
import glob
import pandas as pd
import shutil
from boto.s3.connection import S3Connection
from boto.s3.key import Key

#### Get list of files in genre folder

In [18]:
# Base paths (presumably the S3 bucket mount and the notebook data folder — confirm).
# NOTE(review): os/shutil functions do not expand '~', and '~/home/ubuntu' looks like it
# was meant to be '/home/ubuntu' — confirm. Neither variable is referenced by the cells
# visible in this notebook, which use relative 'data/...' paths instead.
s3dir = '~/home/ubuntu/s3bucket'
data_dir = '~/home/ubuntu/Notebooks/data'

In [2]:
# Genre processed by this run; it is substituted into every 'data/...' path used below.
# NOTE(review): the saved outputs further down list dancehall_ragga files, so they were
# produced with a different value than the one set here.
genre_name = 'deep_house'

In [3]:
# os.listdir returns entries in arbitrary order; sorting makes file_list[0] and
# the row order of everything derived from this list the same on every run.
file_list = sorted(os.listdir('data/{}'.format(genre_name)))

Split a file name into its components: genre, track ID and spectrogram ID

In [4]:
# Worked example on the first file: the name carries three '__'-separated
# fields, and the last one still has the file extension attached.
split_file_name = file_list[0].split('__')
genre, track_id = split_file_name[0], split_file_name[1]
spectro_id = split_file_name[2].split('.', 1)[0]

In [5]:
# Parse every file name of the form '<genre>__<track_id>__<spectro_id>.<ext>'
# into one record (dict) per spectrogram image.
list_of_files = []
for f in file_list:
    split_f = f.split('__')
    if len(split_f) < 3:
        # os.listdir returns every directory entry, so hidden files or
        # sub-folders can show up here; they lack the three '__'-separated
        # fields and would otherwise raise IndexError below.
        continue
    genre = split_f[0]
    track_id = split_f[1]
    # Drop the file extension from the last field.
    spectro_id = split_f[2].split('.')[0]
    file_name = f

    track_dict = {
        'file_name': file_name,
        'track_id': track_id,
        'genre': genre,
        'spectro_id': spectro_id}

    list_of_files.append(track_dict)

In [6]:
# Show the first parsed record as a sanity check.
list_of_files[0]

{'file_name': 'dancehall_ragga__101011__000.png',
 'genre': 'dancehall_ragga',
 'spectro_id': '000',
 'track_id': '101011'}

#### Move data to DataFrame

In [7]:
# Name the columns explicitly so the frame has the same column order on every
# pandas version and still carries the expected columns when list_of_files is empty.
df = pd.DataFrame(
    list_of_files,
    columns=['file_name', 'genre', 'spectro_id', 'track_id'])

In [8]:
# Fixed seed so the preview shows the same five rows on every run.
df.sample(5, random_state=0)

Unnamed: 0,file_name,genre,spectro_id,track_id
3877,dancehall_ragga__206627__012.png,dancehall_ragga,12,206627
8174,dancehall_ragga__261283__011.png,dancehall_ragga,11,261283
4365,dancehall_ragga__215565__017.png,dancehall_ragga,17,215565
17010,dancehall_ragga__407318__000.png,dancehall_ragga,0,407318
6719,dancehall_ragga__239718__002.png,dancehall_ragga,2,239718


#### Calculate the number of spectrograms for each unique track

In [9]:
# One row per track: the track ID and how many spectrogram files it has.
num_files = (
    df.groupby('track_id')['file_name']
    .count()
    .reset_index(name='num_spectro')
)

In [10]:
# Show the first few per-track spectrogram counts.
num_files.head()

Unnamed: 0,track_id,num_spectro
0,101011,23
1,101053,23
2,104703,23
3,109111,23
4,109112,23


#### Work out the number of files needed in the train, validation and holdout folders

In [11]:
# Total number of spectrogram images across all tracks.
total_num_files = num_files['num_spectro'].sum()

In [12]:
# Fraction of all spectrogram images assigned to each split.
train_pct = 0.65
validation_pct = 0.25
holdout_pct = 0.1

# The three fractions must cover the whole data set; the tolerance absorbs
# floating-point rounding in the sum.
assert abs(train_pct + validation_pct + holdout_pct - 1.0) < 1e-9, \
    'train/validation/holdout fractions must sum to 1'

In [13]:
# Image budgets for the train and validation splits. int() truncates, so each
# budget is rounded down. No threshold is computed for the holdout split here.
train_image_threshold = int(train_pct * total_num_files)
validation_image_threshold = int(validation_pct * total_num_files)

In [14]:
# Map each track ID to its number of spectrograms.
tracks_dict = {
    track: n_spectro
    for track, n_spectro in zip(num_files['track_id'], num_files['num_spectro'])
}

#### Loop through tracks_dict and make a note of track IDs that will go into the train dataset

In [15]:
# Walk the tracks in dict order, keeping a running image total; a track joins
# the train set only while that total stays within the train budget. The total
# keeps growing after the budget is passed, so no later track is added.
count = 0
train_ids = []
for track, n_spectro in tracks_dict.items():
    count += n_spectro
    if count > train_image_threshold:
        continue
    train_ids.append((track, n_spectro))

Then remove these track IDs from the dict...

In [16]:
# Drop every train track from tracks_dict (mutates it in place); the None
# default makes a missing key a no-op.
for train_track in train_ids:
    tracks_dict.pop(train_track[0], None)

#### Do the same for the validation set

In [17]:
# Same running-total selection as for the train set, applied to the tracks
# still left in tracks_dict and bounded by the validation budget.
count = 0
validation_ids = []
for track, n_spectro in tracks_dict.items():
    count += n_spectro
    if count > validation_image_threshold:
        continue
    validation_ids.append((track, n_spectro))

In [18]:
# Drop every validation track from tracks_dict (mutates it in place); the None
# default makes a missing key a no-op.
for validation_track in validation_ids:
    tracks_dict.pop(validation_track[0], None)

#### Then move the rest into the holdout set

In [19]:
# Whatever is still in tracks_dict becomes the holdout set, stored as
# (track_id, num_spectro) pairs.
holdout_ids = list(tracks_dict.items())

Number of tracks in each set...

In [20]:
# Track counts per split, one per line: train, validation, holdout.
for split_ids in (train_ids, validation_ids, holdout_ids):
    print(len(split_ids))

643
248
100


#### Create the train, validation and holdout directories for the current genre (`genre_name`) if they don't exist

In [21]:
# Destination folder of each split for the current genre.
train_dir = 'data/train/{}'.format(genre_name)
validation_dir = 'data/validation/{}'.format(genre_name)
holdout_dir = 'data/holdout/{}'.format(genre_name)

# Create whichever of the three folders is missing (including parent folders).
for split_dir in (train_dir, validation_dir, holdout_dir):
    if os.path.exists(split_dir):
        continue
    os.makedirs(split_dir)

#### Move train files from `data/{genre_name}` to `data/train/{genre_name}`

In [22]:
# Gather every train track ID up front so the DataFrame is filtered once,
# instead of being re-scanned for each track.
train_track_ids = [t[0] for t in train_ids]
files_to_move = list(df.loc[df['track_id'].isin(train_track_ids), 'file_name'])

for f in files_to_move:
    src = os.path.join('data', genre_name, f)
    # Use the directory variable created earlier rather than rebuilding the path.
    dst = os.path.join(train_dir, f)
    # A file that exists at the destination but no longer at the source was
    # moved by an earlier run of this cell; skipping it keeps the cell re-runnable.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

#### Move validation files from `data/{genre_name}` to `data/validation/{genre_name}`

In [23]:
# Gather every validation track ID up front so the DataFrame is filtered once,
# instead of being re-scanned for each track.
validation_track_ids = [t[0] for t in validation_ids]
files_to_move = list(df.loc[df['track_id'].isin(validation_track_ids), 'file_name'])

for f in files_to_move:
    src = os.path.join('data', genre_name, f)
    # Use the directory variable created earlier rather than rebuilding the path.
    dst = os.path.join(validation_dir, f)
    # A file that exists at the destination but no longer at the source was
    # moved by an earlier run of this cell; skipping it keeps the cell re-runnable.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)

#### And finally the holdout files...

In [25]:
# Gather every holdout track ID up front so the DataFrame is filtered once,
# instead of being re-scanned for each track.
holdout_track_ids = [t[0] for t in holdout_ids]
files_to_move = list(df.loc[df['track_id'].isin(holdout_track_ids), 'file_name'])

for f in files_to_move:
    src = os.path.join('data', genre_name, f)
    # Use the directory variable created earlier rather than rebuilding the path.
    dst = os.path.join(holdout_dir, f)
    # A file that exists at the destination but no longer at the source was
    # moved by an earlier run of this cell; skipping it keeps the cell re-runnable.
    if os.path.exists(dst) and not os.path.exists(src):
        continue
    shutil.move(src, dst)