In [1]:
%matplotlib inline

import os
import math
import json
import pprint
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.ticker import FormatStrFormatter
from collections import Counter

In [2]:
# Notebook-wide matplotlib styling, applied in one call.
matplotlib.rcParams.update({
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'font.family': 'serif',
})

In [3]:
# Output directory handed to plot_bar / plot_hist (as `fig_dir`) and to
# statsitcs (as `info_dir`) by the analysis cell further down.
fig_dir = '../analysis'

def traverse_dir(root_dir, extension='.json', suffix=''):
    """Recursively collect files under ``root_dir`` by extension and name suffix.

    Args:
        root_dir: Directory to walk; every subdirectory is visited.
        extension: Only file names ending with this string are considered.
        suffix: Required ending of the file's base name, where the base name
            is the part of the file name before its first ``.``. The default
            empty suffix accepts every base name.

    Returns:
        A list with one path (walk root joined with the file name) per
        matching file, in the order ``os.walk`` yields them.
    """
    print('[*] Scanning...')
    file_list = []
    for root, _dirs, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(extension):
                continue
            base = file.split('.')[0]
            # str.endswith('') is always True. The slice comparison
            # base[-len(suffix):] == suffix is not: with an empty suffix the
            # slice is the whole base name, so no file would ever match.
            if base.endswith(suffix):
                file_list.append(os.path.join(root, file))

    return file_list

def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Occurrences are tallied in a single pass. When several elements share the
    highest count, the one that appears first in ``lst`` is returned, so the
    result does not depend on set/hash iteration order.

    Args:
        lst: Iterable of hashable elements.

    Returns:
        The most frequent element.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    counts = Counter(lst)
    # A Counter iterates in first-insertion order and max() keeps the first
    # maximal item it sees, which gives the first-occurrence tie-break.
    return max(counts, key=counts.get)

def plot_bar(content,
            title='',
            xlabel='',
            ylabel='',
            filename='test',
            fig_dir='./',
            tail_range=(0, None),
            is_xticks=True,
            kwargs=None,
            set_size=None,
            dpi=200):
    """Save a bar chart of how often each distinct value occurs in ``content``.

    Values are ranked by descending count, sliced with ``tail_range`` and
    drawn as one bar per remaining value.

    Args:
        content: Iterable of hashable values to tally.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        filename: Name of the image file written inside ``fig_dir``.
        fig_dir: Output directory; created if it does not exist.
        tail_range: ``(start, stop)`` slice bounds applied to the ranked
            values. The default keeps every value. (The previous default,
            ``(0, -1)``, silently dropped the least frequent value.)
        is_xticks: If true, label each bar with its value.
        kwargs: Optional dict of extra keyword arguments for ``plt.xticks``.
        set_size: Optional ``(width, height)`` passed to ``set_size_inches``.
        dpi: Resolution passed to ``savefig``.

    Returns:
        Dict mapping each plotted value to its count, in plotting order.
    """
    os.makedirs(fig_dir, exist_ok=True)

    # most_common() sorts by descending count and keeps first-seen order
    # among equal counts.
    ranked = Counter(content).most_common()
    count_s = dict(ranked[tail_range[0]:tail_range[1]])
    positions = range(len(count_s))

    fig = plt.figure()
    try:
        # Size the figure before laying it out so tight_layout works on the
        # final dimensions.
        if set_size:
            fig.set_size_inches(set_size[0], set_size[1])
        # Pass real lists: dict views are not sequences.
        plt.bar(positions, list(count_s.values()), align='center')
        if is_xticks:
            plt.xticks(positions, list(count_s.keys()), **(kwargs or {}))
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.tight_layout()
        fig.savefig(os.path.join(fig_dir, filename), bbox_inches='tight', dpi=dpi)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
    return count_s

def plot_hist(content,
            title='',
            xlabel='',
            ylabel='',
            bins=40,
            filename='test',
            fig_dir='./',
            xlim=None,
            xtick_resol=10,
            dpi=200):
    """Save a histogram of ``content`` to ``fig_dir/filename``.

    Args:
        content: Non-empty sequence of numbers.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        bins: Passed to ``plt.hist``.
        filename: Name of the image file written inside ``fig_dir``.
        fig_dir: Output directory; created if it does not exist, as
            ``plot_bar`` does.
        xlim: Optional x-axis limits passed to ``plt.xlim``.
        xtick_resol: Step between x ticks, which run from ``min(content)``
            up to ``max(content)``.
        dpi: Resolution passed to ``savefig``.

    Raises:
        ValueError: If ``content`` is empty (from ``min``/``max``).
    """
    os.makedirs(fig_dir, exist_ok=True)

    # Draw into a dedicated figure rather than whatever figure is current.
    fig = plt.figure()
    try:
        plt.hist(content, bins=bins)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        tick_style = {'fontsize': 9, 'rotation': 90}
        plt.xticks(np.arange(min(content), max(content) + 1, xtick_resol), **tick_style)
        if xlim:
            plt.xlim(xlim)
        # Lay out once, after the rotated tick labels exist.
        plt.tight_layout()
        fig.savefig(os.path.join(fig_dir, filename), bbox_inches='tight', dpi=dpi)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)
    
def statsitcs(content, filename, info_dir):
    """Summarise ``content``, save the summary as JSON and pretty-print it.

    The function name keeps its existing spelling so current callers work.

    Args:
        content: Non-empty sequence of numeric values.
        filename: Base name of the output file; ``.json`` is appended.
        info_dir: Directory for the JSON file; created if it does not exist.

    Returns:
        The summary dict with keys ``num_unique``, ``min``, ``max``,
        ``most common``, ``mean``, ``median`` and ``std``.
    """
    info = {
        'num_unique': len(set(content)),
        'min': min(content),
        'max': max(content),
        'most common': most_common(content),
        # Cast NumPy scalars to plain floats: not every NumPy scalar type is
        # JSON-serialisable, and plain floats pretty-print as bare numbers.
        'mean': float(np.mean(content)),
        'median': float(np.median(content)),
        'std': float(np.std(content)),
    }

    os.makedirs(info_dir, exist_ok=True)
    with open(os.path.join(info_dir, filename + '.json'), "w") as f:
        json.dump(info, f)
    pprint.pprint(info)
    return info

In [6]:
# Locate every '*_symbol_key.json' file under the dataset root.
root_dir = '../datasets'
path_events = os.path.join(root_dir, 'event_list.json')
event_list = traverse_dir(root_dir, suffix='_symbol_key')

# Store the discovered paths on disk, then read them back so the list used
# below is exactly what was saved.
with open(path_events, "w") as f:
    json.dump(event_list, f)
with open(path_events, "r") as f:
    event_list = json.load(f)

print(len(event_list))

# Parse each listed file; the parsed objects are collected in listing order.
events = []
for event in event_list:
    with open(event, "r") as f:
        loaded = json.load(f)
    events.append(loaded)
        

[*] Scanning...
20


In [7]:
# plot bar
# Each entry: (values, title / output file name, xtick kwargs or None)
todo_list = [
    ([int(e['metadata']['beats_in_measure']) for e in events], 'beats_in_measure', None),
    ([e['metadata']['key'] for e in events], 'key', None),
    ([e['metadata']['mode'] for e in events], 'mode', None),
    ([e['version'] for e in events], 'version', None),
    ([int(e['num_measures']) for e in events], 'num_measures', {'fontsize': 5.5, 'rotation': 90}),
]

for item in todo_list:
    info_list, title, kwargs = item
    count = plot_bar(
        info_list,
        title=title,
        ylabel='num of sections',
        filename=title,
        fig_dir=fig_dir,
        kwargs=kwargs,
    )

# This runs once, after the loop, so it reports only on the values the last
# iteration left in `title` / `info_list` (the 'num_measures' entry).
# NOTE(review): confirm that per-entry statistics are not wanted here.
print('\n=[%s]=========' % title)
statsitcs(info_list, title, fig_dir)

# plot hist
# Each entry: (values, title / output file name, bins, xlim, xtick_resol)
todo_list = [
    ([float(e['metadata']['BPM']) for e in events], 'bpm', 40, (30, 270), 12),
    ([len(e['tracks']['melody']) for e in events], 'number_of_notes', 200, (0, 300), 10),
    ([len(e['tracks']['chord']) for e in events], 'number_of_chord', 250, (0, 100), 5),
]

for item in todo_list:
    info_list, title, bins, xlim, xtick_resol = item
    print('\n=[%s]=========' % title)
    plot_hist(
        info_list,
        title=title,
        xlabel='',
        ylabel='num of sections',
        filename=title,
        fig_dir=fig_dir,
        xlim=xlim,
        xtick_resol=xtick_resol,
        bins=bins,
        dpi=200,
    )

    statsitcs(info_list, title, fig_dir)


{'max': 32,
 'mean': 11.65,
 'median': 8.0,
 'min': 1,
 'most common': 8,
 'num_unique': 9,
 'std': 6.980508577460528}

{'max': 222.0,
 'mean': 138.5,
 'median': 133.0,
 'min': 77.0,
 'most common': 128.0,
 'num_unique': 14,
 'std': 35.289516856993096}

{'max': 360,
 'mean': 72.45,
 'median': 46.5,
 'min': 0,
 'most common': 0,
 'num_unique': 16,
 'std': 91.5076362933717}

{'max': 112,
 'mean': 15.7,
 'median': 10.5,
 'min': 0,
 'most common': 8,
 'num_unique': 14,
 'std': 22.947984660967506}
