In [33]:
import logging
logging.basicConfig(level = logging.INFO , format=
        '%(asctime)s:%(levelname)s:%(name)s:%(threadName)s:line %(lineno)d: %(message)s')
logger = logging.getLogger(__name__)

import cPickle
import numpy as np
import math
import json
import sys
import pandas as pd

import keras
from keras.layers import Input, Embedding, Conv1D, GlobalMaxPool1D, Dense, GlobalAvgPool1D, Dropout
from keras.layers import concatenate
from keras.models import Model
from keras.preprocessing import sequence
from keras import regularizers
from keras.engine.topology import Layer
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger

from util import plot_model, plot_metric, save_code, fill_dict
from util.archiver import get_archiver
import config as c

MAX_CHORDS = None
MAX_LABELS = None
NUM_NOTES = 88
NUM_DIM = 1024

M1 = M2 = W = b2 = None
data= train= test= valid= MAX_CHORDS = None
labels= y_train= y_test= y_valid= MAX_LABELS= index2label= labels2index = None
train_weights= None

class LogSumExpPooling(Layer):

    def call(self, x):
        # could be axis 0 or 1
        return tf.reduce_logsumexp(x, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[:1]+input_shape[2:]

def get_conv_stack(input_layer, filters, kernel_sizes, activation, kernel_l2_regularization, dropout_rate):
    layers = [Conv1D(activation=activation, padding='same', strides=1, filters=filters, kernel_size = size,
                kernel_regularizer=regularizers.l2(kernel_l2_regularization))(input_layer) for size in kernel_sizes]
    if (len(layers) <= 0):
        return input_layer
    elif (len(layers) == 1):
        return Dropout(dropout_rate, noise_shape=None, seed=None)(layers[0])
    else:
        return Dropout(dropout_rate, noise_shape=None, seed=None)(concatenate(layers))

def get_model(embeddings=True, dilated_convs=False):
    params = {k:v for k,v in locals().iteritems() if k!='weights'}
    x = Input(shape=(MAX_CHORDS,NUM_NOTES), dtype='float32')
    if embeddings:
        y1 = Dense(NUM_DIM, activation='linear', use_bias=False, weights=[M1], trainable=False)(x)
    else:
        y1 = x
    y2 = get_conv_stack(y1, 5, range(1,4), 'relu', 0.00001, 0.5)
    y3 = GlobalMaxPool1D()(y2)
    y = Dense(MAX_LABELS, activation='sigmoid')(y3)
    model = Model(x, y)
    adam = Adam(lr=c.lr)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=c.metrics)
    return (model, params)

def load_embeddings(embeddings_path='/home/yg2482/code/chord2vec/data/chord2vec_199.npz'):
    logger.debug('loading embeddings from: '+embeddings_path)
    global M1, M2, W, b2
    npzf = np.load(embeddings_path)
    M1 = npzf['wM1']
    M2 = npzf['wM2']
    W = npzf['wW']
    b2 = npzf['bM2']

def indices2multihot(x, r, dtype=np.float32):
    v = np.zeros(r, dtype=dtype)
    # x should belong to [1, 88]
    x = filter(lambda x: x>0, x)
    x = filter(lambda x: x<=r, x)
    # decrease x to make in [0, 87]
    x = map(lambda x: int(x-1), x)
    v[x] = 1
    return v

def square3D(x, maxlen=None, dtype=np.float32):
    if maxlen is None:
        maxlen = []
        maxlen.append(len(x))
        maxlen.append(max([0]+[len(song) for song in x]))
        maxlen.append(max([0]+[max([0]+[len(chord) for chord in song]) for song in x]))

    x_np = np.zeros(maxlen, dtype=dtype)

    for i in range(maxlen[0]):
        for j in range(maxlen[1]):
            for k in range(maxlen[2]):
                try:
                    x_np[i][j][k] = x[i][j][k]
                except IndexError:
                    break
    return x_np

def multihot3D(x, r, maxlen=None, dtype=np.float32):
    f1D = lambda chord: indices2multihot(chord,r,dtype)
    f2D = lambda song:map(f1D, song)
    x_mh = map(f2D, x)
    return x_mh
    # return square3D(x_mh, maxlen=maxlen, dtype=dtype)

def load_data(x_datapath='data/X.pickle', y_datapath='data/y.pickle', cut=1.0,
        load_train=True, train_params_path='data/train_params.npz', filter_majority=False):
    '''
    x_datapath : path for X.pickle
    y_datapath : path for y.pickle
    cut : fraction in [0.0, 1.0] to load less data if required.
    '''
    global data, train, test, valid, MAX_CHORDS
    global labels, y_train, y_test, y_valid, MAX_LABELS, index2label, labels2index
    global train_weights

    logger.debug('loading data from: '+x_datapath)
    data = cPickle.load(open(x_datapath))
    train = data['train'] if load_train else None
    test, valid = data['test'], data['valid']
    if(cut<1.0):
        cutf = lambda x, c: x[:int(len(x)*cut)]
        train = cutf(train, cut)
        valid = cutf(valid, cut)
        test = cutf(test, cut)
        data2 = {'train':train, 'valid':valid, 'test':test}
        cPickle.dump(data2, open(x_datapath+str(cut)+'.pickle', 'w'))
        
    train = multihot3D(train, NUM_NOTES) if load_train else None
    test  = multihot3D(test, NUM_NOTES)
    valid = multihot3D(valid, NUM_NOTES)
    maxlen2D = lambda x : max([len(s) for s in x])
    MAX_CHORDS = max( map(maxlen2D, [train, test, valid]))
    # TODO: NORMALIZE!!!

    logger.debug('loading labels from: '+y_datapath)
    labels = cPickle.load(open(y_datapath))
    if(cut<1.0):
        cutf = lambda x, c: x[:int(len(x)*cut)]
        train = cutf(labels['train'], cut)
        valid = cutf(labels['valid'], cut)
        test = cutf( labels['test'], cut)
        labels2 = {'train':train, 'valid':valid, 'test':test}
        cPickle.dump(labels2, open(y_datapath+str(cut)+'.pickle', 'w'))

    s = set()
    for k,v in labels.iteritems():
        for y in v:
            s.add(y)
    l = list(enumerate(s))
    _index2label = {k:v for k,v in l}
    index2label =  lambda x : _index2label[x]
    _labels2index = {v:k for k,v in l}
    labels2index = lambda x : _labels2index[x]

    MAX_LABELS = len(_labels2index)

    y_train = to_categorical(map(labels2index, labels['train']), MAX_LABELS)
    y_test = to_categorical(map(labels2index, labels['test']), MAX_LABELS)
    y_valid = to_categorical(map(labels2index, labels['valid']), MAX_LABELS)
    train_weights = dict(enumerate(np.load(train_params_path)['train_weights']))
    
    if(filter_majority):
        def filter_majority_function(X,y):
            idxs = map(lambda x:x[1]==1,y)
            filter_indices = lambda data : [x for i,x in enumerate(data) if i in idxs]
            return filter_indices(X), filter_indices(y)
        (train, y_train) = filter_majority_function(train, y_train)
        (test, y_test) = filter_majority_function(test, y_test)
        (valid, y_valid) = filter_majority_function(valid, y_valid)

class DataManager():
    def __init__(self, inputs, targets, batch_size=128, maxepochs=10, transforms=lambda x:x):
        self.datasize = len(inputs)
        assert self.datasize == len(targets), 'size of targets should be the same as inputs'
        self.inputs = inputs
        self.targets = targets
        self.batch_size = batch_size
        self.maxepochs = maxepochs
        self.num_batches = int(math.ceil(float(self.datasize)/batch_size))
        if(callable(transforms)):
            transforms = [transforms, transforms]
        assert type(transforms)==list, 'transforms should be a *callable* or *list* of two callables'
        assert len(transforms)==2, 'transforms should be a callable or list of *two* callables'
        assert callable(transforms[0]) & callable(transforms[0]), 'transforms should be a callable or list of two *callables*'
        self.inputs_transform = transforms[0]
        self.targets_transform = transforms[1]
        logger.info('created a DataManager for batch_size: {}, maxepochs: {}, num_batches: {}'.format(batch_size, maxepochs, self.num_batches))

    def batch_generator(self):
        for epoch in range(self.maxepochs):
            for i in range(self.num_batches):
                logger.debug('loading batch {} of {}, epoch {}'.format(i, self.num_batches, epoch))
                start = i*self.batch_size
                end = (i+1)*self.batch_size
                inputs_batch =  self.inputs_transform(self.inputs[start:end])
                targets_batch =  self.targets_transform(self.targets[start:end])
                yield (inputs_batch, targets_batch)

def save_history(history, dirpath):
    with open(dirpath+'/training.json', 'w') as f:
        json.dump(history.params, f, indent=2)

    df = pd.DataFrame.from_dict(history.history)
    df.to_csv(dirpath+'/history.csv')
    i = df.loc[:, c.monitor].argmax()

    for m in c.metrics + ['loss']:
        plot_metric(df, m, i, dirpath)

    return

def run_experiment(**kwargs):    
    model, params = get_model( kwargs['embeddings'] )
    hyperparams = fill_dict(params, kwargs)
    
    transforms = [lambda x:sequence.pad_sequences(x, MAX_CHORDS), lambda y:y]
    dm_train = DataManager(train, y_train, batch_size=c.batch_size, maxepochs=c.epochs+1, transforms=transforms)
    dm_valid = DataManager(valid, y_valid, batch_size=c.batch_size, maxepochs=100*c.epochs+1, transforms=transforms)
    
    with get_archiver(datadir='data/models') as a1, get_archiver(datadir='data/results') as a:

        with open(a.getFilePath('hyperparameters.json'), 'w') as f:
            json.dump(hyperparams, f, indent=2)

        with open(a.getFilePath('model.json'), 'w') as f:
            f.write(model.to_json(indent=2))

        stdout = sys.stdout
        with open(a.getFilePath('summary.txt'), 'w') as sys.stdout:
            model.summary()
        sys.stdout = stdout

        plot_model(model, to_file=a.getFilePath('model.png'), show_shapes=True, show_layer_names=True)

        earlystopping = EarlyStopping(monitor=c.monitor, patience=c.patience, verbose=0, mode=c.monitor_objective)
        modelpath = a1.getFilePath('weights.h5')
        csvlogger = CSVLogger(a.getFilePath('logger.csv'))
        modelcheckpoint = ModelCheckpoint(modelpath, monitor=c.monitor, save_best_only=True, verbose=0, mode=c.monitor_objective)
        logger.info('starting training')
        logger.info(str((dm_train.num_batches, dm_valid.num_batches)))
        h = model.fit_generator(generator=dm_train.batch_generator(), steps_per_epoch=dm_train.num_batches, epochs=c.epochs,
                        validation_data=dm_valid.batch_generator(), validation_steps=dm_valid.num_batches,
                        callbacks=[earlystopping, modelcheckpoint, csvlogger], class_weight=train_weights )
    
        save_history(h, a.getDirPath())

def main():
    commit_hash = save_code()
    embeddings_path = '/home/yg2482/code/chord2vec/data/chord2vec_199.npz'
    x_datapath='data/X.001.pickle'
    y_datapath='data/y.001.pickle'
    load_embeddings(embeddings_path=embeddings_path)
    load_data(x_datapath=x_datapath, y_datapath=y_datapath)
    run_experiment(**locals())

In [34]:
embeddings_path = '../data2/chord2vec_30hr.npz'
x_datapath='../data2/X.pickle0.001.pickle'
y_datapath='../data2/y.pickle0.001.pickle'


In [35]:
load_embeddings(embeddings_path=embeddings_path)

In [None]:
load_data('../data2/X.pickle', '../data2/y.pickle', filter_majority=True,
          train_params_path='../data2/train_params.npz',)

In [None]:
print len(y_train), len(train)
print len(y_valid), len(valid)
print len(y_test), len(test)


In [38]:
run_experiment(embeddings = '../data2/chord2vec_30hr.npz')

2017-05-05 01:33:13,028:INFO:__main__:MainThread:line 198: created a DataManager for batch_size: 256, maxepochs: 201, num_batches: 1
2017-05-05 01:33:13,029:INFO:__main__:MainThread:line 198: created a DataManager for batch_size: 256, maxepochs: 20001, num_batches: 1
2017-05-05 01:33:13,242:INFO:__main__:MainThread:line 250: starting training
2017-05-05 01:33:13,244:INFO:__main__:MainThread:line 251: (1, 1)
2017-05-05 01:33:13,563:INFO:util.archiver:MainThread:line 41: archived directory: /home/yg2482/code/music-styles/src/data/results/archive/20170505_013313.tar
2017-05-05 01:33:13,670:INFO:util.archiver:MainThread:line 41: archived directory: /home/yg2482/code/music-styles/src/data/models/archive/20170505_013313.tar


Epoch 1/200


ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 arrays but instead got the following list of 2 arrays: [array([ 0.,  0.,  1.]), array([ 0.,  0.,  1.])]...

In [6]:
t1[[3,7]]

TypeError: list indices must be integers, not list