In [1]:
import os
import os.path
import pathlib
import glob
from pathlib import Path
import glob
import psutil
import math

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf
import tensorflow_io as tfio

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display

# Set the seed value for experiment reproducibility.
seed = 4385789
tf.random.set_seed(seed)
np.random.seed(seed)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


In [2]:
data_dir = 'Z:/AutoMusic/output_3/'
cache_dir = 'Z:/AutoMusic/cache_3/'
test_file = data_dir+"/high_up/high_up.02-block_and_crown-remember_the_good_times_(_057672_064481.wav"
pioneer_files = 'd:/pioneer/usbanlz'
pioneer_prefix = 'D:'
temp_wav_file = 'Z:/AutoMusic/temp/temp.wav'
mytotalfiles = 934 # I know this from previous runs.
EPOCHS = 200 # using early stopping, but just in case

In [3]:
# Find the next history file number

historycounter = 1 # start looking at this number as history1.txt

Path("./Histories").mkdir(parents=True, exist_ok=True)

while os.path.isfile("./Histories/history"+str(historycounter)+".txt"):
    historycounter += 1

history_file = "./Histories/history"+str(historycounter)+".txt"
print("Using run history file: "+history_file)

checkpoint_filepath = 'Z:/AutoMusic/checkpoint/automusic'+str(historycounter)+'.h5'
print("Using model save/checkpoint file: "+checkpoint_filepath)

# Roughly figure out if the dataset will fit into memory, or we need to use disk caching

free_mem = psutil.virtual_memory()
free_mem = math.floor(free_mem.available/1024/1024/1024*0.90)

samples_size = sum(f.stat().st_size for f in Path(data_dir).glob('**/*') if f.is_file())
samples_size = math.ceil((samples_size/1024/1024/1024)*1.05)

if samples_size < free_mem:
    print("Sample size of "+str(samples_size)+"GB should fit in free memory of "+str(free_mem)+"GB - using RAM to cache")
    cache_dir = ''
else:
    print("Sample size of "+str(samples_size)+"GB will not fit in free memory of "+str(int(free_mem))+"GB - using cache dir "+cache_dir)

Using run history file: ./Histories/history39.txt
Using model save/checkpoint file: Z:/AutoMusic/checkpoint/automusic39.h5
Sample size of 65GB should fit in free memory of 94GB - using RAM to cache


In [4]:
commands = np.array(tf.io.gfile.listdir(str(data_dir)))
commands = commands[(commands != 'README.md') & (commands != '.DS_Store')]
print('Directories/Labels:', commands)

Directories/Labels: ['high_chorus' 'high_down' 'high_intro' 'high_outro' 'high_up']


In [5]:
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    batch_size=32,
    validation_split=0.2,
    seed=0,
    output_sequence_length=44100*3,
    subset='both')

label_names = np.array(train_ds.class_names)
print()
print("label names:", label_names)


Found 123264 files belonging to 5 classes.
Using 98612 files for training.
Using 24652 files for validation.

label names: ['high_chorus' 'high_down' 'high_intro' 'high_outro' 'high_up']


In [6]:
def squeeze(audio, labels):
  # audio = tf.squeeze(audio, axis=-1)
  audio = audio[:,:,-1]
  return audio, labels

train_ds = train_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE)

In [7]:
test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)

In [8]:
for example_audio, example_labels in train_ds.take(1):  
  print(example_audio.shape)
  print(example_labels.shape)

(32, 132300)
(32,)


In [9]:
class LogMelSpectrogram(tf.keras.layers.Layer):
    """Compute log-magnitude mel-scaled spectrograms."""

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super(LogMelSpectrogram, self).__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max if f_max else sample_rate / 2
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        self.non_trainable_weights.append(self.mel_filterbank)
        super(LogMelSpectrogram, self).build(input_shape)

    def call(self, waveforms):
        """Forward pass.

        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A Batch of mono waveforms.

        Returns
        -------
        log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
            The corresponding batch of log-mel-spectrograms
        """
        def _tf_log10(x):
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        def power_to_db(magnitude, amin=1e-16, top_db=80.0):
            """
            https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
            I think this is a function in the TF supports now?
            """
            ref_value = tf.reduce_max(magnitude)
            log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
            log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

            return log_spec

        spectrograms = tf.signal.stft(waveforms,
                                      frame_length=self.fft_size,
                                      frame_step=self.hop_size,
                                      pad_end=False)

        magnitude_spectrograms = tf.abs(spectrograms)

        mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
                                     self.mel_filterbank)

        log_mel_spectrograms = power_to_db(mel_spectrograms)

        # add channel dimension
        log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)

        return log_mel_spectrograms

    def get_config(self):
        config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        config.update(super(LogMelSpectrogram, self).get_config())

        return config

In [10]:
input_shape = example_audio.shape[1:]
input_length = len(input_shape)
num_labels = len(label_names)

frame_length=1024
frame_step=64 # 128
fft_length=None
sample_rate=44100
duration=3
num_mel_channels=80
freq_min=40 # 200
freq_max=10000 # 8000
    
def ConvModel(n_classes=num_labels, sample_rate=sample_rate, duration=duration,
              fft_size=frame_length, hop_size=frame_step, n_mels=num_mel_channels):
    
    n_samples = sample_rate * duration
    
    # Accept raw audio data as input
    x = layers.Input(shape=(n_samples,), name='input', dtype='float32')
    y = LogMelSpectrogram(sample_rate, fft_size, hop_size, n_mels, freq_min, freq_max)(x)
    y = layers.Resizing(160,80)(y)
    y = layers.BatchNormalization(axis=2)(y)
    y = layers.GaussianNoise(1.5)(y)
    y = layers.Conv2D(32, 3, activation='relu')(y)
    y = layers.Conv2D(64, 3, activation='relu')(y)
    y = layers.MaxPooling2D()(y)
    y = layers.SpatialDropout2D(0.25)(y)
    y = layers.Flatten()(y)
    y = layers.Dense(128, activation='relu')(y)
    y = layers.GaussianDropout(0.5)(y)
    y = layers.Dense(num_labels)(y)
    
    return tf.keras.Model(inputs=x, outputs=y)

model = ConvModel()

model.summary()

learning_rate = 0.001 # 0.0001

model.compile(
    optimizer=tf.keras.optimizers.Adam(), # learning_rate=learning_rate
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 132300)]          0         
                                                                 
 log_mel_spectrogram (LogMel  (None, 2052, 80, 1)      0         
 Spectrogram)                                                    
                                                                 
 resizing (Resizing)         (None, 160, 80, 1)        0         
                                                                 
 batch_normalization (BatchN  (None, 160, 80, 1)       320       
 ormalization)                                                   
                                                                 
 gaussian_noise (GaussianNoi  (None, 160, 80, 1)       0         
 se)                                                             
                                                             

In [11]:
if cache_dir != '':
    for f in Path(cache_dir).glob('*'):
        try:
            print("Removing cache file "+str(f))
            f.unlink()
        except OSError as e:
            print("Error: %s : %s" % (f, e.strerror))
    
train_ds = train_ds.cache(cache_dir).shuffle(buffer_size=1000, reshuffle_each_iteration=True).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache(cache_dir).prefetch(tf.data.AUTOTUNE)
test_ds_precache = test_ds
test_ds = test_ds.cache(cache_dir).prefetch(tf.data.AUTOTUNE)

In [12]:
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1)

earlystop_callback = tf.keras.callbacks.EarlyStopping(
    verbose=1,
    patience=10,
    min_delta=0.003,
    restore_best_weights=True)

# This may be a bad idea:
# we're still retraining on ALL data, but this seems to speed up the epochs needed:
# model.load_weights(checkpoint_filepath) # kick off training with the last run's weights

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[earlystop_callback,checkpoint_callback]
)

# shouldn't be needed with restore_best_weights=True
# model.load_weights(checkpoint_filepath) # load the best saved weights... even if they aren't from this run, maybe?

Epoch 1/200
Epoch 1: val_accuracy improved from -inf to 0.40885, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 2/200
Epoch 2: val_accuracy improved from 0.40885 to 0.44326, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 3/200
Epoch 3: val_accuracy improved from 0.44326 to 0.44911, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 4/200
Epoch 4: val_accuracy improved from 0.44911 to 0.49310, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 5/200
Epoch 5: val_accuracy improved from 0.49310 to 0.53369, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 6/200
Epoch 6: val_accuracy improved from 0.53369 to 0.57532, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 7/200
Epoch 7: val_accuracy improved from 0.57532 to 0.61485, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 8/200
Epoch 8: val_accuracy improved from 0.61485 to 0.64140, saving model to Z:/AutoMusic/checkpoint\automusic39.h5
Epoch 9/200
Epoch 9


KeyboardInterrupt



In [None]:
metrics = history.history
plt.figure(figsize=(16,6))
plt.subplot(1,2,1)
plt.plot(history.epoch, metrics['loss'], metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch')
plt.ylabel('Loss [CrossEntropy]')

plt.subplot(1,2,2)
plt.plot(history.epoch, 100*np.array(metrics['accuracy']), 100*np.array(metrics['val_accuracy']))
plt.legend(['accuracy', 'val_accuracy'])
plt.ylim([0, 100])
plt.xlabel('Epoch')
plt.ylabel('Accuracy [%]')

In [None]:
y_true = tf.concat(list(test_ds_precache.map(lambda s,lab: lab)), axis=0)
y_pred = model.predict(test_ds_precache)
y_pred = y_pred.argmax(axis=1)

confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx,
            xticklabels=label_names,
            yticklabels=label_names,
            annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.title('AutoMusic - Electronic Music Phrase Classifier')
plt.show()

In [None]:
x = test_file
x = tf.io.read_file(str(x))
x, sample_rate = tf.audio.decode_wav(x, desired_channels=1, desired_samples=44100*3)
x = tf.squeeze(x, axis=-1)
x = x[tf.newaxis,...]
print(x.shape)

prediction = model(x)
plt.bar(label_names, tf.nn.softmax(prediction[0]))
plt.show()

In [None]:
import pyrekordbox
from pyrekordbox.anlz import AnlzFile
from pydub import AudioSegment
from pydub.generators import WhiteNoise
from pydub.effects import speedup, normalize
import hashlib
import os
import random
from pathlib import Path
import logging
l = logging.getLogger("pydub.converter")
l.setLevel(logging.CRITICAL)

myfiles = pyrekordbox.anlz.walk_anlz_paths(pioneer_files)

myfilecounter = 0

with open(history_file,'w') as out:

    for myfoundfile in myfiles:

        try: mydat = AnlzFile.parse_file(myfoundfile[1]['DAT'])
        except: continue
        try: myext = AnlzFile.parse_file(myfoundfile[1]['EXT'])
        except: continue

        mymp3 = mydat.get('PPTH')
        mymp3 = pioneer_prefix + mymp3

        if "Ultimate" in str(mymp3):
            # this is very specific to my library where there's some weird files full of random loops
            continue

        # print(mymp3)

        mylabels = {}
        mylabels['high'] = ['unknown', 'intro', 'up', 'down', 'unknown', 'chorus', 'outro', 'unknown', 'unknown', 'unknown', 'unknown']
        mylabels['mid']  = ['unknown', 'intro', 'verse', 'verse', 'verse', 'verse', 'verse', 'verse', 'bridge', 'chorus', 'outro']
        mylabels['low']  = mylabels['mid']

        try: mytimecode = mydat.get('PQTZ')[2]
        except: continue
        try: mystructures = myext.get('PSSI').entries
        except: continue
        try: mymood = myext.get('PSSI').mood
        except: continue

        if (mymood != 1):
            continue
        else:
            mymood = "high"

        mytimecode = mydat.get('PQTZ')[2]

        myfilelastbeat = myext.get('PSSI').end_beat

        mysimplestructure = {}

        myfilecounter += 1

        for x in range(len(mystructures)):

            mylabel = mystructures[x].kind
            mylabel = mymood+"_"+mylabels[mymood][mylabel]

            myfirstbeat = mystructures[x].beat

            if (x+1<len(mystructures)):
                mylastbeat = mystructures[x+1].beat
            else:
                mylastbeat = myfilelastbeat

            if (mylastbeat+1<len(mytimecode)):
                mylastbeat = mylastbeat
            else:
                mylastbeat = len(mytimecode)-1

            mysegmentbars = (mylastbeat-myfirstbeat)//8
            mysegmentleftovers = (mylastbeat-myfirstbeat)%8

            myfirsttimecode = mytimecode[int(myfirstbeat)]*1000
            mylasttimecode = mytimecode[int(mylastbeat)]*1000

            mysimplestructure[int(myfirsttimecode)] = mylabel
            mysimplestructure[int(mylasttimecode-1)] = mylabel

        mysong = AudioSegment.from_file(mymp3,frame_rate=44100)
        mono_audios = mysong.split_to_mono() 
        mysongmono = mono_audios[0]
        # chomp output with a random start time of 0..2999ms into the file, to give some variations.
        myrandomoffset = random.randint(0,2999)
        # mysongmono = mysongmono.normalize()

        mysongmono[myrandomoffset:].export(temp_wav_file,format="wav")

        x = tf.io.read_file(temp_wav_file)
        x, sample_rate = tf.audio.decode_wav(x, desired_channels=1)
        x = tf.squeeze(x, axis=-1)
        waveform = x

        slices = int(waveform.shape[0] / (44100*3))
        samples = tf.split(waveform[: slices * (44100*3)], slices)

        milliseconds = 0
        right = 0
        wrong = 0
        transitions = 0

        mylabelsseen = []
        fileview = "# "

        currentwrongs = 0

        for sample in samples:

            x = sample[tf.newaxis,...]

            prediction = model(x)

            # plt.bar(label_names, tf.nn.softmax(prediction[0]))
            # plt.show()

            # correct label for random offset above, because file may be skewed 1..2999ms ahead
            #
            res = mysimplestructure.get(milliseconds) or mysimplestructure[
                  min(mysimplestructure.keys(), key = lambda key: abs(key-milliseconds-myrandomoffset))]

            res2 = mysimplestructure.get(milliseconds) or mysimplestructure[
                   min(mysimplestructure.keys(), key = lambda key: abs(key-milliseconds-myrandomoffset+3000))]

            myaactualendlabel = str(res2)

            mypredictedlabel = str(label_names[np.argmax(prediction)])
            myactuallabel = str(res)

            if myactuallabel != myaactualendlabel :

                # print(" ~~~ "+mypredictedlabel+" in transition from "+myactuallabel+" to "+myaactualendlabel
                print("~", end="")
                print("~", end="",file=out)
                transitions += 1
                currentwrongs = 0

            if mypredictedlabel == myactuallabel :

                print("+", end="")
                print("+", end="",file=out)
                right += 1
                currentwrongs = 0

            else:

                wrong += 1
                myrandom = random.randint(0,9)

                if (myactuallabel not in mylabelsseen or currentwrongs > 0 or myrandom == 0): # 10% chance of random sampling

                    currentwrongs += 1

                    if milliseconds+3001 < len(mysong):

                        mylabelsseen.append(myactuallabel)
                        mylabeldir = myactuallabel

                        myoutputbasefile = myactuallabel+"."+Path(mymp3).stem+"_"+str(int(milliseconds)-myrandomoffset).rjust(6,'0')+"_"+str(int(milliseconds+3000)-myrandomoffset).rjust(6,'0')

                        # myspeedup = mysong[milliseconds:milliseconds+3000].speedup(1.03)
                        # mynoise = WhiteNoise().to_audio_segment(duration=len(myspeedup)).apply_gain(-20)
                        # myspeedup_noise = myspeedup.overlay(mynoise)

                        mytempsong = mysong[milliseconds:milliseconds+3000]
                        # mynoise = WhiteNoise().to_audio_segment(duration=len(mytempsong)).apply_gain(-20)
                        # mytempsong_noise = mytempsong.overlay(mynoise)

                        mytempsong.export(data_dir+"/"+mylabeldir+"/"+myoutputbasefile+"_FIXES.wav", format="wav")
                        # myspeedup.export(data_dir+"/"+mylabeldir+"/"+myoutputbasefile+"_speed_FIXES_N.wav", format="wav")
                        # myspeedup_noise.export(data_dir+"/"+mylabeldir+"/"+myoutputbasefile+"_speed_noise_FIXES_N.wav", format="wav")
                        # mytempsong_noise.export(data_dir+"/"+mylabeldir+"/"+myoutputbasefile+"_noise_FIXES_N.wav", format="wav")

                        if myrandom == 0:
                            print("R", end="")
                            print("R", end="",file=out)
                            currentwrongs = 0 # we only want one random sample, just to spice things up.
                            # this also functions as a "stop writing sequence of samples early" randomness.
                        else:
                            print("W", end="")
                            print("W", end="",file=out)

                        if currentwrongs > 5: # stop over-feeding a LOT of wrong samples to the model next time. Let myrandom take care of this.
                            currentwrongs = 0

                    else:
                        print("!", end="")
                        print("!", end="",file=out)
                else:
                    print("!", end="")
                    print("!", end="",file=out)

            milliseconds += 3000

        print(" ")
        print(" ",file=out)
        print(str(int(right/(right+wrong)*100))+"% correct - "+str(Path(mymp3).name)+" ("+str(myfilecounter)+" of "+str(mytotalfiles)+")")
        print(str(int(right/(right+wrong)*100))+"% correct - "+str(Path(mymp3).name)+" ("+str(myfilecounter)+" of "+str(mytotalfiles)+")",file=out)
        print(" ")
        print(" ",file=out)

In [None]:
counter = 32
histories = []

while os.path.isfile("./Histories/history"+str(counter)+".txt"):
    
    print("found history "+str(counter))
    histories.append(open("./Histories/history"+str(counter)+".txt",'r').readlines())
    counter += 1

best = 0

with open('./Histories/history_all.txt','w') as out:
    
    for z in range(len(histories[0])):
        
        for x in range(len(histories)):
            if histories[x][z][0].isdigit():
                testbest = int(histories[x][z].rstrip().partition("%")[0])
                if testbest > best:
                    best = testbest
        
        for x in range(len(histories)):
            if histories[x][z].rstrip() == "":
                print("",file=out)
                break
            else:
                if histories[x][z][0].isdigit():
                    testbest = int(histories[x][z].rstrip().partition("%")[0])
                    if best == testbest:
                        print("> ",file=out,end='')
                    else:
                        print("  ",file=out,end='')
                print(histories[x][z].rstrip(),file=out)
                
        best = 0
