# Génération de musique avec des LSTM

- installer le package music21 via conda/pip
- vérifier que les imports suivants passent bien

In [1]:
import glob
import pickle
import numpy as np

from music21 import converter, instrument, note, chord, stream
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import BatchNormalization as BatchNorm
from tensorflow.keras.utils import to_categorical
from tensorflow.python.keras import utils
from sklearn.preprocessing import OrdinalEncoder


- extrayons les notes de nos fichiers .mid (et sauvegardons le résultat, car l'extraction peut prendre du temps)
- un fichier .mid n'est pas un fichier "audio" comme un .mp3, mais plutôt une partition : il contient l'ensemble
des notes, mélodies, rythmes et instruments de la musique

In [2]:
%%time
# Extract every note and chord of the MIDI files in data/ as a flat list of
# string tokens, then cache that list on disk in the 'notes' pickle file.
notes = []

for file in glob.glob("data/*.mid"):
    try:
        midi = converter.parse(file)
    except Exception:
        # Skip files that cannot be parsed. Catching Exception rather than
        # using a bare except lets KeyboardInterrupt/SystemExit propagate,
        # so this long-running loop can still be interrupted.
        print("unable to read ", file)
        continue
    print("Parsing %s" % file)

    try:  # file has instrument parts
        s2 = instrument.partitionByInstrument(midi)
        # Only the first instrument part is used.
        notes_to_parse = s2.parts[0].recurse()
    except Exception:  # file has notes in a flat structure
        notes_to_parse = midi.flat.notes

    for element in notes_to_parse:
        if isinstance(element, note.Note):
            # A single note is stored as its pitch string.
            notes.append(str(element.pitch))
        elif isinstance(element, chord.Chord):
            # A chord is stored as its normal-order integers joined by '.'.
            notes.append('.'.join(str(n) for n in element.normalOrder))

# Persist the tokens so later cells can reload them without re-parsing.
with open('notes', 'wb') as filepath:
    pickle.dump(notes, filepath)


Parsing data/stella.mid
Parsing data/deadend.mid
Parsing data/bluesy2.mid
Parsing data/cherokee.mid
Parsing data/anotheru.mid
Parsing data/cherok2.mid
Parsing data/jazz07.mid
Parsing data/psr510_5.mid
Parsing data/yyz.mid
Parsing data/mechano.mid
Parsing data/misty.mid
Parsing data/hapbirth.mid
Parsing data/stthomas.mid
Parsing data/rndmdngt.mid
Parsing data/lazybird.mid
Parsing data/corea-2.mid
Parsing data/roadtrav.mid
Parsing data/dreamy2.mid
Parsing data/fransroc.mid
Parsing data/dolphns2.mid
Parsing data/chipblue.mid
Parsing data/don-lee.mid
Parsing data/gintonic.mid
Parsing data/jazz11.mid
Parsing data/worm.mid
Parsing data/011.mid
Parsing data/bigband.mid
Parsing data/yebisu.mid
Parsing data/onlylove.mid
Parsing data/knife.mid
Parsing data/jazz.mid
Parsing data/happybdy.mid
Parsing data/allblues.mid
Parsing data/pgunn.mid
Parsing data/gmblues1.mid
Parsing data/slanted.mid
Parsing data/alljarr.mid
Parsing data/opus1.mid
Parsing data/tarkus.mid
Parsing data/rainbow.mid
Parsing dat



Parsing data/newyork.mid




Parsing data/amazing.mid




Parsing data/dreamy.mid
Parsing data/wefreeki.mid
Parsing data/zanzibar.mid
Parsing data/nightuni.mid
Parsing data/almonds.mid
Parsing data/foregrou.mid
Parsing data/moaning.mid
Parsing data/ambiant.mid
Parsing data/metheny1.mid
Parsing data/someband.mid
Parsing data/shiny.mid
Parsing data/bounce.mid
Parsing data/favthngs.mid
Parsing data/dfreesax.mid
Parsing data/bop_on.mid
Parsing data/django.mid
Parsing data/prel-kis.mid
Parsing data/birdland.mid
Parsing data/lullaby.mid
Parsing data/fuzion.mid
Parsing data/atrain.mid
unable to read  data/spain.mid




unable to read  data/new_york.mid
Parsing data/minorsw.mid
Parsing data/teentown.mid
Parsing data/jobim.mid
Parsing data/ticotico.mid
Parsing data/bluset55.mid
Parsing data/suplex.mid
Parsing data/improv_1.mid
Parsing data/rnd_mid.mid
Parsing data/nuages.mid
Parsing data/improv_2.mid




Parsing data/4bros.mid
Parsing data/softly.mid
Parsing data/satindol.mid




Parsing data/crysslnc.mid
Parsing data/tribute.mid
Parsing data/dolphnst.mid
Parsing data/ipanema.mid




Parsing data/wivelove.mid
Parsing data/bethleem.mid




Parsing data/sthomas.mid
Parsing data/flyaway.mid




Parsing data/techno78.mid
Parsing data/gdvib6.mid
Parsing data/dgeneric.mid
Parsing data/autmnlev.mid
Parsing data/youdbe.mid
Parsing data/byt.mid




Parsing data/pop.mid
Parsing data/fordebby.mid




Parsing data/monk.mid
Parsing data/israel.mid
Parsing data/dontget.mid




Parsing data/corea-1.mid




Parsing data/onestep.mid
Parsing data/count.mid
Parsing data/ckick.mid
Parsing data/ppanth2.mid
Parsing data/dontmean.mid
Parsing data/stolen.mid
Parsing data/take5.mid




Parsing data/samba1n.mid
Parsing data/georgiam.mid
Parsing data/ppanth.mid




Parsing data/newhit.mid
CPU times: user 9min 35s, sys: 2.37 s, total: 9min 38s
Wall time: 9min 37s


In [3]:
def load_notes(path='notes'):
    """Load the pickled list of note/chord tokens.

    Args:
        path: Pickle file to read. Defaults to 'notes', the file name used
            when the extracted tokens are saved.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).

    Note:
        Unpickling can execute arbitrary code; only load files you created
        yourself.
    """
    with open(path, 'rb') as filepath:
        return pickle.load(filepath)


# Reload the token list from the pickle file instead of re-parsing the MIDI files.
notes = load_notes()


- qu'y a-t-il dans nos notes ?
    - format lettre+chiffre, éventuellement avec une altération (ex : A5, G#2, E-3) : code une note. La suite de notes forme la mélodie de la musique.
    - format chiffre.chiffre.chiffre (ex : 0.1.2, parfois un seul chiffre) : code un accord (l'harmonie), c'est-à-dire plusieurs notes superposées au même moment.

In [4]:
# Show the raw token list (note names and '.'-separated chord codes).
print(notes)


['G2', 'G1', 'D2', 'G#2', 'E-3', 'G#1', 'G2', 'G1', 'G2', 'G1', 'C2', 'G2', 'C1', 'F2', 'F1', 'B1', 'B-1', 'B-0', 'B-1', 'B-0', 'E-2', 'E-1', 'B-1', 'B-1', 'B-0', 'B-1', 'B-0', 'E-2', 'E-1', 'F2', 'F#2', 'E-1', 'G2', 'G#2', 'E-1', 'A2', 'B-2', 'E-1', 'D3', 'E-1', 'D2', 'D2', 'D1', 'F2', 'F2', 'D1', 'G2', 'G1', 'D3', 'B2', 'G1', 'G2', 'G1', 'C3', 'C1', 'B-2', 'G2', 'C1', 'E-2', 'C2', 'C1', 'D2', 'E-2', 'C1', 'G2', 'B-1', 'B-0', 'C#1', 'F1', 'B-1', 'B-0', 'G3', 'E-1', 'F3', 'E-3', 'E-1', 'C#3', 'E-1', 'G#2', 'G#1', 'E-3', 'C3', 'G#1', 'G#2', 'G#1', 'F3', 'C#1', 'E-3', 'C#3', 'C#1', 'B2', 'C#1', 'G3', 'E-1', 'F3', 'E-3', 'E-1', 'D3', 'E-1', 'C2', 'C1', 'D2', 'E-2', 'C1', 'E-2', 'C1', 'F2', 'F1', 'A1', 'B-1', 'F1', 'B1', 'D2', 'C2', 'D2', 'F1', 'C3', 'A2', 'F1', 'F2', 'F1', 'F2', 'F1', 'G#1', 'C2', 'F2', 'F1', 'D3', 'B-0', 'C3', 'B-2', 'B-0', 'G#2', 'B-0', 'E-3', 'E-1', 'D3', 'B-2', 'E-1', 'G2', 'E-2', 'E-1', 'F2', 'G2', 'E-1', 'B-2', 'F3', 'D1', 'E-3', 'D3', 'D1', 'C3', 'D1', 'B3', 'G1', 

- quelle est la taille de notre vocabulaire (le nombre de symboles différents que l'on devra encoder) ?

In [5]:
# Vocabulary size: the number of distinct note/chord tokens.
n_vocab = len(set(notes))
print(n_vocab)


1002


### Comment effectuer une tache de génération ?

Le réseau prend en entrée une suite de notes (par exemple, 100 notes), et doit prédire la 101ème.
Il faut donc :
- découper notre liste de notes en séquences de 100 notes (elles peuvent se chevaucher)
- associer à chaque séquence un label : la note qui la suit (la 101ème).
    - ex : une entrée est constituée des notes 0 à 99 et son label est la note 100. L'entrée suivante est constituée des notes 1 à 100 et son label est la note 101, etc.
- convertir les notes en un format utilisable par le réseau de neurones. Quel est le format adapté pour représenter des notes ?
    - onehot ou ordinal ?

- via l'encoder adapté de scikit, encodons les notes :

In [6]:
# Arrange the tokens as a single column (one token per row) for the encoder.
notes_scikit = np.array(notes).reshape(-1, 1)
enc = OrdinalEncoder()

# Fit the encoder on the tokens and replace each token by its numeric code.
notes_encoded = enc.fit_transform(notes_scikit)
# Display the categories learned by the encoder.
enc.categories_


[array(['0', '0.1', '0.1.2', ..., 'G5', 'G6', 'G7'], dtype='<U23')]

In [7]:
# Display the encoded tokens (one code per row).
notes_encoded


array([[996.],
       [995.],
       [953.],
       ...,
       [933.],
       [999.],
       [999.]])

- générer le jeu d'entraînement et les labels correspondants.
    - extraire des séquences de 100 notes comme entrées du réseau, et la note suivante comme label

In [8]:
# Build overlapping windows of `sequence_size` codes; the label of each
# window is the code that immediately follows it.
sequence_size = 100

# Pull the single column out of the encoded array once.
flat_codes = [row[0] for row in notes_encoded]
n_windows = len(flat_codes) - sequence_size

network_input = [
    flat_codes[begin:begin + sequence_size] for begin in range(n_windows)
]
network_output = [
    flat_codes[begin + sequence_size] for begin in range(n_windows)
]


- normalisons l'entrée entre 0 et 1, et mettons tout ça au format attendu par keras

In [9]:
# Reshape the windows to (n_patterns, sequence_size, 1) and scale the codes
# by the vocabulary size.
n_patterns = len(network_input)
x_train = np.reshape(
    network_input, (n_patterns, sequence_size, 1)) / float(n_vocab)
# One-hot encode the labels. The width is pinned to n_vocab so it always
# matches the Dense(n_vocab) output layer, even when the highest code never
# occurs as a label (to_categorical would otherwise infer a smaller width).
y_train = to_categorical(network_output, num_classes=n_vocab)


In [10]:
# Inspect the first training window and its one-hot label.
print(x_train[0])
print(y_train[0])


[[0.99401198]
 [0.99301397]
 [0.9510978 ]
 [0.98802395]
 [0.95908184]
 [0.98702595]
 [0.99401198]
 [0.99301397]
 [0.99401198]
 [0.99301397]
 [0.94211577]
 [0.99401198]
 [0.94111776]
 [0.97904192]
 [0.97804391]
 [0.92814371]
 [0.92115768]
 [0.92015968]
 [0.92115768]
 [0.92015968]
 [0.95808383]
 [0.95708583]
 [0.92115768]
 [0.92115768]
 [0.92015968]
 [0.92115768]
 [0.92015968]
 [0.95808383]
 [0.95708583]
 [0.97904192]
 [0.97205589]
 [0.95708583]
 [0.99401198]
 [0.98802395]
 [0.95708583]
 [0.91417166]
 [0.92215569]
 [0.95708583]
 [0.95209581]
 [0.95708583]
 [0.9510978 ]
 [0.9510978 ]
 [0.9500998 ]
 [0.97904192]
 [0.97904192]
 [0.9500998 ]
 [0.99401198]
 [0.99301397]
 [0.95209581]
 [0.92914172]
 [0.99301397]
 [0.99401198]
 [0.99301397]
 [0.94311377]
 [0.94111776]
 [0.92215569]
 [0.99401198]
 [0.94111776]
 [0.95808383]
 [0.94211577]
 [0.94111776]
 [0.9510978 ]
 [0.95808383]
 [0.94111776]
 [0.99401198]
 [0.92115768]
 [0.92015968]
 [0.93512974]
 [0.97804391]
 [0.92115768]
 [0.92015968]
 [0.99

In [11]:
# Same preparation as the earlier x_train/y_train cell, kept consistent with
# it: the window length comes from sequence_size instead of a literal 100.
n_patterns = len(network_input)
x_train = np.reshape(
    network_input, (n_patterns, sequence_size, 1)) / float(n_vocab)
# Pin the one-hot width to n_vocab so it always matches the Dense(n_vocab)
# output layer, even when the highest code never occurs as a label.
y_train = to_categorical(network_output, num_classes=n_vocab)


In [12]:
# Check the array shapes: inputs first, then one-hot labels.
print(x_train.shape)
print(y_train.shape)


(27769, 100, 1)
(27769, 1002)


## Creation du reseau et entrainement

- créer notre réseau récurrent
    - une ou plusieurs couches de LSTM en entrée, suivies d'un perceptron pour prédire la note suivante

In [13]:
# Network: one LSTM layer reading the window, followed by a small dense
# classifier that outputs a softmax over the n_vocab tokens.
model = Sequential([
    # Per-sample input shape is taken from x_train (all axes but the first).
    LSTM(512, input_shape=x_train.shape[1:], recurrent_dropout=.3),
    BatchNorm(),
    Dropout(.3),
    Dense(256),
    Activation('relu'),
    BatchNorm(),
    Dense(n_vocab),
    Activation('softmax'),
])

# Labels are one-hot vectors, hence the categorical cross-entropy loss.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')


2022-01-07 13:16:26.491886: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-01-07 13:16:26.506558: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-07 13:16:26.533333: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [14]:
# NOTE(review): this value is neither printed nor assigned, and it is not the
# last expression of the cell, so it does not appear in the cell output.
len(model.weights)
# Print the layer-by-layer description of the model.
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 512)               1052672   
_________________________________________________________________
batch_normalization (BatchNo (None, 512)               2048      
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               131328    
_________________________________________________________________
activation (Activation)      (None, 256)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
dense_1 (Dense)              (None, 1002)              2

- entrainer le réseau : attention, cela peut prendre du temps !

In [18]:
%%time
# Train for 5 passes over the data, in mini-batches of 128 windows.
model.fit(x_train, y_train, epochs=5, batch_size=128)


Epoch 1/5
Epoch 2/5

### Prediction

- on tire une sequence au hasard pour initialiser notre prediction
    - on aurait aussi pu lui proposer une ou plusieurs notes aléatoires
- on le laisse compléter note par note

In [None]:
# Pick a random training window as the starting point for the prediction.
# np.random.randint excludes its upper bound, so len(network_input) is passed
# as-is to allow every window, including the last one, to be drawn.
start = np.random.randint(0, len(network_input))
# Work on a copy: appending to network_input[start] itself would lengthen
# that training window in place and make network_input ragged.
current_input = list(network_input[start])
prediction_output = []

n = 250  # number of tokens to generate
for note_index in range(n):
    prediction_input = np.reshape(current_input, (1, len(current_input), 1))

    # Predict the next token, scaling the input the same way as x_train.
    prediction_input = prediction_input / float(n_vocab)
    prediction = model.predict(prediction_input, verbose=0)
    pred = np.argmax(prediction)

    # Map the predicted code back to its note/chord token.
    pred_note = enc.inverse_transform([[pred]])[0]
    prediction_output.append(pred_note[0])

    # Slide the window: drop the oldest code and append the predicted one.
    current_input = current_input[1:] + [pred]


- on peut convertir la sortie au format midi pour l'écouter

In [None]:
def create_midi(prediction_output, output_path='output.mid', step=0.5):
    """Turn predicted tokens into notes/chords and write them to a MIDI file.

    Args:
        prediction_output: Iterable of string tokens. A token that contains
            '.' or consists only of digits is treated as a chord: each
            '.'-separated part is converted to int and passed to note.Note.
            Any other token is passed to note.Note unchanged.
        output_path: File the MIDI data is written to. Defaults to
            'output.mid'.
        step: Offset increment between two consecutive tokens, so that
            they do not stack at the same position. Defaults to 0.5.
    """
    offset = 0
    output_notes = []

    for pattern in prediction_output:
        if ('.' in pattern) or pattern.isdigit():
            # Chord token: build one Note per component, then group them.
            chord_notes = []
            for current_note in pattern.split('.'):
                new_note = note.Note(int(current_note))
                new_note.storedInstrument = instrument.Piano()
                chord_notes.append(new_note)
            new_chord = chord.Chord(chord_notes)
            new_chord.offset = offset
            output_notes.append(new_chord)
        else:
            # Single-note token.
            new_note = note.Note(pattern)
            new_note.offset = offset
            new_note.storedInstrument = instrument.Piano()
            output_notes.append(new_note)

        # Advance the offset so the next element does not stack on this one.
        offset += step

    midi_stream = stream.Stream(output_notes)

    midi_stream.write('midi', fp=output_path)


# Write the generated sequence to a MIDI file.
create_midi(prediction_output)


In [None]:
# Show the generated tokens.
print(prediction_output)


['A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5

## Ça ne marche pas ?
- augmentez la capacité du réseau
    - dans la littérature, jusqu'à 3 couches de LSTM
- entraînez plus longtemps
    - dans la littérature, 200 époques
    - vous devrez probablement le laisser tourner la nuit pour obtenir un résultat
- vous pouvez écouter des résultats de réseaux similaires
    - https://www.rileynwong.com/blog/2019/2/25/generating-music-with-an-lstm-neural-network
    - https://becominghuman.ai/generating-music-using-lstm-neural-network-545f3ac57552