# Data Preparation

In [81]:
import tensorflow as tf
import os
import csv
import ast

# Preprocess images
def preprocess_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_png(img, channels=1)
    img = tf.image.resize(img, (528, 528))  # Resize the image
    img = tf.cast(img, tf.float32) / 127.5 - 1.0  # Normalize to [-1, 1] range
    img = tf.image.grayscale_to_rgb(img)  # Convert grayscale to RGB by duplicating the channel
    return img

# Define the function to encode pitch and duration into a single integer
def encode_labels(pitch, duration, num_durations=5):
    return pitch * num_durations + (duration - 17)


# Generate datasets
def generate_datasets(image_folder, label_file, batch_size=32):
    image_paths = []
    encoded_labels = []

    with open(label_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip the header
        for row in reader:
            filename = row[0]
            labels = ast.literal_eval(row[1])  # Parse the string to get the list of tuples

            # Encode each (pitch, duration) pair into a single integer
            labels_encoded = [encode_labels(p, d - 17) for p, d in labels]

            # Add to lists
            image_paths.append(os.path.join(image_folder, filename))
            encoded_labels.append(labels_encoded)

    # Create TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, encoded_labels))

    # Apply preprocessing to images and pack image and label together
    dataset = dataset.map(lambda x, y: ((preprocess_image(x), y), y))

    # Batching and prefetching
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset


In [48]:
# Generate the dataset
pitch_dataset = generate_datasets('../raw_data/sheet_images', '../raw_data/labels.csv', batch_size=32)

# Model Architecture

In [49]:
from tensorflow.keras.applications import EfficientNetB6
from tensorflow.keras.layers import Reshape, Input
from tensorflow.keras.models import Model
import tensorflow as tf

# CNN model (feature extractor)
def get_cnn_model():
    base_model = EfficientNetB6(
        input_shape=(528, 528, 3),
        include_top=False,
        weights="imagenet"
    )
    base_model.trainable = False  # Freeze the model to use it as a feature extractor

    # Reshape the output to prepare it for the Transformer Encoder
    base_model_out = base_model.output
    base_model_out = Reshape((-1, base_model_out.shape[-1]))(base_model_out)

    cnn_model = Model(base_model.input, base_model_out)
    return cnn_model

In [82]:
from tensorflow.keras.layers import Layer, MultiHeadAttention, Dense, LayerNormalization, Embedding, Dropout
import tensorflow as tf

class TransformerEncoderBlock(Layer):
    def __init__(self, embed_dim=2304, dense_dim=512, num_heads=8):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()

    def call(self, inputs, training=False):
        attention_output = self.attention(inputs, inputs)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

class TransformerDecoderBlock(Layer):
    def __init__(self, embed_dim=2304, ff_dim=512, num_heads=8, vocab_size=85):
        super().__init__()
        self.attention_1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()
        self.embedding = Embedding(vocab_size, embed_dim)
        self.dropout_1 = Dropout(0.3)
        self.dropout_2 = Dropout(0.5)
        self.out = Dense(vocab_size, activation="softmax")

    def call(self, inputs, encoder_outputs, training=False):
        inputs = self.embedding(inputs)
        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs)
        out_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(query=out_1, value=encoder_outputs, key=encoder_outputs)
        out_2 = self.layernorm_2(out_1 + attention_output_2)
        ffn_output = self.ffn(out_2)
        ffn_output = self.dropout_1(ffn_output, training=training)
        return self.out(ffn_output)

class MusicGenerationModel(tf.keras.Model):
    def __init__(self, cnn_model):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = TransformerEncoderBlock(embed_dim=2304, dense_dim=512, num_heads=8)
        self.decoder = TransformerDecoderBlock(embed_dim=2304, ff_dim=512, num_heads=8, vocab_size=85)

    def call(self, inputs, training=False):
        image, target = inputs
        cnn_features = self.cnn_model(image, training=training)
        encoded_features = self.encoder(cnn_features, training=training)
        output = self.decoder(target, encoded_features, training=training)
        return output


In [83]:
cnn_model = get_cnn_model()
music_model = MusicGenerationModel(cnn_model)

# Model Training

In [84]:
# Load the dataset
pitch_dataset = generate_datasets('../raw_data/sheet_images', '../raw_data/labels.csv')

# Define the CNN model
cnn_model = get_cnn_model()

# Define the MusicGenerationModel
music_model = MusicGenerationModel(cnn_model)

# Compile the model
music_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
music_model.fit(pitch_dataset, epochs=25, validation_data=None)  # Add validation data if available


Epoch 1/25


2024-08-15 21:49:03.431472: W tensorflow/core/framework/op_kernel.cc:1840] OP_REQUIRES failed at sparse_xent_op.cc:103 : INVALID_ARGUMENT: Received a label value of -17 which is outside the valid range of [0, 85).  Label values: 56 5 59 -3 42 -10 -16 32 -13 11 50 -3 -1 66 32 23 -2 17 -6 7 -10 22 12 -17 -11 66 6 60 35 25 55 -8 11 47 35 25 59 -13 -4 41 54 51 67 8 17 -14 66 15 -4 2 46 1 62 32 10 -15 11 65 54 27 -3 60 66 0 41 -9 7 2 51 36 6 65 5 27 40 52 42 22 65 60 -13 67 -10 -3 47 39 9 -14 36 -9 16 -6 -13 -9 -6 -9 11 22 1 44 40 31 9 61 26 46 27 37 20 30 16 51 45 51 66 16 -8 17 20 15 10 59 25 7 -1 20 26 12 61 35 62 -16 -3 -12 67 45 6 59 12 44 57 27 35 56 -2 44 12 -14 27 -4 24 -5 10 24 39 63 60 52 10 51 22 27 32 -8 22 34 32 -15 26 17 27 29 25 6 22 -7 42 56 4 52 1 56 21 62 -13 -8 19 51 57 22 -3 26 29 -4 56 18 -4 57 24 61 46 -8 50 66 21 65 32 6 42 -13 67 27 6 21 44 62 -7 49 66 -3 62 15 46 15 39 56 -13 62 65 5 35 56 49 -8 66 60 37 54 47 0 26 57 8 -3 -14 49 -10 32 -14 -4 50 22 36 -15 17 27 29 

InvalidArgumentError: Graph execution error:

Detected at node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits defined at (most recent call last):
  File "/Users/ninjamac/.pyenv/versions/3.10.6/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/Users/ninjamac/.pyenv/versions/3.10.6/lib/python3.10/runpy.py", line 86, in _run_code

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/Users/ninjamac/.pyenv/versions/3.10.6/lib/python3.10/asyncio/base_events.py", line 600, in run_forever

  File "/Users/ninjamac/.pyenv/versions/3.10.6/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once

  File "/Users/ninjamac/.pyenv/versions/3.10.6/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/bh/93g2q2fx3654lfb40kf_tyr40000gn/T/ipykernel_19152/1454241817.py", line 14, in <module>

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 318, in fit

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 54, in train_step

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/trainers/trainer.py", line 357, in _compute_loss

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/trainers/trainer.py", line 325, in compute_loss

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/trainers/compile_utils.py", line 609, in __call__

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/trainers/compile_utils.py", line 645, in call

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/losses/loss.py", line 43, in __call__

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/losses/losses.py", line 27, in call

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/losses/losses.py", line 1853, in sparse_categorical_crossentropy

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/ops/nn.py", line 1567, in sparse_categorical_crossentropy

  File "/Users/ninjamac/.pyenv/versions/3.10.6/envs/Consonance/lib/python3.10/site-packages/keras/src/backend/tensorflow/nn.py", line 645, in sparse_categorical_crossentropy

Received a label value of -17 which is outside the valid range of [0, 85).  Label values: 56 5 59 -3 42 -10 -16 32 -13 11 50 -3 -1 66 32 23 -2 17 -6 7 -10 22 12 -17 -11 66 6 60 35 25 55 -8 11 47 35 25 59 -13 -4 41 54 51 67 8 17 -14 66 15 -4 2 46 1 62 32 10 -15 11 65 54 27 -3 60 66 0 41 -9 7 2 51 36 6 65 5 27 40 52 42 22 65 60 -13 67 -10 -3 47 39 9 -14 36 -9 16 -6 -13 -9 -6 -9 11 22 1 44 40 31 9 61 26 46 27 37 20 30 16 51 45 51 66 16 -8 17 20 15 10 59 25 7 -1 20 26 12 61 35 62 -16 -3 -12 67 45 6 59 12 44 57 27 35 56 -2 44 12 -14 27 -4 24 -5 10 24 39 63 60 52 10 51 22 27 32 -8 22 34 32 -15 26 17 27 29 25 6 22 -7 42 56 4 52 1 56 21 62 -13 -8 19 51 57 22 -3 26 29 -4 56 18 -4 57 24 61 46 -8 50 66 21 65 32 6 42 -13 67 27 6 21 44 62 -7 49 66 -3 62 15 46 15 39 56 -13 62 65 5 35 56 49 -8 66 60 37 54 47 0 26 57 8 -3 -14 49 -10 32 -14 -4 50 22 36 -15 17 27 29 41 55 37 16 44 17 65 -13 1 -16 7 21 66 27 41 -10 21 -5 21 26 48 20 -5 42 6 40 52 34 30 40 -5 25 1 31 65 -15 7 20 2 6 55 60 -11 -5 25 14 51 24 51 16 32 27 19 30 0 40 -9 44 62 37 -15 29 60 -7 67 6 59 6 1 42 -3 -1 60 36 -3 52 -1 -3 -8 40 47 4 35 61 -14 67 15 9 7 62 52 -7 5 47 -14 -4 7 -9 6 12 2 6 -10 0 22 35 57 4 16 11 52 27 66 3 60 -1 -5 26 -8 15 -3 -1 49 56 -5 41 -9 55 51 31 26 12 1 37 50 47 25 35 26 12 58 -6 27 20 32 -14 7 9 50 -3 27 17 51 61 5 42 20 1 57 -8 5 30 15 -9 37 30 42 -16 62 42 32 58 20 17 26 54 22 31 2 31 17 54 37 -13 67 30 9 27 52 62 54 -8 17 0 4 47 -3 51 45 -13 65 10 31 12 19 35 5 24 5 61 51 -14 -3 1 9 41 -8 15 1 6 60 47 42 22 42 64 31 5 12 62 -8 66 55 41 -9 45 -3 43 60 -3 64 -4 12 17 25 -6 16 27 38 26 37 25 60 17 40 10 24 50 64 1 67 50 17 23 51 64 45 22 67 42 67 5 -5 62 26 -9 57 66 -14 -3 25 65 51 62 13 -4 60 25 46 27 7 17 27 33 25 42 -13 29 11 -15 60 24 1 40 -11 -13 -8 44 27 31 51 6 62 41 -9 45 21 37 -5 42 -15 55 35 31 22 62 -15 47 64 27 -13 52 56 35 57 42 30 67 42 -13 7 51 -16 59 67 55 -13 -8 37 -5 64 26 21 -12 52 -1 10 36 -3 60 41 16 -10 -4 42 2 4 11 60 -14 13 3 64 25 5 66 37 55 -3 11 5 11 -17 64 49 17 30 66 57 -10 36 51 -13 34 25 16 12 24 12 51 26 36 67 -10 4 -3 7 61 14 67 61 20 -13 38 9 36 -3 16 7 17 -13 3 -2 54 30 -10 16 19 66 -15 7 39 17 30 -14 31 -11 1 37 41 -8 -13 50 46 -14 25 66 -8 -3 -13 26 57 -8 21 -6 37 24 61 41 1 52 47 57 6 60 49 62 0 -3 15 37 51 6 32 10 -1 56 12 27 44 2 27 36 25 -14 30 45 55 66 -14 16 37 67 14 60 65 13 48 22 61 19 15 37 43 15 39 7 37 66 56 66 10 21 11 16 27 47 2 24 41 52 55 -17 33 0 31 9 32 42 21 -10 52 27 21 17 -15 47 1 -14 25 16 -14 65 12 31 16 25 47 55 42 12 60 67 -15 16 -9 21 67 -9 7 22 -5 59 32 66 47 9 5 -6 17 -15 7 -14 20 10 6 22 2 65 2 65 24 62 42 22 57 45 34 6 42 16 66 47 19 57 22 -3 35 -4 55 20 41 57 35 0 17 11 32 17 11 -6 12 61 7 2 27 7 -13 46 29 16 22 -8 57 7 34 56 35 16 26 7 37 -11 55 14 46 -3 36 7 1 -11 42 50 65 -6 41 -3 -13 25 52 -13 15 36 52 17 46 36 51 9 1 46 -1 57 0 36 62 50 67 56 34 17 25 5 30 61 31 -16 0 35 66 -6 52 35 22 -5 20 -4 62 32 10 16 24 55 -8 37 -8 11 2 39 61 37 42 -16 30 20 33 13 -12 67 19 40 57 11 -1 32 26 -13 66 51 9 41 10 52 -3 51 -3 26 35 15 52 61 20 1 59 1 22 0 60 20 11 32 1 25 21 47 7 11 36 57 37 9 45 27 7 -8 -3 61 19 -15 1 4 40 35 29 -11 7 44 -10 42 -8 12 67 -4 1 50 -13 20 6 26 31 7 64 -8 17 12 -9 61 19 32 -8 67 -8 14 2 14 52 6 -15 13 53 35 45 -9 26 11 1 38 0 9 40 -11 14 25 0 64 46 29 46 -10 13 59 41 12 0 -13 30 21 37 1 22 67 6 46 2 35 7 67 26 21 50 17 22 52 57 -11 7 11 6 12 -17 44 56 10 1 11 7 25 30 37 31 -14 39 22 56 -3 11 21 67 9 61 11 -13 -9 48 39 17 41 20 47 7 65 60 41 52 67 -9 42 0 56 34 27 -8 29 16 7 21 56 12 61 66 54 62 42 67 52 41 52 10 14 57 51 29 -5 -14 -8 41 49 55 2 39 19 -14 45 59 56 0 -6 12 16 27 36 17 41 19 46 67 23 33 57 49 32 67 41 3 -13 -10 -3 52 0 12 55 -2 48 9 44 51 19 11 -4 47 -3 51 -16 47 56 47 6 -14 -4 17 30 45 22 56 18 -4 31 57 26 16 -10 62 2 52 60 1 49 41 67 60 -13 32 7 10 50 14 20 27 11 27 45 39 16 -4 66 35 14 -13 32 48 20 44 21 41 26 19 55 66 51 34 26 21 16 23 42 1 34 -3 15 45 -11 31 2 7 17 -15 10 32 65 62 27 -4 29 15 -3 -8 54 24 13 24 56 20 -14 19 25 61 51 22 -5 64 42 -8 57 7 54 45 16 -13 19 -16 46 51 25 10 25 14 -1 -16 2 -9 66 -14 37 0 16 20 27 31 52 37 -13 31 -15 47 -11 30 -9 -3 -10 26 52 -14 36 30 62 40 19 52 41 22 32 16 -16 10 2 39 -1 30 -15 34 -3 64 35 46 -13 48 0 -11 52 56 62 66 62 -9 50 16 26 32 7 57 60 42 50 27 42 47 55 48 15 36 30 -4 55 18 10 67 37 -10 26 51 -14 1 61 19 35 53 31 51 27 1 47 15 5 -7 8 -14 37 24 17 31 27 57 37 50 5 -4 12 -14 7 47 -15 6 16 -13 0 -14 17 -12 50 35 30 12 46 -13 44 30 67 -8 6 25 11 62 -13 39 49 36 45 32 2 31 7 15 19 12 23 -12 33 66 29 57 1 47 1 37 17 44 22 57 45 52 -16 17 60 19 -15 65 -10 51 7 47 30 -14 51 -3 9 41 -3 -14 41 21 17 39 65 32 67 30 -5 20 52 62 12 59 2 -14 26 21 67 26 9 27 55 2 17 44 2 21 47 57 17 37 67 62 41 36 -8 59 32 59 26 42 66 61 -13 0 -11 36 57 -8 65 32 4 61 67 20 24 10 -7 27 -11 40 62 -14 29 -3 36 6 26 47 4 50 61 12 7 -14 57 -16 45 7 -16 9 -7 39 4 -5 47 -16 17 21 8 47 5 56 34 12 5 22 6 -15 37 57 16 57 66 4 -5 26 -9 36 11 2 34 -3 67 34 56 45 62 -12 28 50 67 39 -8 66 12 56 39 27 52 66 12 41 64 63 55 46 51 10 2 21 62 5 45 29 64 60 52 -9 7 0 -5 -11 54 62 30 47 56 21 -6 26 6 16 -15 55 44 56 -1 66 21 36 63 24 36 56 -5 20 -10 31 2 62 6 27 2 51 14 31 50 37 50 67 -14 25 16 62 57 49 39 60 -16 -9 7 27 50 -14 56 -13 10 61 37 14 50 20 30 22 61 27 41 -8 51 41 67 10 49 45 61 25 61 17 65 16 -3 50 -4 64 7 12 15 0 14 35 -8 64 16 32 18 -4 26 -5 30 65 19 65 -5 -1 65 0 16 27 62 19 32 17 -9 29 14 39 64 3 17 -9 42 34 -5 28 57 36 -9 -13 15 31 20 -7 52 27 -3 -8 51 25 -5 51 -3 50 12 50 62 2 6 52 36 1 64 51 17 11 17 58 18 54 22 7 -4 67 11 42 63 29 52 41 -4 61 67 -5 30 -13 36 26 66 52 16 39 61 -4 32 42 31 27 2 -9 56 67 31 11 17 31 -9 19 5 51 1 21 46 -11 42 27 31 45 67 -4 0 16 37 23 28 -8 32 7 12 -8 4 12 52 7 39 19 8 -8 39 31 55 -3 52 6 35 24 12 -10 -15 2 -9 11 -8 41 10 -16 56 12 2 50 32 10 56 -9 21 32 -7 58 54 22 -14 26 -4 12 -2 31 20 -11 62 67 52 -5 15 66 11 42 52 67 25 21 5 -10 21 60 -16 6 59 -5 16 10 -3 25 55 41 17 63 41 -13 37 25 -6 50 67 54 62 31 11 -3 39 10 2 12 25 -13 56 -10 -13 11 52 41 35 52 39 12 -12 0 25 37 47 20 -4 66 6 16 -3 2 16 -13 20 67 59 21 -3 36 -14 22 -1 36 51 26 66 -6 34 61 2 22 64 -10 49 21 61 -14 32 52 40 19 31 -3 22 44 -15 47 56 17 64 41 61 16 57 17 11 20 54 2 7 19 -8 25 16 -13 14 -3 17 -13 1 21 -13 44 29 55 47 41 -10 47 60 39 55 65 45 60 27 -15 -3 41 -9 16 0 59 -16 9 -2 -5 41 -4 21 57 0 32 63 42 67 -4 26 36 -8 20 -10 -13 55 -13 32 -8 -16 24 -12 46 39 45 26 66 36 -6 25 45 59 32 61 7 2 35 2 32 26 5 -14 47 64 36 20 -13 62 6 1 27 42 16 12 50 -4 57 29 21 35 12 42 35 30 20 17 2 -13 42 37 4 17 -13 56 -8 17 2 9 45 52 6 12 -7 2 20 50 36 30 37 48 -5 2 40 36 52 7 36 12 -2 -6 32 16 25 57 12 19 12 35 46 19 41 -5 62 57 45 62 54 47 26 -16 51 -8 45 32 -12 63 61 30 66 0 6 36 52 0 -16 -3 31 -2 51 67 31 44 17 12 -13 36 15 7 40 67 27 21 2 19 -11 58 30 16 56 34 40 12 63 39 34 54 15 25 46 -9 0 62 7 26 60 -7 34 -15 67 46 32 43 42 49 61 -9 57 42 17 33 13 35 49 41 47 -8 4 64 44 34 50 64 -5 24 19 43 55 67 30 47 31 56 -4 -2 -10 27 31 54 17 60 67 36 16 6 36 12 2 62 3 61 26 0 55 10 34 29 21 65 -5 52 -8 30 35 19 17 36 7 45 40 2 -14 51 11 2 6 16 47 -3 -14 44 61 4 56 1 37 7 -15 37 2 42 -11 22 36 21 30 2 25 37 51 6 42 -9 40 36 2 -13 41 -3 46 19 -3 61 32 12 2 -6 -1 16 62 52 29 7 41 27 52 44 -15 42 7 47 -15 19 35 42 47 55 -10 56 6 34 52 26 66 57 51 6 52 47 64 7 47 57 -13 46 49 25 21 24 -4 30 58 37 65 7 -10 25 41 22 12 21 -1 41 1 9 29 23 15 56 49 56 34 25 45 18 -2 35 60 46 -10 66 -10 15 62 17 51 45 35 52 61 35 30 12 7 42 64 56 -5 58 10 25 29 56 47 -5 47 57 30 67 26 33 66 11 30 34 48 -6 4 59 21 36 -9 12 47 0 67 30 25 16 42 36 -5 -9 29 20 62 42 67 4 42 13 61 14 30 -3 17 -12 8 40 36 32 15 26 6 12 15 66 7 17 -15 57 31 12 15 62 19 47 16 7 26 16 5 25 39 65 61 47 2 24 -14 0 -4 2 -4 -16 2 6 -13 22 65 42 17 24 36 24 47 41 -4 21 -8 64 7 -10 -4 -8 34 44 53 -1 -10 32 27 11 46 19 40 62 -13 54 -11 3 36 49 26 12 42 -14 6 -11 27 32 61 67 62 37 -4 66 52 65 32 41 31 52 25 20 16 57 12 50 44 2 5 1 47 33 9 -16 35 56 59 51 16 1 5 57 25 11 27 -5 16 0 36 16 62 52 -2 12 -15 -8 65 52 0 -3 27 -4 0 67 6 -5 46 -11 -15 67 56 7 8 20 40 24 7 31 -9 34 32 66 58 -5 50 64 -7 54 -11 -13 27 -1 57 -3 56 12 17 39 36 41 1 -3 -10 65 -13 10 20 51 41 -9 -5 41 27 66 40 35 40 57 64 -10 32 6 67 -3 29 15 67 42 57 66 -9 24 31 47 57 -3 31 40 9 22 43 25 4 -4 62 2 42 64 -14 -9 36 62 48 9 -15 20 -15 59 27 41 22 2 6 36 -5 66 6 11 -3 -17 65 10 0 60 36 27 10 66 30 42 52 32 38 7 -13 15 51 -14 31 -4 46 17 37 -3 7 17 66 36 -3 41 46 61 51 12 -9 -14 9 7 21 11 52 1 -15 27 51 46 25 -3 44 7 37 -3 41 56 4 41 12 -3 65 59 65 15 62 29 62 7 -13 22 -13 4 -15 6 42 50 27 22 -5 52 56 67 -3 36 2 55 0 22 57 26 -13 6 -9 45 51 56 2 -3 27 -13 10 -4 65 11 6 17 27 66 61 65 -5 -13 37 32 67 25 29 42 11 37 59 0 -9 62 -4 2 66 45 57 -5 -13 2 22 -8 54 37 60 -13 8 4 -16 34 -1 27 54 37 10 22 52 6 -6 22 46 31 27 20 42 0 -14 27 2 37 22 7 44 66 10 -4 14 -13 32 27 60 42 -5 47 65 37 41 -4 17 37 60 -1 41 -13 17 5 64 -3 22 32 17 30 26 61 40 35 44 26 45 31 6 65 10 21 7 -9 27 56 12 44 2 6 36 -14 66 7 64 47 27 2 7 17 32 -5 6 50 66 47 55 47 28 66 6 67 10 -13 50 11 -13 11 36 0 55 46 62 5 52 14 67 6 -1 24 56 -6 -8 0 -13 23 60 -1 61 -4 57 49 -10 61 -13 54 11 1 25 -1 47 65 21 67 2 67 29 36 50 39 57 -15 56 7 1 55 51 30 50 40 59 20 -3 21 56 10 2 50 -13 57 50 -8 40 2 35 46 -2 29 61 17 55 2 58 1 29 -8 10 27 36 49 57 40 2 66 -10 24 66 -8 42 27 -5 16 37 21 0 54 49 15 52 55 11 32 60 12 31 61 26 6 10 17 1 45 20 49 4 52 21 50 2 57 -14 31 24 16 27 66 27 6 37 15 2 26 57 61 -13 34 -8 55 46 -11 51 15 3 10 -6 52 42 -3 -8 23 39 4 65 -13 39 57 47 7 39 54 47 -11 26 40 17 7 -11 -4 50 27 11 47 6 -13 51 -1 15 22 49 67 17 37 10 -5 42 2 35 66 28 48 66 17 60 -9 67 32 66 36 17 56 -3 10 7 17 47 61 -13 47 61 5 54 -4 62 12 56 -8 19 7 26 22 67 9 61 -15 -3 2 50 -15 -5 60 3 22 -9 34 50 42 13 65 -5 -11 11 29 -9 25 63 -10 19 66 -3 67 40 31 66 22 66 56 2 6 16 12 -14 26 42 20 61 2 -8 33 -4 -16 32 56 36 17 23 57 -10 22 42 19 -13 9 15 -8 26 67 1 6 21 44 7 25 17 11 -14 -1 52 -8 7 22 16 25 50 -15 11 66 -1 36 10 -13 24 61 7 2 -8 51 -15 20 31 0 -13 12 32 50 30 40 -9 2 32 24 21 56 -14 67 55 1 -9 24 60 45 60 67 16 12 18 61 30 36 56 51 36 21 64 -1 44 34 50 2 -4 7 45 -8 66 2 -17 6 35 39 -8 47 17 -10 59 31 22 11 -3 41 9 -8 1 39 4 15 -8 46 -15 5 67 -16 -10 47 16 -3 42 -13 52 30 55 17 66 47 17 -16 42 22 6 -3 1 -3 12 6 29 41 31 42 65 51 55 46 42 21 47 62 36 50 29 7 11 -13 56 -1 16 7 53 20 26 39 -9 67 51 37 15 66 7 51 2 41 64 52 56 2 47 52 3 0 21 40 35 6 -10 46 11 -11 8 41 26 20 40 -5 57 15 -5 32 -10 32 62 22 24 -10 47 61 27 7 50 55 37 50 62 26 1 67 32 25 62 46 51 -8 66 -15 17 -16 -3 22 67 32 10 22 -15 51 16 66 16 -4 56 -4 41 7 56 32 9 5 47 -9 -13 52 30 -6 27 17 57 16 10 -3 5 -15 62 -7 6 0 59 -8 67 12 34 10 6 37 21 36 65 52 61 15 27 -8 16 52 -5 17 66 11 16 -3 -15 -8 24 66 -8 -16 2 40 56 62 12 14 32 5 1 39 -15 51 -8 57 5 -9 16 11 52 35 47 25 15 1 31 -16 12 31 41 -8 45 17 54 21 2 8 38 66 -10 55 -4 12 42 26 47 -10 2 56 -6 49 24 43 -6 7 11 27 47 2 -4 48 56 66 52 61 -9 47 20 36 24 -16 27 64 20 7 62 -13 28 -8 60 22 2 -16 -3 35 41 -1 62 -8 57 25 40 -13 40 66 29 50 16 41 33 55 16 45 37 40 27 51 -10 10 -4 62 36 32 -4 41 62 37 -8 14 7 -13 39 66 5 37 -4 30 -3 31 62 20 12 57 13 32 9 35 62 -13 27 -8 62 42 32 44 2 61 37 -14 9 47 37 40 13 42 -5 22 12 50 42 60 28 18 -8 14 10 1 -8 44 5 -8 46 -16 32 48 46 0 32 -15 57 61 27 57 11 37 15 7 31 42 -3 1 57 7 11 66 46 30 20 -8 -13 5 45 -8 26 60 32 -5 15 51 -8 -5 42 -16 67 55 52 12 22 -6 11 66 32 46 12 16 47 61 55 60 56 37 40 26 -16 62 57 11 31 -1 42 -13 -3 52 0 49 5 -8 1 16 -15 32 -9 32 7 21 -7 63 60 -13 45 21 12 15 42 -5 35 61 26 51 67 61 52 1 50 -8 10 41 49 12 7 21 17 52 22 -13 -1 -5 25 4 27 -10 31 17 8 45 16 60 40 47 7 34 59 18 35 2 -10 50 36 47 42 30 9 -3 37 62 -9 55 26 -8 -12 55 67 -4 51 -15 16 37 31 14 -10 67 22 -12 6 22 -11 6 12 57 42 50 9 51 17 7 -11 25 37 32 66 21 -14 35 0 15 49 36 2 10 42 25 0 51 55 66 8 21 11 -14 50 27 55 52 -9 1 21 32 -10 -4 46 -8 -3 -16 5 62 6 12 -5 59 2 -9 29 14 61 17 44 20 -8 43 24 40 1 12 67 17 52 56 25 5 66 17 32 8 35 45 14 63 41 -11 47 -8 2 61 -8 66 -15 39 22 47 16 55 15 52 11 16 37 -5 29 -5 -13 52 36 50 -3 56 66 -13 31 45 9 5 62 -9 37 50 56 21 50 11 2 46 19 37 -8 47 20 -1 42 -3 -4 20 52 45 27 -3 6 27 51 31 60 57 45 -8 37 32 -1 15 12 21 27 35 -9 41 10 40 45 2 56 26 67 37 22 67 31 22 7 -1 -3 12 57 31 16 67 5 21 29 62 26 46 19 31 27 57 41 46 56 51 11 7 20 -3 -14 9 2 37 2 -3 66 40 1 57 46 22 17 56 -4 27 48 15 10 19 60 52 59 -13 27 12 7 2 60 4 31 36 61 14 11 36 16 -9 17 67 20 42 46 66 47 5 66 24 36 -15 24 22 31 57 32 5 11 35 7 36 6 2 15 -5 15 61 12 58 21 -15 15 40 12 2 6 57 45 41 -10 56 37 -7 65 57 41 9 2 17 47 54 20 42 32 47 4 29 39 -5 25 51 9 -8 57 17 11 67 18 61 27 2 24 32 -13 -3 52 20 42 66 21 11 36 62 46 14 -3 -15 -4 32 35 -10 67 -5 47 41 -3 59 36 45 32 59 40 47 61 52 -7 58 25 20 32 16 52 12 1 -3 66 -3 66 19 -4 -8 10 17 36 16 -10 12 42 52 67 56 26 16 -15 31 66 -3 40 20 37 11 45 -13 26 30 11 44 13 51 19 60 21 57 49 66 32 60 40 -16 15 62 -13 59 56 51 67 2 4 14 37 19 66 30 2 12 67 42 -16 56 42 -14 19 49 3 -11 31 -15 12 47 27 -16 47 16 52 56 -13 48 41 35 -14 44 22 -13 6 2 52 0 10 49 59 2 17 1 22 61 -8 5 -10 24 55 42 65 16 42 15 8 32 35 25 12 30 67 -13 -2 63 36 -6 10 21 53 52 5 52 7 -6 47 -17 7 45 51 -11 47 22 57 40 1 39 18 63 29 55 -4 21 62 51 -16 26 51 22 -16 0 67 6 67 26 -3 30 49 -8 35 59 35 49 1 45 -3 7 50 32 6 39 27 6 9 60 2 7 47 62 64 -3 55 22 50 20 27 61 26 41 22 -17 51 -4 35 60 -3 51 -13 22 -3 0 59 37 -3 61 46 39 45 -14 31 41 19 32 62 -15 16 -1 37 -8 -13 40 65 5 67 47 52 -11 17 -13 3 14 26 52 12 36 62 17 30 40 29 38 58 28 18 60 -1 5 56 59 -4 15 39 35 -9 12 37 61 64 36 61 -14 48 61 5 25 22 12 40 34 -4 46 -13 22 42 -13 36 7 44 10 -3 3 51 19 2 36 26 67 24 31 1 55 58 0 7 46 66 37 27 5 32 44 32 26 46 12 -8 12 31 64 42 37 52 1 37 32 26 39 60 12 48 21 10 41 29 37 44 1 22 36 67 37 28 27 56 35 17 11 45 42 7 9 -15 27 62 17 42 7 10 -5 7 12 66 -4 52 22 -13 47 61 -9 16 -9 1 17 57 12 39 20 37 26 47 23 45 34 52 36 47 -2 41 65 -12 33 -6 12 67 25 67 27 59 30 60 -6 9 -14 19 -14 -9 17 2 4 15 62 12 -9 35 25 32 40 61 2 13 -9 22 59 -14 -4 -8 37 30 41 -11 2 63 16 -10 65 5 42 32 -6 10 55 42 -1 17 10 17 37 52 -14 12 62 7 49 61 8 29 55 -8 51 57 21 11 50 14 37 -8 25 1 12 36 40 62 27 49 -15 12 67 7 1 49 -9 2 27 67 -8 62 22 -4 24 20 15 -1 22 1 7 30 16 -6 27 7 67 7 17 -14 -11 42 26 3 49 19 61 11 -1 46 1 -14 60 35 62 27 32 -13 17 42 7 44 16 56 1 22 67 41 44 5 2 43 56 7 36
	 [[{{node compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_one_step_on_iterator_408250]

In [66]:
cnn_model = get_cnn_model()
sample_image = np.random.random((1, 528, 528, 3))  # Create a dummy input
cnn_output = cnn_model(sample_image)
print("CNN Output Shape:", cnn_output.shape)


CNN Output Shape: (1, 289, 2304)


In [42]:
# Generate the dataset
pitch_dataset = generate_datasets('../raw_data/sheet_images', '../raw_data/labels.csv', batch_size=32)

# Compile the model
music_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
music_model.fit(pitch_dataset, epochs=25, validation_data=None)  # Add validation data if available


Epoch 1/25


ValueError: Exception encountered when calling TransformerEncoderBlock.call().

[1mDimensions must be equal, but are 2304 and 128 for '{{node music_generation_model_9_1/transformer_encoder_block_9_1/add_1}} = AddV2[T=DT_FLOAT](music_generation_model_9_1/transformer_encoder_block_9_1/layer_normalization_48_1/add_2, music_generation_model_9_1/transformer_encoder_block_9_1/sequential_19_1/dense_48_1/Add)' with input shapes: [?,289,2304], [?,289,128].[0m

Arguments received by TransformerEncoderBlock.call():
  • inputs=tf.Tensor(shape=(None, 289, 2304), dtype=float32)

# Extract Features from CNN 

In [None]:
pitch_feature_extractor = tf.keras.Model(inputs=pitch_model.input, outputs=pitch_model.cnn_model.output)
duration_feature_extractor = tf.keras.Model(inputs=duration_model.input, outputs=duration_model.cnn_model.output)

# Concatenate the outputs along the last dimension
def combine_features(pitch_features, duration_features):
    combined_features = tf.concat([pitch_features, duration_features], axis=-1)
    return combined_features

# Assuming `transformer_encoder` is your Transformer encoder model
combined_features = combine_features(pitch_feature_extractor(image_input), duration_feature_extractor(image_input))
encoded_output = transformer_encoder(combined_features)


In [None]:
# Attempt to integrate the entire model

class CompleteMusicGenerationModel(tf.keras.Model):
    def __init__(self, pitch_feature_extractor, duration_feature_extractor, transformer_encoder, transformer_decoder):
        super().__init__()
        self.pitch_feature_extractor = pitch_feature_extractor
        self.duration_feature_extractor = duration_feature_extractor
        self.transformer_encoder = transformer_encoder
        self.transformer_decoder = transformer_decoder

    def call(self, image, target_sequence):
        pitch_features = self.pitch_feature_extractor(image)
        duration_features = self.duration_feature_extractor(image)
        combined_features = combine_features(pitch_features, duration_features)
        encoded_output = self.transformer_encoder(combined_features)
        decoded_output = self.transformer_decoder(target_sequence, encoded_output)
        return decoded_output

# Initialize and compile the complete model
complete_model = CompleteMusicGenerationModel(
    pitch_feature_extractor=pitch_feature_extractor,
    duration_feature_extractor=duration_feature_extractor,
    transformer_encoder=transformer_encoder,
    transformer_decoder=transformer_decoder
)

complete_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


In [None]:
# Assuming `train_dataset` has both image inputs and target sequences
complete_model.fit(train_dataset, epochs=25, validation_data=validation_dataset)


# Convert to MusicXML

In [None]:
from music21 import stream, note

def predict_sequence(image):
    sequence = model.predict(image)
    return decode_sequence(sequence)  # Implement a function to decode sequences

def convert_to_musicxml(sequence):
    s = stream.Stream()
    for sym in sequence:
        if sym == 'C4 Quarter':
            n = note.Note('C4', quarterLength=1.0)
            s.append(n)
        # Handle other symbols similarly
    s.write('musicxml', fp='output.xml')

# Inference
new_image = load_image('path_to_image')  # Load a new image
predicted_sequence = predict_sequence(new_image)
convert_to_musicxml(predicted_sequence)
