# Data Preparation

In [33]:
import tensorflow as tf
import os
import csv
import ast

# Preprocess images
def preprocess_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_png(img, channels=1)
    img = tf.image.resize(img, (528, 528))  # Resize the image
    img = tf.cast(img, tf.float32) / 127.5 - 1.0  # Normalize to [-1, 1] range
    img = tf.image.grayscale_to_rgb(img)  # Convert grayscale to RGB by duplicating the channel
    return img

# Define the function to encode pitch and duration into a single integer
def encode_labels(pitch, duration, num_durations):
    return pitch * num_durations + duration

# Generate datasets
def generate_datasets(image_folder, label_file, batch_size=32, num_durations=5):
    image_paths = []
    encoded_labels = []

    with open(label_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip the header
        for row in reader:
            filename = row[0]
            labels = ast.literal_eval(row[1])  # Parse the string to get the list of tuples

            # Encode each (pitch, duration) pair into a single integer
            labels_encoded = [encode_labels(p, d, num_durations) for p, d in labels]

            # Add to lists
            image_paths.append(os.path.join(image_folder, filename))
            encoded_labels.append(labels_encoded)

    # Create TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, encoded_labels))

    # Apply preprocessing to images and pack image and label together
    dataset = dataset.map(lambda x, y: ((preprocess_image(x), y), y))  # Pack image and label for model input

    # Batching and prefetching
    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset


In [34]:
# Generate the dataset
pitch_dataset = generate_datasets('../raw_data/sheet_images', '../raw_data/labels.csv', batch_size=32)

# Model Architecture

In [35]:
from tensorflow.keras.applications import EfficientNetB6
from tensorflow.keras.layers import Reshape, Input
from tensorflow.keras.models import Model
import tensorflow as tf

# CNN model (feature extractor)
def get_cnn_model():
    base_model = EfficientNetB6(
        input_shape=(528, 528, 3),
        include_top=False,
        weights="imagenet"
    )
    base_model.trainable = False  # Freeze the model to use it as a feature extractor

    # Reshape the output to prepare it for the Transformer Encoder
    base_model_out = base_model.output
    base_model_out = Reshape((-1, base_model_out.shape[-1]))(base_model_out)

    cnn_model = Model(base_model.input, base_model_out)
    return cnn_model

In [36]:
from tensorflow.keras.layers import Layer, MultiHeadAttention, Dense, LayerNormalization, Embedding, Dropout
import tensorflow as tf

class TransformerEncoderBlock(Layer):
    def __init__(self, embed_dim, dense_dim, num_heads):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = tf.keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()

    def call(self, inputs):
        attention_output = self.attention(inputs, inputs)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

# Transformer Decoder Block
class TransformerDecoderBlock(Layer):
    def __init__(self, embed_dim, ff_dim, num_heads, vocab_size):
        super().__init__()
        self.attention_1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()
        self.embedding = Embedding(vocab_size, embed_dim)
        self.dropout_1 = Dropout(0.3)
        self.dropout_2 = Dropout(0.5)
        self.out = Dense(vocab_size, activation="softmax")

    def call(self, inputs, encoder_outputs, training, mask=None):
        inputs = self.embedding(inputs)
        attention_output_1 = self.attention_1(query=inputs, value=inputs, key=inputs)
        out_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(query=out_1, value=encoder_outputs, key=encoder_outputs)
        out_2 = self.layernorm_2(out_1 + attention_output_2)
        ffn_output = self.ffn(out_2)
        ffn_output = self.dropout_1(ffn_output, training=training)
        return self.out(ffn_output)

# Music Generation Model
class MusicGenerationModel(tf.keras.Model):
    def __init__(self, cnn_model):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = TransformerEncoderBlock(embed_dim=128, dense_dim=512, num_heads=8)
        self.decoder = TransformerDecoderBlock(embed_dim=128, ff_dim=512, num_heads=8, vocab_size=85)

    def call(self, inputs):
        image, target = inputs  # Unpack the inputs
        cnn_features = self.cnn_model(image)
        encoded_features = self.encoder(cnn_features)
        output = self.decoder(target, encoded_features)
        return output


In [37]:
cnn_model = get_cnn_model()
music_model = MusicGenerationModel(cnn_model)

# Model Training

In [38]:
# Load the datasets
pitch_dataset, duration_dataset = generate_datasets('../raw_data/sheet_images', '../raw_data/labels.csv')

# Define the model architecture (same as before)
pitch_model = MusicGenerationModel(cnn_model)
duration_model = MusicGenerationModel(cnn_model)

# Compile the models
pitch_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
duration_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Train the models separately
pitch_model.fit(pitch_dataset, epochs=25, validation_data=None)  # Add validation data if available
duration_model.fit(duration_dataset, epochs=25, validation_data=None)  # Add validation data if available


ValueError: too many values to unpack (expected 2)

In [39]:
# Compile the model
music_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
music_model.fit(pitch_dataset, epochs=25, validation_data=None)  # Add validation data if available


Epoch 1/25


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Exception encountered when calling TransformerEncoderBlock.call().

[1mDimensions must be equal, but are 2304 and 128 for '{{node transformer_encoder_block_9_1/add_1}} = AddV2[T=DT_FLOAT](transformer_encoder_block_9_1/layer_normalization_48_1/add_2, transformer_encoder_block_9_1/sequential_19_1/dense_48_1/Add)' with input shapes: [?,289,2304], [?,289,128].[0m

Arguments received by TransformerEncoderBlock.call():
  • inputs=tf.Tensor(shape=(None, 289, 2304), dtype=float32)''


ValueError: Exception encountered when calling TransformerEncoderBlock.call().

[1mDimensions must be equal, but are 2304 and 128 for '{{node music_generation_model_9_1/transformer_encoder_block_9_1/add_1}} = AddV2[T=DT_FLOAT](music_generation_model_9_1/transformer_encoder_block_9_1/layer_normalization_48_1/add_2, music_generation_model_9_1/transformer_encoder_block_9_1/sequential_19_1/dense_48_1/Add)' with input shapes: [?,289,2304], [?,289,128].[0m

Arguments received by TransformerEncoderBlock.call():
  • inputs=tf.Tensor(shape=(None, 289, 2304), dtype=float32)

# Extract Features from CNN 

In [None]:
pitch_feature_extractor = tf.keras.Model(inputs=pitch_model.input, outputs=pitch_model.cnn_model.output)
duration_feature_extractor = tf.keras.Model(inputs=duration_model.input, outputs=duration_model.cnn_model.output)

# Concatenate the outputs along the last dimension
def combine_features(pitch_features, duration_features):
    combined_features = tf.concat([pitch_features, duration_features], axis=-1)
    return combined_features

# Assuming `transformer_encoder` is your Transformer encoder model
combined_features = combine_features(pitch_feature_extractor(image_input), duration_feature_extractor(image_input))
encoded_output = transformer_encoder(combined_features)


In [None]:
# Attempt to integrate the entire model

class CompleteMusicGenerationModel(tf.keras.Model):
    def __init__(self, pitch_feature_extractor, duration_feature_extractor, transformer_encoder, transformer_decoder):
        super().__init__()
        self.pitch_feature_extractor = pitch_feature_extractor
        self.duration_feature_extractor = duration_feature_extractor
        self.transformer_encoder = transformer_encoder
        self.transformer_decoder = transformer_decoder

    def call(self, image, target_sequence):
        pitch_features = self.pitch_feature_extractor(image)
        duration_features = self.duration_feature_extractor(image)
        combined_features = combine_features(pitch_features, duration_features)
        encoded_output = self.transformer_encoder(combined_features)
        decoded_output = self.transformer_decoder(target_sequence, encoded_output)
        return decoded_output

# Initialize and compile the complete model
complete_model = CompleteMusicGenerationModel(
    pitch_feature_extractor=pitch_feature_extractor,
    duration_feature_extractor=duration_feature_extractor,
    transformer_encoder=transformer_encoder,
    transformer_decoder=transformer_decoder
)

complete_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


In [None]:
# Assuming `train_dataset` has both image inputs and target sequences
complete_model.fit(train_dataset, epochs=25, validation_data=validation_dataset)


# Convert to MusicXML

In [None]:
from music21 import stream, note

def predict_sequence(image):
    sequence = model.predict(image)
    return decode_sequence(sequence)  # Implement a function to decode sequences

def convert_to_musicxml(sequence):
    s = stream.Stream()
    for sym in sequence:
        if sym == 'C4 Quarter':
            n = note.Note('C4', quarterLength=1.0)
            s.append(n)
        # Handle other symbols similarly
    s.write('musicxml', fp='output.xml')

# Inference
new_image = load_image('path_to_image')  # Load a new image
predicted_sequence = predict_sequence(new_image)
convert_to_musicxml(predicted_sequence)
