In [2]:
! pip install miditok



In [11]:
from miditok import REMI, TokenizerConfig
from symusic import Score
from miditok.pytorch_data import DatasetMIDI, DataCollator
from torch.utils.data import DataLoader


# Build a multitrack REMI tokenizer; the MidiTok documentation lists every available parameter
config = TokenizerConfig(
    num_velocities=16,
    use_chords=True,
    use_programs=True,
)
tokenizer = REMI(config)

# Round trip: load a MIDI file, encode it into tokens, then decode the tokens back to a MIDI
midi = Score("ty_april.mid")
# Calling the tokenizer automatically detects whether it was given MIDIs, paths or tokens
tokens = tokenizer(midi)
# PyTorch, Tensorflow and Numpy tensors are supported
converted_back_midi = tokenizer(tokens)

In [9]:
# Display the token sequence the tokenizer produced (rendered as the cell output)
tokens

TokSequence(tokens=['Bar_None', 'Position_20', 'Program_0', 'Pitch_65', 'Velocity_55', 'Duration_0.4.8', 'Position_24', 'Program_0', 'Pitch_58', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_62', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_65', 'Velocity_39', 'Duration_0.4.8', 'Program_0', 'Pitch_67', 'Velocity_55', 'Duration_1.4.8', 'Position_28', 'Program_0', 'Chord_maj', 'Program_0', 'Pitch_58', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_62', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_65', 'Velocity_39', 'Duration_0.4.8', 'Bar_None', 'Position_0', 'Program_0', 'Chord_maj', 'Program_0', 'Pitch_58', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_62', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_65', 'Velocity_31', 'Duration_0.4.8', 'Position_4', 'Program_0', 'Chord_7maj', 'Program_0', 'Pitch_58', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_62', 'Velocity_31', 'Duration_0.4.8', 'Program_0', 'Pitch_65', 'Velocity_39', 'Duration_

In [14]:
# Paths of the MIDI files the dataset is built from (a single file here)
midis = ["ty_april.mid"]
print(midis)

# Create a Dataset, a DataLoader and a collator to train a model.
# The BOS/EOS token ids are looked up in the tokenizer's vocabulary.
# NOTE(review): a file whose token sequence exceeds max_seq_len is presumably
# truncated rather than chunked; MidiTok documents `split_files_for_training`
# for pre-splitting files -- confirm before training on real data.
dataset = DatasetMIDI(
    files_paths=midis,
    tokenizer=tokenizer,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

# The collator is given the pad token id and asked to copy the inputs as labels
collator = DataCollator(tokenizer.pad_token_id, copy_inputs_as_labels=True)
dataloader = DataLoader(dataset, batch_size=64, collate_fn=collator)

# Each item yielded by the DataLoader is one collated batch; print its attention mask
for encodings in dataloader:
    print(encodings["attention_mask"])


['ty_april.mid']
tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)
