In [1]:
import os
import sys

from matplotlib import pyplot as plt
from matplotlib import patches
from matplotlib import colors
import pretty_midi
import pandas as pd
import IPython.display as ipd
import glob
import numpy as np
import muspy
import pypianoroll
import torch
from util.play_midi import play_midi

from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetTok, DataCollator
from pathlib import Path
from symusic import Score
from torch.utils.data import DataLoader


In [103]:
config = TokenizerConfig()
tokenizer = REMI()
midi_files = list(Path("data/jazz_midi/").glob("**/*.mid"))
print(f"number of midi_files = {len(midi_files)}")
print(f"number of vocab = {len(tokenizer.vocab)}")

number of midi_files = 934
number of vocab = 282


In [55]:
print(len(midi_files)) 
for file in midi_files: # removing corrupted files/ files that cant be fully understood
    try:
        Score(file)
    except:
        midi_files.remove(file)
print(len(midi_files))
for file in midi_files: # on windows this has to be run 2 times, i have no idea why.... very wierd
    try:
        Score(file)
    except:
        midi_files.remove(file)
print(len(midi_files))

934
914
914


In [63]:
song = Score(midi_files[0]) 
tokenized_song = tokenizer.midi_to_tokens(song)

song1 =tokenizer.tokens_to_midi(tokenized_song)
ids = tokenizer._tokens_to_ids(tokenized_song[0])
tokenizer._ids_to_tokens(ids)

['Bar_None',
 'Position_0',
 'Pitch_76',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_4',
 'Pitch_78',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_8',
 'Pitch_74',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_12',
 'Pitch_71',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_20',
 'Pitch_73',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_24',
 'Pitch_69',
 'Velocity_95',
 'Duration_0.4.8',
 'Bar_None',
 'Position_0',
 'Pitch_64',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_4',
 'Pitch_66',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_8',
 'Pitch_62',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_12',
 'Pitch_59',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_20',
 'Pitch_61',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_24',
 'Pitch_57',
 'Velocity_95',
 'Duration_0.4.8',
 'Bar_None',
 'Position_0',
 'Pitch_52',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_4',
 'Pitch_54',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_8',
 'Pitch_50',
 'Velocity_95',
 'Duration_0.4.8',
 'Position

In [91]:
tokenizer.learn_bpe(vocab_size=500, files_paths=midi_files)







In [109]:
len(tokenizer.vocab)
tokenizer["PAD_None"]
tokenizer["BOS_None"]

# config = AutoConfig.from_pretrained(
#     "gpt2",
#     vocab_size=len(tokenizer),
#     n_positions=context_length,
#     n_layer=n_layer,
#     n_head=n_head,
#     pad_token_id=tokenizer.pad_token_id,
#     bos_token_id=tokenizer.bos_token_id,
#     eos_token_id=tokenizer.eos_token_id,
#     n_embd=n_emb
# )

1

In [78]:
from transformers import DataCollatorForLanguageModeling

dataset = DatasetTok( # seq_len = start + seq_len + end
    files_paths=midi_files,
    min_seq_len=50, # 52
    max_seq_len=3000, # 514
    tokenizer=tokenizer
)

collator = DataCollator(
    tokenizer["PAD_None"], tokenizer["BOS_None"], tokenizer["EOS_None"], copy_inputs_as_labels=True
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

data_loader = DataLoader(dataset=dataset, batch_size=256, collate_fn=collator,)

Loading data: data/jazz_midi: 100%|██████████| 914/914 [00:39<00:00, 22.90it/s]


In [79]:
for batch in data_loader:
    x, y, z = **batch
    break
y = **x 

{'input_ids': tensor([[  1,   4, 189,  ...,   0,   0,   0],
         [  1,   4,   4,  ..., 132,   4,   2],
         [  1, 189,  15,  ...,   0,   0,   0],
         ...,
         [  1,   4,   4,  ...,   0,   0,   0],
         [  1,   4,   4,  ...,   0,   0,   0],
         [  1,   4, 205,  ...,   0,   0,   0]]),
 'labels': tensor([[   1,    4,  189,  ..., -100, -100, -100],
         [   1,    4,    4,  ...,  132,    4,    2],
         [   1,  189,   15,  ..., -100, -100, -100],
         ...,
         [   1,    4,    4,  ..., -100, -100, -100],
         [   1,    4,    4,  ..., -100, -100, -100],
         [   1,    4,  205,  ..., -100, -100, -100]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)}

In [85]:
y = tokenizer._ids_to_tokens(x["input_ids"][0].tolist())


In [87]:
from transformers import AutoConfig, GPT2LMHeadModel

device = "cuda" if torch.cuda.is_available() else "cpu"



282

In [112]:
context_length = 128
n_layer = 3
n_head = 4
n_emb = 64 # 512

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_positions=context_length,
    n_layer=n_layer,
    n_head=n_head,
    pad_token_id=tokenizer["PAD_None"],
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
    n_embd=n_emb
)


In [113]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size} parameters")
model

GPT-2 size: 176320 parameters


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(282, 64)
    (wpe): Embedding(128, 64)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-2): 3 x GPT2Block(
        (ln_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=64, out_features=282, bias=False)
)

In [119]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

epoch = 10
for i in range(epoch):
    total_loss = []
    model.train()
    for batch in data_loader:
        model.zero_grad()
        full = torch.tensor(batch).to(device)
        # full = batch.toTensor().to(device)
        inputs = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        mask = batch["attention_mask"].to(device)
        outputs = model(**full)
        loss = outputs.loss
        loss.backwards()
        optimizer.step()
        total_loss.append(loss)
    print(f"Epock = {epoch+1} \nloss = {loss}")


RuntimeError: Could not infer dtype of dict