In [2]:
import numpy as np
import torch
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI
from symusic import Score
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True)
tokenizer = REMI(config)
midi = Score("dataset/nesmdb/nesmdb_midi/nesmdb_midi/train/000_10_YardFight_00_01GameStart.mid")
tokens = tokenizer(midi)  # calling the tokenizer will automatically detect MIDIs, paths and tokens
# converted_back_midi = tokenizer(tokens)  # PyTorch, Tensorflow and Numpy tensors are supported
tokenizer["BOS_None"]

  super().__init__(tokenizer_config, params)


1

In [9]:
from pathlib import Path
import tqdm

files_paths = list(Path("dataset").glob("**/*.mid"))
# it = tqdm(files_paths,desc="Tokenizing")
print(f"There are {len(files_paths)} mid files.")
# tokenizer.train(vocab_size=30000, files_paths=files_paths)

There are 183839 mid files.


In [7]:
# tokenizer.save("tokenizer.json")
# tokenizer.from_pretrained("tokenizer.json")

In [42]:
from miditok.pytorch_data import DatasetMIDI, DataCollator
from torch.utils.data import DataLoader

dataset = DatasetMIDI(
    files_paths=files_paths,
    tokenizer=tokenizer,
    max_seq_len=-1,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)
dataset[0], len(dataset[0]["input_ids"])

({'input_ids': tensor([  1,   4, 173,  ..., 105, 109, 408])}, 2746)

In [44]:
lens = []
for i in tqdm.tqdm(range(10)):
    A = dataset[i]["input_ids"]
    if A is None:
        continue
    print(len(A))
    lens.append(len(A))
    # break
    # lens.append(len(mid["input_ids"]))

sum(lens) / len(lens)

  0%|          | 0/10 [00:00<?, ?it/s]

2746
674
6640
242


100%|██████████| 10/10 [00:00<00:00, 16.80it/s]

36577
21370
2844
4256
10477





9536.222222222223

In [281]:
from typing import Sequence
from miditok import MusicTokenizer
import torch


class MDataset(torch.utils.data.Dataset):

    def __init__(self, midi_files: Sequence[Path],
                 _tk: MusicTokenizer,
                 seq_len: int = 2048,
                 sliding_step:int = None):
        super().__init__()
        self.source = DatasetMIDI(
            files_paths=midi_files,
            tokenizer=_tk,
            max_seq_len=-1,
            bos_token_id=_tk["BOS_None"],
            eos_token_id=_tk["EOS_None"],
        )
        self.sliding_step = sliding_step if sliding_step is not None else seq_len // 2
        self.next_step = {}
        self.seq_len = seq_len
        self.cache = dict[int, Sequence[float]]()

    def __len__(self):
        return len(self.source)

    def pad_to_multiple(self, arr: list[any]):
        """
        如果数组长度不是指定倍数，填充特定数量的值。
        Args:
            arr: list[int]，输入数组
            multiple: int，目标倍数
            pad_value: int，填充值
        Returns:
            list[int]，处理后的数组
        """
        length = len(arr)
        mask = [1] * length
        remainder = length % self.seq_len  # 计算当前长度是否满足倍数
        if remainder != 0:
            padding = self.seq_len - remainder  # 需要填充的数量
            arr.extend([self.source.tokenizer.pad_token_id] * padding)  # 填充
            mask.extend([0] * padding)  # 填充部分 mask 为 0
        return arr, mask

    def __getitem__(self, index):
        import numpy as np
        if index not in self.cache:
            raw = self.source[index]
            tensor = raw["input_ids"]
            if tensor is None:
                tensor = torch.zeros(10)
            seq = tensor.tolist()
            seq = seq + [self.source.eos_token_id]
            self.cache[index] = np.array(seq)
        else:
            seq = self.cache[index]

        if index not in self.next_step:
            self.next_step[index] = 0

        step = self.next_step[index]
        if step * self.sliding_step + self.seq_len >= len(seq):
            self.next_step[index] = 0
        else:
            self.next_step[index] += 1

        raw_seq = list(seq[step * self.sliding_step:][:self.seq_len])
        pad_seq, mask = self.pad_to_multiple(raw_seq)
        X = pad_seq[:-1]
        X_mask = mask[:-1]
        Y = pad_seq[1:]
        Y_mask = mask[1:]
        return torch.tensor(X), torch.tensor(Y), torch.tensor(X_mask), torch.tensor(Y_mask)


sd = MDataset(files_paths, tokenizer)
tgt_idx = 0
D0 = sd[0]
D1 = sd[0]
# X1,Y1 = sd[1]
# len(X1),len(Y1)
D0[0],D1[0]

(tensor([  1,   4, 173,  ...,  48, 102, 114]),
 tensor([ 52, 101, 118,  ...,   0,   0,   0]))

In [280]:
sd = MDataset(files_paths,tokenizer, 2048)
dataloader = DataLoader(sd, batch_size=1)
D = next(iter(dataloader))
# X_1, Y_1 = next(iter(dataloader))
D[0],D[1]

(tensor([[  1,   4, 173,  ...,  48, 102, 114]]),
 tensor([[  4, 173, 408,  ..., 102, 114, 282]]))

In [46]:
import tqdm

for epoch in range(10):
    for batch in tqdm.tqdm(dataloader, desc=f"{epoch} Traning"):
        pass
        # print("ok")





0 Traning:   0%|          | 0/83 [00:00<?, ?it/s][A[A[A[A



0 Traning:   1%|          | 1/83 [00:00<01:07,  1.21it/s][A[A[A[A



0 Traning:   2%|▏         | 2/83 [00:01<00:57,  1.40it/s][A[A[A[A



0 Traning:   4%|▎         | 3/83 [00:01<00:49,  1.62it/s][A[A[A[A



0 Traning:   5%|▍         | 4/83 [00:02<00:47,  1.66it/s][A[A[A[A



0 Traning:   6%|▌         | 5/83 [00:02<00:42,  1.82it/s][A[A[A[A



0 Traning:   7%|▋         | 6/83 [00:03<00:42,  1.83it/s][A[A[A[A



0 Traning:   8%|▊         | 7/83 [00:04<00:39,  1.91it/s][A[A[A[A



0 Traning:  10%|▉         | 8/83 [00:04<00:35,  2.13it/s][A[A[A[A



0 Traning:  11%|█         | 9/83 [00:04<00:33,  2.19it/s][A[A[A[A



0 Traning:  12%|█▏        | 10/83 [00:05<00:32,  2.28it/s][A[A[A[A



0 Traning:  13%|█▎        | 11/83 [00:05<00:30,  2.38it/s][A[A[A[A



0 Traning:  14%|█▍        | 12/83 [00:06<00:30,  2.30it/s][A[A[A[A



0 Traning:  16%|█▌        | 13/83 [00:06<00:33,  2.1

KeyboardInterrupt: 