# Pre-process POP909 for Polyphemus
- Split POP909 songs into phrases instead of bars
- Phrases vary in length (number of bars) --> pad shorter phrases to a common length?
- Use 4/4 only

In [1]:
import re
import os
import time
import sys
import multiprocessing
import itertools
import argparse
from itertools import product

import numpy as np
from tqdm import tqdm
import pypianoroll as pproll
import muspy

In [2]:
midi_folder = "POP909"
structure_folder = "POP909_structure"

In [3]:
from preprocess_pop909 import *

In [4]:
# Arguments for a single-song run of preprocess_midi_file (called in the
# following cell with exactly these six values).
midi_dataset_dir = "POP909"          # root folder of the MIDI dataset
structure_dir = "POP909_structure"   # root folder of the phrase annotations
song_idx = "001"                     # id of the song to process
dest_dir = "preprocessed_909_mask"   # folder the preprocessed samples go to
# Passed through as `n_bars`; presumably the maximum phrase length in bars
# -- TODO confirm against preprocess_pop909.
n_bars = 8
# Presumably timesteps per beat (the same value is passed to pproll.read
# in the debug section) -- TODO confirm.
resolution = 8

In [5]:
preprocess_midi_file(midi_dataset_dir, song_idx, structure_dir, dest_dir, n_bars, resolution)

Song accepted! POP909/001/001.mid with time signature 2/4
saved to preprocessed_909_mask/001.mid0
saved to preprocessed_909_mask/001.mid1
saved to preprocessed_909_mask/001.mid2


## Debug

In [59]:
filepath = "POP909/001/001.mid"
structure_path = "POP909_structure/001/human_label1.txt"

In [60]:
# Load the same MIDI file with both libraries: pypianoroll is asked for a
# resolution of 8, while muspy is given no resolution argument and so uses
# its own default (presumably the resolution stored in the file -- confirm).
pproll_song = pproll.read(filepath, resolution=8)
muspy_song = muspy.read(filepath)

In [61]:
# Read the phrase-structure annotation of the song and split it into one
# label per phrase (e.g. 'i4', 'A4', 'B8', ...; see the output below).
# The file is opened in a `with` block so the handle is closed as soon as
# its content has been read, instead of staying open for the whole session.
with open(structure_path, "r") as f:
    structure = f.read()
phrases = split_string(structure)

In [62]:
phrases

['i4',
 'A4',
 'B8',
 'A4',
 'A4',
 'b4',
 'B8',
 'A4',
 'A4',
 'b4',
 'b4',
 'A4',
 'A4',
 'b4',
 'A4',
 'o3']

In [63]:
len(phrases)

16

In [64]:
phrase_songs = split_song_into_phrases(pproll_song, phrases, resolution)

In [65]:
phrase_songs[1][0]

Music(metadata=Metadata(schema_version='0.1'), resolution=8, tracks=[Track(program=0, is_drum=False, name='MELODY', notes=[Note(time=165, pitch=66, duration=2, velocity=64), Note(time=169, pitch=63, duration=2, velocity=64), Note(time=173, pitch=68, duration=11, velocity=64), ...]), Track(program=0, is_drum=False, name='BRIDGE', notes=[Note(time=133, pitch=66, duration=2, velocity=64), Note(time=137, pitch=75, duration=2, velocity=64), Note(time=141, pitch=73, duration=13, velocity=64), ...]), Track(program=0, is_drum=False, name='PIANO', notes=[Note(time=129, pitch=42, duration=14, velocity=64), Note(time=131, pitch=49, duration=11, velocity=64), Note(time=133, pitch=54, duration=9, velocity=64), ...])])

In [67]:
# Note lists (one per track) of the LAST phrase. Each entry of phrase_songs
# is indexed with [0] here to get the object carrying `.tracks`.
tracks_notes = [track.notes for track in phrase_songs[-1][0].tracks]

In [68]:
# Number of timesteps allotted to one phrase: constants.MAX_PHRASE_LEN
# units of 4*resolution timesteps each (presumably MAX_PHRASE_LEN is in
# bars and 4*resolution is one 4/4 bar -- TODO confirm).
max_phrase_len_res = 4*resolution*constants.MAX_PHRASE_LEN
max_phrase_len_res

256

In [69]:
first_note//(resolution*4)

16

In [73]:
# Debug walk-through of the per-phrase tensor construction, run on the
# `tracks_notes` of a single phrase. For every track it builds one content
# tensor and one activation ("structure") vector and collects them in the
# two lists below.
tracks_content = []
tracks_structure = []
# Hard-coded phrase length used for the padding mask at the end of the
# track loop (presumably in bars, matching the final phrase 'o3'
# -- TODO confirm).
phrase_len=3

print(tracks_notes)
# Earliest note onset over all tracks. An empty track contributes np.inf,
# so it never wins the min().
first_note = np.inf
for notes in tracks_notes:
    track_first_note = min(note.time for note in notes) if notes else np.inf
    first_note = min(first_note, track_first_note)

# Calculate time offset from first note (ASSUME first note is in first bar):
# first_note is rounded down to a multiple of resolution*4, and that value
# is subtracted from every note time below.
# NOTE(review): if every track is empty, first_note stays np.inf and
# t_offset is not a finite number; no note is placed in that case, so the
# value is never used as an index.
print(first_note)
t_offset = (first_note//(resolution*4)) * resolution*4
print(t_offset)

for notes in tracks_notes:

    # track_content: max_phrase_len_res x MAX_SIMU_TOKENS x 2
    # This is used as a basis to build the final content tensors for
    # each sequence.
    # The last dimension contains pitches (index 0) and durations (index 1).
    # int16 is enough to encode small to medium duration values.
    track_content = np.zeros((max_phrase_len_res, constants.MAX_SIMU_TOKENS, 2), 
                            np.int16)

    # Every slot starts as PAD, except slot 0 of each timestep, which holds
    # the SOS token for both pitch and duration.
    track_content[:, :, 0] = PitchToken.PAD.value
    track_content[:, 0, 0] = PitchToken.SOS.value
    track_content[:, :, 1] = DurationToken.PAD.value
    track_content[:, 0, 1] = DurationToken.SOS.value

    # Per timestep: index of the next free slot. Starts at 1 because slot 0
    # is taken by SOS, so (counter - 1) is the number of notes stored.
    # The counter never exceeds MAX_SIMU_TOKENS-1 (see the skip below); with
    # int8 (maximum value 127) this requires MAX_SIMU_TOKENS <= 128.
    notes_counter = np.ones(max_phrase_len_res, dtype=np.int8)
    # print(notes_counter)

    # t_offset is computed once above for all tracks; an earlier per-track
    # variant (t_offset = None, set from the first note seen) was dropped.

    # Todo: np.put_along_axis?
    for note in notes:
        # Insert note in the lowest position available in the timestep.
        # t is the note onset relative to t_offset.
        # NOTE(review): t is not bounds-checked; an onset at or beyond
        # max_phrase_len_res raises IndexError on the lookups below.
            
        t = note.time - t_offset
        print(t)

        if notes_counter[t] >= constants.MAX_SIMU_TOKENS-1:
            # Skip note if there is no more space (the last slot is kept
            # free for the EOS token written after this loop)
            # print(f"skipping note: {note}")
            continue
            
        # Clamp pitch to [0, MAX_PITCH_TOKEN]
        pitch = max(min(note.pitch, constants.MAX_PITCH_TOKEN), 0)
        track_content[t, notes_counter[t], 0] = pitch
        # Clamp duration to [1, MAX_DUR_TOKEN + 1] and store it shifted down
        # by one, i.e. as a value in [0, MAX_DUR_TOKEN]
        dur = max(min(note.duration, constants.MAX_DUR_TOKEN + 1), 1)
        track_content[t, notes_counter[t], 1] = dur-1
        notes_counter[t] += 1
        # print(f"counting note: {notes_counter[t]}")
    # print(f"num notes: {notes_counter}")
    # Add EOS token: for each timestep, write it into the first free slot
    # (right after the last stored note, or right after SOS if none)
    t_range = np.arange(0, max_phrase_len_res)
    track_content[t_range, notes_counter, 0] = PitchToken.EOS.value
    track_content[t_range, notes_counter, 1] = DurationToken.EOS.value

    # Get track activations, a boolean tensor indicating whether notes
    # are being played in a timestep (sustain does not count)
    # (needed for graph rep.)
    # notes_counter-1 is the number of onsets stored per timestep; the cast
    # through bool maps it to 1 (at least one onset) or 0 (none).
    activations = np.array(notes_counter-1, dtype=bool).astype(int)
    # print(np.array(notes_counter-1, dtype=bool))
    # Mask activations: every timestep from index 4*resolution*phrase_len
    # onwards is overwritten with the STRUCTURE_PAD value
    activations[4*resolution*phrase_len:] = constants.STRUCTURE_PAD
    # print(f"Padding after {phrase_len} bars: {activations}")

    tracks_content.append(track_content)
    tracks_structure.append(activations)

[[], [Note(time=2205, pitch=66, duration=3, velocity=64), Note(time=2209, pitch=75, duration=3, velocity=64), Note(time=2213, pitch=73, duration=1, velocity=64), Note(time=2217, pitch=71, duration=2, velocity=64), Note(time=2221, pitch=80, duration=1, velocity=64), Note(time=2223, pitch=82, duration=1, velocity=64), Note(time=2225, pitch=80, duration=9, velocity=64), Note(time=2237, pitch=66, duration=2, velocity=64), Note(time=2241, pitch=75, duration=3, velocity=64), Note(time=2245, pitch=73, duration=5, velocity=64), Note(time=2253, pitch=70, duration=12, velocity=64), Note(time=2269, pitch=73, duration=2, velocity=64)], [Note(time=2177, pitch=42, duration=28, velocity=64), Note(time=2177, pitch=54, duration=5, velocity=64), Note(time=2177, pitch=58, duration=4, velocity=64), Note(time=2181, pitch=49, duration=4, velocity=64), Note(time=2185, pitch=58, duration=4, velocity=64), Note(time=2185, pitch=61, duration=4, velocity=64), Note(time=2189, pitch=49, duration=5, velocity=64), No

In [26]:
# Run process_track_notes on every phrase and print the phrase length next
# to the second returned tensor, to inspect where the padding starts.
for phrase_song, phrase_len in phrase_songs:

    # One list of notes per track of this phrase
    tracks_notes = [track.notes for track in phrase_song.tracks]
    # Presumably c_tensor is the content tensor and s_tensor the structure
    # (activation) tensor -- confirm against process_track_notes.
    c_tensor, s_tensor = process_track_notes(tracks_notes, resolution, phrase_len)
    print(phrase_len, s_tensor)

4 [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0
  0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
  0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2

In [16]:
tracks_notes

[[Note(time=165, pitch=66, duration=2, velocity=64),
  Note(time=169, pitch=63, duration=2, velocity=64),
  Note(time=173, pitch=68, duration=11, velocity=64),
  Note(time=193, pitch=68, duration=2, velocity=64),
  Note(time=197, pitch=65, duration=1, velocity=64),
  Note(time=201, pitch=61, duration=1, velocity=64),
  Note(time=205, pitch=66, duration=8, velocity=64),
  Note(time=229, pitch=66, duration=2, velocity=64),
  Note(time=233, pitch=63, duration=1, velocity=64),
  Note(time=237, pitch=68, duration=5, velocity=64),
  Note(time=245, pitch=61, duration=1, velocity=64),
  Note(time=249, pitch=68, duration=1, velocity=64),
  Note(time=253, pitch=66, duration=14, velocity=64)],
 [Note(time=133, pitch=66, duration=2, velocity=64),
  Note(time=137, pitch=75, duration=2, velocity=64),
  Note(time=141, pitch=73, duration=13, velocity=64),
  Note(time=167, pitch=82, duration=1, velocity=64),
  Note(time=185, pitch=80, duration=2, velocity=64),
  Note(time=213, pitch=82, duration=2, vel

In [17]:
# Earliest note onset over all tracks, never larger than max_phrase_len_res
# (which is also the result when no track contains any note).
first_note = min(
    (note.time for notes in tracks_notes for note in notes),
    default=max_phrase_len_res,
)
first_note = min(max_phrase_len_res, first_note)

In [18]:
t_offset = (first_note//(resolution*4)) * resolution*4
t_offset

128

In [56]:
tracks_notes = [trk.notes for trk in muspy_song.tracks]

# Length of the song: one more than the latest note end found in any track
# (the latest end counts as 0 when there are no notes at all).
length = max(
    (note.end for notes in tracks_notes for note in notes),
    default=0,
)
length = max(0, length) + 1

In [57]:
length

134562

In [58]:
resolution = 8
# Round length up to the next multiple of 4*resolution timesteps
# (left unchanged when it already is one).
length += -length % (4 * resolution)

In [59]:
length

134592

In [23]:
muspy_song.tracks[0].notes[0]

Note(time=9160, pitch=61, duration=69, velocity=115)

In [15]:
pproll_song

Multitrack(name=None, resolution=8, tempo=array(shape=(7080, 1), dtype=float64), downbeat=array(shape=(2360, 1), dtype=bool), tracks=[StandardTrack(name='MELODY', program=0, is_drum=False, pianoroll=array(shape=(2360, 128), dtype=uint8)), StandardTrack(name='BRIDGE', program=0, is_drum=False, pianoroll=array(shape=(2360, 128), dtype=uint8)), StandardTrack(name='PIANO', program=0, is_drum=False, pianoroll=array(shape=(2360, 128), dtype=uint8))])

In [16]:
pproll_song.tracks

[StandardTrack(name='MELODY', program=0, is_drum=False, pianoroll=array(shape=(3792, 128), dtype=uint8)),
 StandardTrack(name='BRIDGE', program=0, is_drum=False, pianoroll=array(shape=(3792, 128), dtype=uint8)),
 StandardTrack(name='PIANO', program=0, is_drum=False, pianoroll=array(shape=(3792, 128), dtype=uint8))]

In [45]:
import torch

In [46]:
# !ls preprocessed_909_mask

In [74]:
# Load one preprocessed sample and convert its two arrays to torch tensors.
sample_path = os.path.join("preprocessed_909_mask", "001.mid9.npz")
# np.load on an .npz archive returns an NpzFile that keeps the file open and
# reads arrays on access; the context manager closes it once both arrays
# have been copied into tensors.
with np.load(sample_path) as data:
    c_tensor = torch.tensor(data["c_tensor"], dtype=torch.long)
    s_tensor = torch.tensor(data["s_tensor"], dtype=torch.int)

In [75]:
c_tensor.shape

torch.Size([3, 256, 16, 2])

In [76]:
s_tensor.shape

torch.Size([3, 256])

In [77]:
s_tensor

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

In [87]:
sum(s_tensor[2]==1)

tensor(25)

In [94]:
s_tensor.reshape(-1).bool().sum()

tensor(422)

In [95]:
(s_tensor==1).reshape(-1).bool().sum()

tensor(38)

In [101]:
c_tensor[(s_tensor==1)].shape

torch.Size([38, 16, 2])