# Query and reArrange 

Tutorial for Zhao et al., "Q&A: Query-Based Representation Learning for Multi-Track Symbolic Music re-Arrangement", accepted by the IJCAI 2023 Special Track on AI, the Arts and Creativity.

In [1]:
# Expose only GPU 0 to this process. The variable is set before `torch` is
# imported so that it is already in place when CUDA gets initialised.
import os
os.environ['CUDA_VISIBLE_DEVICES']= '0'
import numpy as np
import torch
from torch.utils.data import DataLoader
# Project-local model, dataset and helper modules.
from model import Query_and_reArrange
from dataset import Slakh2100_Pop909_Dataset, collate_fn_inference, EMBED_PROGRAM_MAPPING
# Inverse lookup of EMBED_PROGRAM_MAPPING (its values become the keys here).
# Used below to turn the entries of `y_instr` into `programs` for MIDI export.
SLAKH_CLASS_MAPPING = {v: k for k, v in EMBED_PROGRAM_MAPPING.items()}
from utils.format_convert import matrix2midi_with_dynamics, matrix2midi_drum, elem2midi
from utils.inferring import mixture_function_prior, search_reference, velocity_adaption
import datetime

## 1. Symbolic multi-track music rearrangement

Based on composition style transfer, Q&A is a generic model for a range of rearrangement problems, including orchestration, piano cover generation, and re-instrumentation. 

Let's first load the Q&A model. Demo will be saved to `./demo`.

In [2]:
# Locations of the two datasets, the pretrained checkpoint and the demo output.
SLAKH2100_DIR = "./data/Slakh2100"
POP909_DIR = "./data/Pop909"
MODEL_DIR = "./checkpoints/Q&A_epoch_029.pt"
SAVE_DIR = './demo'
# Presumably the clip length in bars; the datasets below receive
# 16*SAMPLE_BAR_LEN as their length argument.
SAMPLE_BAR_LEN = 8

DEVICE = 'cuda:0'
model = Query_and_reArrange(name='inference_model', device=DEVICE, trf_layers=2)
# map_location remaps the stored tensors onto DEVICE, so the checkpoint loads
# no matter which device it was saved from.
model.load_state_dict(torch.load(MODEL_DIR, map_location=DEVICE))
model.to(DEVICE)
# Switch to evaluation mode; the trailing ';' suppresses the cell's repr output.
model.eval();

### 1.1 Orchestration

For orchestration, we sample a piano clip $x$ and a multi-track clip $y$, and then orchestrate $x$ using $y$'s track functions (i.e., style).

##### 1.1.1 Loading data

In [3]:
def _collate_on_device(batch):
    """Collate an inference batch with `collate_fn_inference` on DEVICE."""
    return collate_fn_inference(batch, DEVICE)


# Content donor x: piano clips (only POP909_DIR is handed to the dataset).
x_set = Slakh2100_Pop909_Dataset(
    None,
    POP909_DIR,
    16*SAMPLE_BAR_LEN,
    split='validation',
    mode='inference',
    with_dynamics=True,
)
# Style donor y: multi-track clips (only SLAKH2100_DIR is handed to the
# dataset), loaded together with their drum tracks.
y_set = Slakh2100_Pop909_Dataset(
    SLAKH2100_DIR,
    None,
    16*SAMPLE_BAR_LEN,
    split='validation',
    mode='inference',
    with_dynamics=True,
    with_drums=True,
)
# Walk through y_set one clip at a time, in order, to build the prior set
# that the heuristic reference search uses in the sampling step.
y_set_loader = DataLoader(
    y_set,
    batch_size=1,
    shuffle=False,
    collate_fn=_collate_on_device,
)
y_prior_set = mixture_function_prior(y_set_loader)

loading Pop909 Dataset ...


100%|██████████| 84/84 [00:00<00:00, 118.29it/s]


loading Slakh2100 Dataset ...


100%|██████████| 214/214 [00:08<00:00, 26.70it/s]


Rendering sample space for style references ...


100%|██████████| 20617/20617 [00:52<00:00, 389.32it/s]


##### 1.1.2 Sampling

In [4]:
# Draw a random content clip x from the piano set.
IDX = np.random.randint(len(x_set))
x = x_set[IDX]
(x_mix, x_instr, x_fp, x_ft), (x_dyn, _, _), x_dir = collate_fn_inference(batch=[x], device=DEVICE)
# Heuristic sampling of the style reference y (Equation (8) in the paper):
# search the prior set with x's track functions and fetch the matching clip.
y_anchor = search_reference(x_fp, x_ft, y_prior_set)
y = y_set[y_anchor]
(y_mix, y_instr, y_fp, y_ft), (y_dyn, y_drm, y_dprog), y_dir = collate_fn_inference(batch=[y], device=DEVICE)
# Overwrite y's melody-track functions and dynamics with those of x's melody
# track, so that the theme melody is preserved after rearrangement.
# Track 0 is used as the melody track on both sides.
x_mel, y_mel = 0, 0
y_fp[:, y_mel] = x_fp[:, x_mel]
y_ft[:, y_mel] = x_ft[:, x_mel]
y_dyn[y_mel] = x_dyn[x_mel]
# Save x and y as MIDI into a fresh, timestamped demo folder.
save_path = os.path.join(SAVE_DIR, 'orchestration', datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)
x_recon = elem2midi(*x, SLAKH_CLASS_MAPPING)
x_recon.write(os.path.join(save_path, '01_piano_solo.mid'))
y_recon = elem2midi(*y, SLAKH_CLASS_MAPPING)
y_recon.write(os.path.join(save_path, '02_reference.mid'))
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/orchestration/20230602_144828.


##### 1.1.3 Calling Q&A for orchestration

In [5]:
# Run Q&A: rearrange the mixture x_mix using y's instruments and track
# functions, with y_mel marking the melody track.
output = model.inference(x_mix, y_instr, y_fp, y_ft, mel_id=y_mel)
# Dynamics: channel 0 of y_dyn feeds the velocity adaption, channel 1 is
# carried over unchanged as the cc channel.
velocity = velocity_adaption(y_dyn[..., 0], output, y_mel)
cc = y_dyn[..., 1]
output = np.stack((output, velocity, cc), axis=-1)
# Look up one program per entry of y_instr[0], then rebuild the MIDI.
track_programs = [SLAKH_CLASS_MAPPING[item.item()] for item in y_instr[0]]
midi_recon = matrix2midi_with_dynamics(
    matrices=output,
    programs=track_programs,
    init_tempo=100,
)
# When the reference clip comes with drums, append them to the result.
if y_drm is not None:
    drum_recon = matrix2midi_drum(y_drm, y_dprog, init_tempo=100)
    midi_recon.instruments += drum_recon.instruments
result_file = os.path.join(save_path, '03_orchestration_result.mid')
midi_recon.write(result_file)
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/orchestration/20230602_144828.


### 1.2 Piano cover generation

For piano cover generation, we sample a multi-track clip $x$ and a piano clip $y$, and then generate a piano cover for $x$ using the textures in $y$.

##### 1.2.1 Loading data

In [6]:
def _collate_on_device(batch):
    """Collate an inference batch with `collate_fn_inference` on DEVICE."""
    return collate_fn_inference(batch, DEVICE)


# Content donor x: multi-track clips (only SLAKH2100_DIR is handed to the
# dataset).
x_set = Slakh2100_Pop909_Dataset(
    SLAKH2100_DIR,
    None,
    16*SAMPLE_BAR_LEN,
    split='validation',
    mode='inference',
    with_dynamics=True,
)
# Style donor y: piano clips (only POP909_DIR is handed to the dataset).
y_set = Slakh2100_Pop909_Dataset(
    None,
    POP909_DIR,
    16*SAMPLE_BAR_LEN,
    split='validation',
    mode='inference',
    with_dynamics=True,
)
# Walk through y_set one clip at a time, in order, to build the prior set
# that the heuristic reference search uses in the sampling step.
y_set_loader = DataLoader(
    y_set,
    batch_size=1,
    shuffle=False,
    collate_fn=_collate_on_device,
)
y_prior_set = mixture_function_prior(y_set_loader)

loading Slakh2100 Dataset ...


100%|██████████| 214/214 [00:05<00:00, 38.24it/s]


loading Pop909 Dataset ...


100%|██████████| 84/84 [00:00<00:00, 155.01it/s]


Rendering sample space for style references ...


100%|██████████| 6080/6080 [00:10<00:00, 580.24it/s]


##### 1.2.2 Sampling

In [7]:
# Draw a random content clip x from the multi-track set.
IDX = np.random.randint(len(x_set))
x = x_set[IDX]
(x_mix, x_instr, x_fp, x_ft), (x_dyn, _, _), x_dir = collate_fn_inference(batch=[x], device=DEVICE)
# Heuristic sampling of the style reference y (Equation (8) in the paper):
# search the prior set with x's track functions and fetch the matching clip.
y_anchor = search_reference(x_fp, x_ft, y_prior_set)
y = y_set[y_anchor]
(y_mix, y_instr, y_fp, y_ft), (y_dyn, y_drm, y_dprog), y_dir = collate_fn_inference(batch=[y], device=DEVICE)
# Overwrite y's melody-track functions and dynamics with those of x's melody
# track, so that the theme melody is preserved after rearrangement.
# Track 0 is used as the melody track on both sides.
x_mel, y_mel = 0, 0
y_fp[:, y_mel] = x_fp[:, x_mel]
y_ft[:, y_mel] = x_ft[:, x_mel]
y_dyn[y_mel] = x_dyn[x_mel]
# Save x and y as MIDI into a fresh, timestamped demo folder.
save_path = os.path.join(SAVE_DIR, 'piano_cover', datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)
x_recon = elem2midi(*x, SLAKH_CLASS_MAPPING)
x_recon.write(os.path.join(save_path, '01_multi_track.mid'))
y_recon = elem2midi(*y, SLAKH_CLASS_MAPPING)
y_recon.write(os.path.join(save_path, '02_reference.mid'))
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/piano_cover/20230602_144919.


##### 1.2.3 Calling Q&A for piano cover generation

In [8]:
# Run Q&A: rearrange the mixture x_mix using y's instruments and track
# functions, with y_mel marking the melody track.
output = model.inference(x_mix, y_instr, y_fp, y_ft, mel_id=y_mel)
# Dynamics: channel 0 of y_dyn feeds the velocity adaption, channel 1 is
# carried over unchanged as the cc channel.
velocity = velocity_adaption(y_dyn[..., 0], output, y_mel)
cc = y_dyn[..., 1]
output = np.stack((output, velocity, cc), axis=-1)
# Look up one program per entry of y_instr[0], then rebuild the MIDI.
track_programs = [SLAKH_CLASS_MAPPING[item.item()] for item in y_instr[0]]
midi_recon = matrix2midi_with_dynamics(
    matrices=output,
    programs=track_programs,
    init_tempo=100,
)
result_file = os.path.join(save_path, '03_piano_cover.mid')
midi_recon.write(result_file)
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/piano_cover/20230602_144919.


### 1.3 Re-instrumentation

For re-instrumentation, we sample multi-track clips $x$ and $y$, and then rearrange $x$ using the track functions in $y$.

##### 1.3.1 Loading data

In [9]:
def _collate_on_device(batch):
    """Collate an inference batch with `collate_fn_inference` on DEVICE."""
    return collate_fn_inference(batch, DEVICE)


# Content donor x: multi-track clips (only SLAKH2100_DIR is handed to the
# dataset), taken from the test split.
x_set = Slakh2100_Pop909_Dataset(
    SLAKH2100_DIR,
    None,
    16*SAMPLE_BAR_LEN,
    split='test',
    mode='inference',
    with_dynamics=True,
)
# Style donor y: multi-track clips as well, taken from the validation split
# and loaded together with their drum tracks.
y_set = Slakh2100_Pop909_Dataset(
    SLAKH2100_DIR,
    None,
    16*SAMPLE_BAR_LEN,
    split='validation',
    mode='inference',
    with_dynamics=True,
    with_drums=True,
)
# Walk through y_set one clip at a time, in order, to build the prior set
# that the heuristic reference search uses in the sampling step.
y_set_loader = DataLoader(
    y_set,
    batch_size=1,
    shuffle=False,
    collate_fn=_collate_on_device,
)
y_prior_set = mixture_function_prior(y_set_loader)

loading Slakh2100 Dataset ...


100%|██████████| 117/117 [00:02<00:00, 42.18it/s]


loading Slakh2100 Dataset ...


100%|██████████| 214/214 [00:07<00:00, 28.69it/s]


Rendering sample space for style references ...


100%|██████████| 20617/20617 [00:52<00:00, 389.54it/s]


##### 1.3.2 Sampling

In [10]:
# Draw a random content clip x from the multi-track content set.
IDX = np.random.randint(len(x_set))
x = x_set[IDX]
(x_mix, x_instr, x_fp, x_ft), (x_dyn, _, _), x_dir = collate_fn_inference(batch=[x], device=DEVICE)
# Heuristic sampling of the style reference y (Equation (8) in the paper):
# search the prior set with x's track functions and fetch the matching clip.
y_anchor = search_reference(x_fp, x_ft, y_prior_set)
y = y_set[y_anchor]
(y_mix, y_instr, y_fp, y_ft), (y_dyn, y_drm, y_dprog), y_dir = collate_fn_inference(batch=[y], device=DEVICE)
# Overwrite y's melody-track functions and dynamics with those of x's melody
# track, so that the theme melody is preserved after rearrangement.
# Track 0 is used as the melody track on both sides.
x_mel, y_mel = 0, 0
y_fp[:, y_mel] = x_fp[:, x_mel]
y_ft[:, y_mel] = x_ft[:, x_mel]
y_dyn[y_mel] = x_dyn[x_mel]
# Save x and y as MIDI into a fresh, timestamped demo folder.
save_path = os.path.join(SAVE_DIR, 're_instrumentation', datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)
x_recon = elem2midi(*x, SLAKH_CLASS_MAPPING)
x_recon.write(os.path.join(save_path, '01_multi_track.mid'))
y_recon = elem2midi(*y, SLAKH_CLASS_MAPPING)
y_recon.write(os.path.join(save_path, '02_reference.mid'))
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/re_instrumentation/20230602_145030.


##### 1.3.3 Calling Q&A for re-instrumentation

In [11]:
# Run Q&A: rearrange the mixture x_mix using y's instruments and track
# functions, with y_mel marking the melody track.
output = model.inference(x_mix, y_instr, y_fp, y_ft, mel_id=y_mel)
# Dynamics: channel 0 of y_dyn feeds the velocity adaption, channel 1 is
# carried over unchanged as the cc channel.
velocity = velocity_adaption(y_dyn[..., 0], output, y_mel)
cc = y_dyn[..., 1]
output = np.stack([output, velocity, cc], axis=-1)
# Look up one program per entry of y_instr[0], then rebuild the MIDI.
midi_recon = matrix2midi_with_dynamics(
    matrices=output,
    programs=[SLAKH_CLASS_MAPPING[item.item()] for item in y_instr[0]],
    init_tempo=100)
# When the reference clip comes with drums, append them to the result.
if y_drm is not None:
    # The tempo is passed by keyword, as in the orchestration section, so the
    # call does not depend on the position of that parameter.
    drum_recon = matrix2midi_drum(y_drm, y_dprog, init_tempo=100)
    midi_recon.instruments += drum_recon.instruments
midi_recon.write(os.path.join(save_path, '03_re_instrumentation.mid'))
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/re_instrumentation/20230602_145030.


## 2. Voice separation

By inferring track functions as voice hints, Q&A can additionally handle voice separation.

Let's load Q&A-V, our variant model for voice separation. Demo will be saved to `./demo/voice_separation`.

In [12]:
from model import Query_and_reArrange_vocie_separation
from dataset import Voice_Separation_Dataset
from utils.format_convert import matrix2midi, mixture2midi

### 2.1 Bach chorales

Four-voice separation on Bach chorales.

##### 2.1.1 Loading data

In [13]:
# Only the Bach chorales directory is supplied for this experiment.
BACH_DIR = "./data/Chorales"
QUARTETS_DIR = None
MODEL_DIR = "./checkpoints/Q&A_chorales_epoch_041.pt"
SAVE_DIR = './demo'

DEVICE = 'cuda:0'
# Replace the rearrangement model with the voice-separation variant.
model = Query_and_reArrange_vocie_separation(name='inference_model', device=DEVICE, trf_layers=2)
# map_location remaps the stored tensors onto DEVICE, so the checkpoint loads
# no matter which device it was saved from.
model.load_state_dict(torch.load(MODEL_DIR, map_location=DEVICE))
model.to(DEVICE)
# Switch to evaluation mode; the trailing ';' suppresses the cell's repr output.
model.eval();

# Validation split of the voice-separation dataset, in inference mode.
x_set = Voice_Separation_Dataset(BACH_DIR, QUARTETS_DIR, 'full', split='validation', mode='inference')

loading Bach Chorale Dataset ...


100%|██████████| 41/41 [00:00<00:00, 257.39it/s]


##### 2.1.2 Sampling a mixture

In [14]:
# Draw a random sample x; only its mixture, instrument entries and source
# name are needed for voice separation.
IDX = np.random.randint(len(x_set))
x = x_set[IDX]
(x_mix, x_instr, _, _), (_, _, _), x_dir = collate_fn_inference(batch=[x], device=DEVICE)
# Save the mixture as MIDI into a fresh, timestamped demo folder.
save_path = os.path.join(SAVE_DIR, 'voice_separation', datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)
x_recon = mixture2midi(x_mix)
# The file name carries the sample's source name with its '.npz' part removed.
x_recon.write(os.path.join(save_path, f"01_mixture_{x_dir.replace('.npz', '')}.mid"))
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/voice_separation/20230602_145056.


##### 2.1.3 Calling Q&A for Bach chorales voice separation

In [15]:
# Separate the sampled mixture into voices with the Q&A-V model.
output = model.inference(x_mix, x_instr)
# Rebuild the MIDI with four program entries, all set to 52
# (presumably the General MIDI choir patch -- confirm against matrix2midi).
midi_recon = matrix2midi(
    output,
    programs=[52, 52, 52, 52],
    init_tempo=100,
)
result_file = os.path.join(save_path, '02_voice_separation.mid')
midi_recon.write(result_file)
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/voice_separation/20230602_145056.


### 2.2 String quartets

Four-voice separation on string quartets.

##### 2.2.1 Loading data

In [16]:
# Only the string quartets directory is supplied for this experiment.
BACH_DIR = None
QUARTETS_DIR = './data/Quartets'
MODEL_DIR = "./checkpoints/Q&A_quartets_epoch_029.pt"
SAVE_DIR = './demo'

DEVICE = 'cuda:0'
# Fresh voice-separation model, loaded from the quartets checkpoint.
model = Query_and_reArrange_vocie_separation(name='inference_model', device=DEVICE, trf_layers=2)
# map_location remaps the stored tensors onto DEVICE, so the checkpoint loads
# no matter which device it was saved from.
model.load_state_dict(torch.load(MODEL_DIR, map_location=DEVICE))
model.to(DEVICE)
# Switch to evaluation mode; the trailing ';' suppresses the cell's repr output.
model.eval();

# Validation split of the voice-separation dataset, in inference mode.
x_set = Voice_Separation_Dataset(BACH_DIR, QUARTETS_DIR, 'full', split='validation', mode='inference')

loading String Quartets Dataset ...


100%|██████████| 6/6 [00:00<00:00, 34.85it/s]


##### 2.2.2 Sampling a mixture

In [17]:
# Draw a random sample x; only its mixture, instrument entries and source
# name are needed for voice separation.
IDX = np.random.randint(len(x_set))
x = x_set[IDX]
(x_mix, x_instr, _, _), (_, _, _), x_dir = collate_fn_inference(batch=[x], device=DEVICE)
# Save the mixture as MIDI into a fresh, timestamped demo folder.
save_path = os.path.join(SAVE_DIR, 'voice_separation', datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(save_path, exist_ok=True)
x_recon = mixture2midi(x_mix)
# The file name carries the sample's source name with its '.npz' part removed.
x_recon.write(os.path.join(save_path, f"01_mixture_{x_dir.replace('.npz', '')}.mid"))
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/voice_separation/20230602_145104.


##### 2.2.3 Calling Q&A for string quartets voice separation

In [18]:
# Separate the sampled mixture into voices with the Q&A-V model.
output = model.inference(x_mix, x_instr)
# Rebuild the MIDI with one program entry per voice
# (presumably General MIDI numbers for two violins, viola and cello --
# confirm against matrix2midi).
voice_programs = [40, 40, 41, 42]
midi_recon = matrix2midi(
    output,
    programs=voice_programs,
    init_tempo=100,
)
result_file = os.path.join(save_path, '02_voice_separation.mid')
midi_recon.write(result_file)
print(f'saved to {save_path}.')

saved to /home/zhaojw/workspace/workspace/Q&A_Rearrangement/repository/demo/voice_separation/20230602_145104.
