This notebook shows how to generate music using a fine-tuned Tiny-Music-Transformer model.

In [1]:
#@title Install dependencies
!git clone --depth 1 https://github.com/asigalov61/Tiny-Music-Transformer
!pip install torch
!pip install einops
!pip install torch-summary
!pip install tqdm
!pip install matplotlib
!apt install fluidsynth

Cloning into 'Tiny-Music-Transformer'...
remote: Enumerating objects: 29, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 29 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (29/29), 1.01 MiB | 7.06 MiB/s, done.
Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m933.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0
Collecting torch-summary
  Downloading torch_summary-1.4.5-py3-none-any.whl (16 kB)
Installing collected packages: torch-summary
Successfully installed torch-summary-1.4.5
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fluid-soundfont-gm libevdev2 libfluidsynth3 libgudev-1.0-0 libinput-bin libinput10
  libinstp

In [2]:
import os
import copy
import pickle
import secrets
import statistics
from time import time
import tqdm
import torch

In [3]:
%cd /content/Tiny-Music-Transformer

import TMIDIX

from midi_to_colab_audio import midi_to_colab_audio

from x_transformer_1_23_2 import *

import random

%cd /content/

import matplotlib.pyplot as plt
from torchsummary import summary
from sklearn import metrics

from IPython.display import Audio, display
from google.colab import files

/content/Tiny-Music-Transformer
/content


In [4]:
# Create the dataset folder. exist_ok=True makes the call a no-op when the
# folder is already there, so the cell is safe to re-run and has no
# check-then-create race.
os.makedirs('/content/Dataset', exist_ok=True)


In [9]:
# ---- User-selectable options ------------------------------------------------
# Read by the later cells that pick the checkpoint, set up mixed precision
# and (optionally) plot the token embeddings.
select_model_to_load = "139M-32L-Very-Fast-Tiny"
model_precision = "bfloat16"
plot_tokens_embeddings = "None"

# ---- Locations inside the cloned repository ---------------------------------
full_path_to_models_dir = "/content/Tiny-Music-Transformer/Model/"
training_data_maestro = '/content/Tiny-Music-Transformer/Processed_MIDIs'
training_data_PV = '/content/Tiny-Music-Transformer/Processed_MIDIs_PV'

# ---- Load both pickled corpora ----------------------------------------------
print('Loading Tiny Music Transformer Training Data...', 'Please wait...', sep='\n')

maestro = TMIDIX.Tegridy_Any_Pickle_File_Reader(training_data_maestro)
PV = TMIDIX.Tegridy_Any_Pickle_File_Reader(training_data_PV)

Loading Tiny Music Transformer Training Data...
Please wait...
Tegridy Pickle File Loader
Loading the pickle file. Please wait...
Tegridy Pickle File Loader
Loading the pickle file. Please wait...


In [12]:
def to_ints(midis, seq_len=8192, pad_idx=643):
  """Encode processed compositions as fixed-length token lists.

  Every composition in ``midis`` is a sequence of note records that are read
  positionally: field [0] is emitted as-is (skipped when it is 0), field [1]
  is emitted offset by 128, field [2] is the MIDI channel and field [3] is
  emitted offset by 256 (plus 128 for the second instrument). The rendering
  cell of this notebook decodes 0-127 as time steps, 128-255 as durations and
  256-511 as instrument/pitch.

  Sequence layout::

      642, 512 + flag(first note), 514 + first_note[3], 0,
      then for each note: [note[0] if non-zero], note[1] + 128,
                          flag * 128 + note[3] + 256

  ``flag`` is 1 for channel 3 and 0 for channel 0. Any other channel keeps the
  flag of the previous note; the flag starts at 0 for every composition, so it
  can never be undefined or inherited from the previous composition.

  Each sequence is truncated or right-padded with ``pad_idx`` to exactly
  ``seq_len + 1`` tokens. Empty compositions are skipped. The resulting list
  is shuffled in place with ``random.shuffle`` before it is returned.

  Args:
    midis: iterable of compositions (sequences of note records).
    seq_len: target sequence length; outputs hold ``seq_len + 1`` tokens.
    pad_idx: token id used for right-padding.

  Returns:
    A shuffled list of token lists, each of length ``seq_len + 1``.
  """

  train_data = []

  for m in tqdm.tqdm(midis):

    # A composition without notes has no first note to build the header from.
    if not m:
      continue

    # Per-composition default, so the instrument flag is always defined.
    cha = 1 if m[0][2] == 3 else 0

    # Header: start marker, instrument of the first note, field [3] of the
    # first note, and a zero time token.
    dat = [642, 512+cha, m[0][3]+514, 0]

    for mm in m:

      if mm[2] == 0:
        cha = 0
      elif mm[2] == 3:
        cha = 1

      # A zero time delta is not emitted at all.
      if mm[0] != 0:
        dat.append(mm[0])

      dat.extend([mm[1]+128, ((cha * 128) + mm[3])+256])

    # Clip long pieces, then pad short ones up to seq_len + 1 tokens.
    dat = dat[:seq_len+1]
    dat += [pad_idx] * (seq_len+1 - len(dat))

    train_data.append(dat)

  random.shuffle(train_data)

  print('Done!')

  return train_data



In [13]:
# Tokenize both corpora (`maestro` first, then `PV`) into one combined list.
data = [*to_ints(maestro), *to_ints(PV)]

100%|██████████| 1276/1276 [00:03<00:00, 410.79it/s]


Done!


100%|██████████| 7823/7823 [00:07<00:00, 1064.10it/s]

Done!





In [23]:
# Let matmul and cuDNN kernels use TF32.
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
device_type = 'cuda'

# Resolve the requested precision:
#   'float32'  -> float32 (previously unreachable: it silently became float16)
#   'bfloat16' -> bfloat16 when the GPU supports it, otherwise float16
#   anything else (including 'float16') -> float16
if model_precision == 'float32':
  dtype = 'float32'
elif model_precision == 'bfloat16' and torch.cuda.is_bf16_supported():
  dtype = 'bfloat16'
else:
  dtype = 'float16'

ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]

# Autocast only applies to the reduced-precision dtypes; for float32 the
# context is created disabled so the model runs in its native precision.
ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype, enabled=(dtype != 'float32'))

# Maximum sequence length the model is built with (max_seq_len).
SEQ_LEN = 8192

In [29]:
print('Loading Tiny Music Transformer', select_model_to_load,'Pre-Trained Model...')
print('Please wait...')
print('=' * 70)

# Resolve the checkpoint path for the selected model. An unknown selection is
# rejected here instead of leaving `model_path` undefined for the next cell.
if select_model_to_load == '139M-32L-Very-Fast-Tiny':
  model_path = '/content/model_checkpoint_762_steps_1.5267_loss_0.5058_acc.pth'
else:
  raise ValueError('Unknown model selection: ' + repr(select_model_to_load))

# Fail early with a clear message rather than later inside torch.load().
if os.path.isfile(model_path):
  print('Model already exists...')
else:
  raise FileNotFoundError('Model checkpoint not found: ' + model_path)

Loading Tiny Music Transformer 139M-32L-Very-Fast-Tiny Pre-Trained Model...
Please wait...
Model already exists...


In [30]:
# Build the network in one expression: a 32-layer decoder over a 644-token
# vocabulary, wrapped for autoregressive use. Token 643 is the ignore index
# (the same value to_ints uses for padding).
model = AutoregressiveWrapper(
    TransformerWrapper(
        num_tokens = 644,
        max_seq_len = SEQ_LEN,
        attn_layers = Decoder(dim = 512, depth = 32, heads = 16, attn_flash = True),
    ),
    ignore_index = 643,
)

model.cuda()

# Restore the weights from the checkpoint chosen in the previous cell.
print('=' * 70, 'Loading model checkpoint...', sep='\n')
model.load_state_dict(torch.load(model_path))

print('=' * 70)

# Switch to evaluation mode; this notebook only runs generation.
model.eval()

print('Done!', '=' * 70, sep='\n')

print('Model will use', dtype, 'precision...')
print('=' * 70)

# Model stats
print('Model summary...')
summary(model)

# Plot Token Embeddings
# Each supported `plot_tokens_embeddings` choice maps to the [start, end)
# token-id range whose embedding rows are compared.
TOKEN_EMBEDDING_PLOT_RANGES = {
    'Start Times': [0, 128],
    'Durations Velocities': [128, 256],
    'Piano Pitches': [256, 384],
    'Violin Pitches': [384, 512],
    'Aux': [512, 643],
}

if plot_tokens_embeddings != 'None':

  # Reject unknown choices up front; previously they left `tok_range`
  # undefined and surfaced as a NameError further down.
  if plot_tokens_embeddings not in TOKEN_EMBEDDING_PLOT_RANGES:
    raise ValueError(
        'Unknown plot_tokens_embeddings value: ' + repr(plot_tokens_embeddings)
        + '. Expected "None" or one of ' + str(list(TOKEN_EMBEDDING_PLOT_RANGES))
    )

  tok_range = TOKEN_EMBEDDING_PLOT_RANGES[plot_tokens_embeddings]

  # Token embedding matrix as nested Python lists, then the selected rows.
  tok_emb = model.net.token_emb.emb.weight.detach().cpu().tolist()
  tok_emb1 = tok_emb[tok_range[0]:tok_range[1]]

  # Note: despite the variable name, pairwise_distances with metric='cosine'
  # returns cosine *distances* between the selected embedding rows.
  cos_sim = metrics.pairwise_distances(
    tok_emb1, metric='cosine'
  )
  plt.figure(figsize=(7, 7))
  plt.imshow(cos_sim, cmap="inferno", interpolation="nearest")
  im_ratio = cos_sim.shape[0] / cos_sim.shape[1]
  plt.colorbar(fraction=0.046 * im_ratio, pad=0.04)
  plt.xlabel("Position")
  plt.ylabel("Position")
  plt.tight_layout()
  # Argument-less plot() adds no lines; retained from the original cell.
  plt.plot()
  plt.savefig("/content/Tiny-Music-Transformer-Tokens-Embeddings-Plot.png", bbox_inches="tight")


Loading model checkpoint...
Done!
Model will use float16 precision...
Model summary...
Layer (type:depth-idx)                        Param #
├─TransformerWrapper: 1-1                     --
|    └─TokenEmbedding: 2-1                    --
|    |    └─Embedding: 3-1                    329,728
|    └─AbsolutePositionalEmbedding: 2-2       --
|    |    └─Embedding: 3-2                    4,194,304
|    └─Identity: 2-3                          --
|    └─Dropout: 2-4                           --
|    └─Identity: 2-5                          --
|    └─Decoder: 2-6                           --
|    |    └─ModuleList: 3-3                   134,365,184
|    |    └─LayerNorm: 3-4                    1,024
|    └─Linear: 2-7                            330,372
Total params: 139,220,612
Trainable params: 139,220,612
Non-trainable params: 0


In [None]:
#@markdown Generation settings

# Number of leading tokens taken from a randomly chosen training sequence and used as the prompt.
number_of_prime_tokens = 512 # @param {type:"slider", min:3, max:2048, step:3}
# Number of new tokens requested from model.generate().
number_of_tokens_to_generate = 1024 # @param {type:"slider", min:30, max:8190, step:3}
# Number of copies of the same prompt generated (and rendered) in one batch.
number_of_batches_to_generate = 8 #@param {type:"slider", min:1, max:16, step:1}
# Sampling temperature passed to model.generate().
temperature = 0.9 # @param {type:"slider", min:0.1, max:1, step:0.05}

#@markdown Other settings
# Passed to model.generate() as return_prime.
include_prime_tokens_in_generated_output = True #@param {type:"boolean"}
# When True, each rendered MIDI file is also converted to audio and displayed.
render_MIDI_to_audio = True # @param {type:"boolean"}


In [None]:
# Pick one training sequence at random as the source of the prompt.
# random.choice is already uniform, so `data` is no longer reshuffled in place
# (that only mutated shared notebook state on every run of this cell).
melody_chords = random.choice(data)

# to_ints right-pads short pieces with token 643 (also the model's
# ignore_index). Strip it so a short piece never primes the model with padding.
pad_token = 643
outy = [tok for tok in melody_chords if tok != pad_token][:number_of_prime_tokens]

torch.cuda.empty_cache()

# The same prompt is repeated once per requested batch item.
inp = [outy] * number_of_batches_to_generate

inp = torch.LongTensor(inp).cuda()

# Run generation under the autocast context configured earlier.
with ctx:
  out = model.generate(inp,
                        number_of_tokens_to_generate,
                        temperature=temperature,
                        return_prime=include_prime_tokens_in_generated_output,
                        verbose=True)

# Plain nested lists: one token list per batch item, consumed by the rendering code below.
out0 = out.tolist()

torch.cuda.empty_cache()

print('=' * 70)
print('Done!')
print('=' * 70)

#======================================================================
# Decode every generated token sequence into notes, write it out as a MIDI
# file, optionally render it to audio, and plot it.
print('Rendering results...')

for i in range(number_of_batches_to_generate):

  print('=' * 70)
  print('Batch #', i)
  print('=' * 70)

  out1 = out0[i]

  print('Sample INTs', out1[:12])
  print('=' * 70)

  # Check this batch item (the original tested the whole batch tensor).
  if len(out1) != 0:

      song = out1
      song_f = []

      # Named abs_time rather than `time`, so the `time` function imported at
      # the top of the notebook is not overwritten by an int.
      abs_time = 0
      dur = 0
      vel = 90
      pitch = 0
      channel = 0

      # Channel 3 gets patch 40 (Violin in General MIDI, 0-based numbering);
      # every other channel keeps patch 0.
      patches = [0] * 16
      patches[3] = 40

      for ss in song:

          # The three token ranges are disjoint; tokens >= 512 are ignored.
          if 0 <= ss < 128:

              # Time token: advance the running start time.
              abs_time += (ss * 16)

          elif 128 <= ss < 256:

              # Duration token: applies to the notes that follow.
              dur = (ss-128) * 16

          elif 256 <= ss < 512:

              # Pitch token: the upper half of the range selects the second
              # instrument (channel 3), the lower half channel 0.
              pitch = (ss-256) % 128

              if (ss-256) // 128 == 1:
                channel = 3
                vel = 110
              else:
                channel = 0
                vel = 80

              song_f.append(['note', abs_time, dur, channel, pitch, vel, patches[channel] ])

      # Output path without extension; '.mid' is appended when the file is read back below.
      fname = '/content/Tiny-Music-Transformer-Music-Composition_'+str(i)

      detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(song_f,
                                                                output_signature = 'Tiny Music Transformer',
                                                                output_file_name = fname,
                                                                track_name='Project Los Angeles',
                                                                list_of_MIDI_patches=patches
                                                                )
      print('=' * 70)
      print('Displaying resulting composition...')
      print('=' * 70)

      if render_MIDI_to_audio:
        midi_audio = midi_to_colab_audio(fname + '.mid')
        display(Audio(midi_audio, rate=16000, normalize=False))

      TMIDIX.plot_ms_SONG(song_f, plot_title=fname)

Generating sequence of max length: 1024
0 / 1024
32 / 1024
64 / 1024
96 / 1024
128 / 1024
160 / 1024
192 / 1024
224 / 1024
256 / 1024
288 / 1024
320 / 1024
352 / 1024
384 / 1024
416 / 1024
448 / 1024
480 / 1024
512 / 1024
544 / 1024
576 / 1024
608 / 1024
640 / 1024
672 / 1024
704 / 1024
736 / 1024
768 / 1024
800 / 1024
832 / 1024
864 / 1024
896 / 1024
928 / 1024
960 / 1024
992 / 1024
Done!
Rendering results...
Batch # 0
Sample INTs [642, 512, 562, 0, 159, 304, 31, 159, 316, 31, 143, 304]
Converting to MIDI. Please stand-by...
Done! Enjoy! :)
Displaying resulting composition...
