In [None]:
# Mount Google Drive at /content/drive; the dataset archive unzipped in the next
# cell is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
path_to_zip_file = '/content/drive/MyDrive/uni/Project/data/pretraining_dataset_50k.zip'
extraction_path = '/content/'

!unzip -u "$path_to_zip_file" -d "$extraction_path"

Archive:  /content/drive/MyDrive/uni/Project/data/pretraining_dataset_50k.zip


In [None]:
import torch
if torch.cuda.is_available():
    !pip install -q bitsandbytes
    !pip install -q -U flash-attn --no-build-isolation

!pip install -q miditok symusic
!pip install -q accelerate
!pip install -q evaluate
!pip install -q huggingface_hub
!pip install -q datasets
!pip install -q pretty_midi
!pip install -q wandb

In [None]:
import os
import sys
import torch

from matplotlib import pyplot as plt
import pandas as pd
import IPython.display as ipd
import glob
import numpy as np
import ast
import random

import miditok
from miditok import REMI, TokenizerConfig
from miditok.pytorch_data import DatasetMIDI, DataCollator
from pathlib import Path
from symusic import Score
import pretty_midi

from torch.cuda import is_available as cuda_available, is_bf16_supported
from torch.backends.mps import is_available as mps_available
from torch.utils.data import DataLoader
from datasets import load_dataset, DatasetDict, load_metric

from transformers import AutoModelForCausalLM, MistralConfig, Trainer, TrainingArguments, GenerationConfig, BitsAndBytesConfig, AutoModel, AutoTokenizer
from tqdm import tqdm
from evaluate import load as load_metric
import IPython.display as ipd

import wandb

In [None]:
# Log the Weights & Biases CLI in; the training arguments further down set report_to="wandb".
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mtheglassofwater[0m ([33mtheglassofwaterr[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Collect every extracted MIDI file.
# The listing is sorted because directory iteration order is not guaranteed to be
# stable between runtimes; a deterministic order is required for the seeded shuffle
# (used for the train/eval/test split) to yield the same split in every session.
midi_files = sorted(Path("/content/pretraining_dataset_2/").glob("*.mid"))

# Pretrained REMI tokenizer pulled from the Hugging Face Hub.
tokenizer = miditok.REMI.from_pretrained("theglassofwater/remi_12500")

print(len(midi_files))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


50000


  self.config = TokenizerConfig()
  return cls(**input_dict, **kwargs)


In [None]:
# Architecture description for the model.
# NOTE: `model_config` is only consumed by the commented-out `from_config` line below;
# the model that is actually trained is loaded from a Hub checkpoint and carries its own config.
model_config = MistralConfig.from_pretrained(
    "theglassofwater/mistral_pretraining_1",
    vocab_size=len(tokenizer),
    hidden_size=512,
    intermediate_size=2048,
    num_hidden_layers=12,
    num_attention_heads=8,
    num_key_value_heads=4,
    sliding_window=256,
    max_position_embeddings=2048,
    pad_token_id=tokenizer["PAD_None"],
    # The config attributes are `bos_token_id` / `eos_token_id` (singular); the previous
    # `*_ids` spelling was not recognised as the special-token settings.
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

quantization_config = BitsAndBytesConfig( # when loading a model for inference only
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
)

# model = AutoModelForCausalLM.from_config(model_config)#, torch_dtype = torch.float16, attn_implementation = "flash_attention_2") # , quantization_config=True)

# Continue training from a previously pushed checkpoint.
model = AutoModelForCausalLM.from_pretrained("theglassofwater/mistral_pretraining_4.6ksteps_22batch")
param_count = sum(i.numel() for i in model.parameters())
print(f"Parameter count: {param_count} = {param_count/1e6:.0f}M" )
model



Parameter count: 59998720 = 60M


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(12500, 512, padding_idx=0)
    (layers): ModuleList(
      (0-11): 12 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=256, bias=False)
          (v_proj): Linear(in_features=512, out_features=256, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=512, out_features=2048, bias=False)
          (up_proj): Linear(in_features=512, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm(

In [None]:
seq_len = 4096

# Arguments shared by the dataset splits.
kwargs_dataset = {
    "max_seq_len": seq_len+1,  # max_seq_len = start + seq_len + end
    "tokenizer": tokenizer,
    "bos_token_id": tokenizer["BOS_None"],
    "eos_token_id": tokenizer["EOS_None"],
    }

# Shuffle a *copy* with a fixed seed. Shuffling `midi_files` in place would make this
# cell non-idempotent: re-running it would reshuffle an already shuffled list and move
# files between the train/eval/test splits. `midi_files` itself is left untouched.
shuffled_files = list(midi_files)
random.Random(4).shuffle(shuffled_files)

# 85% train / 10% eval / 5% test boundaries.
train_end = int(len(shuffled_files)*0.85)
eval_end = int(len(shuffled_files)*0.95)

train_dataset = DatasetMIDI(files_paths=shuffled_files[:train_end], **kwargs_dataset)

eval_dataset = DatasetMIDI(files_paths=shuffled_files[train_end:eval_end], **kwargs_dataset)

# The test split uses a much larger max_seq_len than the other two splits.
test_dataset = DatasetMIDI(
    files_paths=shuffled_files[eval_end:],
    **{**kwargs_dataset, "max_seq_len": 100_000},
)

# Pads each batch and copies the inputs as labels.
collator = DataCollator(
    pad_token_id=tokenizer["PAD_None"],
    copy_inputs_as_labels=True,
)

data_loader = DataLoader(dataset=train_dataset, batch_size=16, collate_fn=collator,)

In [None]:
# Pick the mixed-precision mode from the available hardware:
#   no CUDA            -> full precision
#   CUDA with bf16     -> bf16
#   CUDA without bf16  -> fp16
USE_CUDA = cuda_available()
BF16 = BF16_EVAL = USE_CUDA and is_bf16_supported()
FP16 = FP16_EVAL = USE_CUDA and not BF16
USE_MPS = not USE_CUDA and mps_available()

# Metric objects keyed by name; `compute_metrics` looks up "accuracy" here.
metrics = {name: load_metric(name) for name in ("accuracy",)}


def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds the token
            ids produced by ``preprocess_logits`` and ``labels`` the reference token
            ids, both indexed as ``[..., position]``. Positions whose label is
            ``-100`` are ignored.

    Returns:
        The result of ``metrics["accuracy"].compute`` on the aligned, unmasked tokens.
    """
    predictions, labels = eval_pred
    # The prediction made at position t is for the token at position t + 1, while the
    # labels are a plain copy of the inputs. Drop the last prediction and the first
    # label so both arrays refer to the same target tokens; comparing them unshifted
    # scores the model against the token it has just read.
    predictions, labels = predictions[..., :-1], labels[..., 1:]
    not_pad_mask = labels != -100
    labels, predictions = labels[not_pad_mask], predictions[not_pad_mask]
    return metrics["accuracy"].compute(predictions=predictions.flatten(), references=labels.flatten())

def preprocess_logits(logits, labels=None):
    """Collapse logits to predicted ids by taking the argmax over the last axis.

    ``labels`` is accepted but not used.
    """
    return logits.argmax(dim=-1)  # long dtype


training_config = TrainingArguments(
    # --- run / bookkeeping ---
    output_dir="hello",
    seed=444,
    report_to="wandb",
    log_level="debug",
    logging_strategy="steps",
    logging_steps=20,
    # --- what to run ---
    do_train=True,
    do_eval=True,
    max_steps=400,
    # num_train_epochs=5,
    # --- batching ---
    per_device_train_batch_size=22,
    per_device_eval_batch_size=6,
    gradient_accumulation_steps=3,
    eval_accumulation_steps=None,
    gradient_checkpointing=True,
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=5e-5,
    weight_decay=0.01,
    max_grad_norm=3.0,
    label_smoothing_factor=0.,
    lr_scheduler_type="constant",
    # NOTE(review): warmup_ratio presumably has no effect with the plain "constant"
    # schedule — confirm, and use "constant_with_warmup" if a warmup is intended.
    warmup_ratio=0.3,
    # --- evaluation / checkpointing ---
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=5,
    load_best_model_at_end=True,
    # --- device / precision (flags derived from the detected hardware) ---
    no_cuda=not USE_CUDA,
    fp16=FP16,
    fp16_full_eval=FP16_EVAL,
    bf16=BF16,
    bf16_full_eval=BF16_EVAL,
)

In [None]:
# Interactive Hugging Face Hub login; later cells call push_to_hub.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Assemble the Trainer from the model, training arguments, datasets and metric hooks.
trainer = Trainer(
    model=model,
    args=training_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collator,
    preprocess_logits_for_metrics=preprocess_logits,
    compute_metrics=compute_metrics,
)

# Run training, then upload the resulting model to the Hub.
train_result = trainer.train()
model.push_to_hub("mistral_pretraining_6ksteps_22batch")

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
Currently training with a batch size of: 22
***** Running training *****
  Num examples = 42,500
  Num Epochs = 1
  Instantaneous batch size per device = 22
  Total train batch size (w. parallel, distributed & accumulation) = 66
  Gradient Accumulation steps = 3
  Total optimization steps = 400
  Number of trainable parameters = 59,998,720
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mtheglassofwater[0m ([33mtheglassofwaterr[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Accuracy
100,2.303,2.368414,0.000946
200,2.3115,2.352676,0.000998
300,2.3024,2.333396,0.000955
400,2.3077,2.318433,0.00096


***** Running Evaluation *****
  Num examples = 5000
  Batch size = 6
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 6
Saving model checkpoint to hello/checkpoint-200
Configuration saved in hello/checkpoint-200/config.json
Configuration saved in hello/checkpoint-200/generation_config.json
Model weights saved in hello/checkpoint-200/model.safetensors
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 6
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 6
Saving model checkpoint to hello/checkpoint-400
Configuration saved in hello/checkpoint-400/config.json
Configuration saved in hello/checkpoint-400/generation_config.json
Model weights saved in hello/checkpoint-400/model.safetensors
Deleting older checkpoint [hello/checkpoint-200] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from hello/checkpoint-400 (score: 2.3184328079223633).
Configuration sa

model.safetensors:   0%|          | 0.00/240M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/theglassofwater/mistral_pretraining_6ksteps_22batch/commit/cfbec2a2d1a9bcba9dc3a9e3e3a7af0cb311f057', commit_message='Upload MistralForCausalLM', commit_description='', oid='cfbec2a2d1a9bcba9dc3a9e3e3a7af0cb311f057', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Display the value returned by trainer.train().
train_result

TrainOutput(global_step=400, training_loss=2.3280169677734377, metrics={'train_runtime': 11792.2207, 'train_samples_per_second': 2.239, 'train_steps_per_second': 0.034, 'total_flos': 3.467376492281856e+16, 'train_loss': 2.3280169677734377, 'epoch': 0.6211180124223602})

In [None]:
# NOTE(review): this pushes the current in-memory model to the same repository the
# notebook loads its starting weights from ("mistral_pretraining_4.6ksteps_22batch").
# When run after the training cell this replaces that checkpoint with the further-trained
# weights — confirm this is intended before re-running the notebook top to bottom.
model.push_to_hub("mistral_pretraining_4.6ksteps_22batch")

Configuration saved in /tmp/tmp2h1guwld/config.json
Configuration saved in /tmp/tmp2h1guwld/generation_config.json
Model weights saved in /tmp/tmp2h1guwld/model.safetensors
Uploading the following files to theglassofwater/mistral_pretraining_4.6ksteps_22batch: README.md,model.safetensors,config.json,generation_config.json


model.safetensors:   0%|          | 0.00/240M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/theglassofwater/mistral_pretraining_4.6ksteps_22batch/commit/539c90632701d9d36841854081c91f647bff3656', commit_message='Upload MistralForCausalLM', commit_description='', oid='539c90632701d9d36841854081c91f647bff3656', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Peek at the first 100 token ids of the first evaluation sample.
eval_dataset[0]["input_ids"][:100]

tensor([    1,     4,   201,   393,    43,   484,  3435,  6974,  6365,   484,
         4543,  4506,  3435,   562,  5922,   562,   726,   441,   854,   829,
          209,  3435,   484,  5922,   484,  1275,   481,   880,  9768,   481,
          654, 10240,  5922,   484,  4459,  3933,  4543,   484,  4431,   484,
          880,   441,   769,   829,   215,  3435,   484,  5922,  3119,  7550,
          484,  4543,   484,  1275,   481,   880,  9768,   481,   654, 10506,
         6365,   484,  3435,   484,   416,  3435,   484,  5922,   484,   726,
          441,   854,   829,   191,  6365,   484,  3435,  4628,  3435,   484,
         5922,   484,  1275,   481,   880,  9768,   481,   654, 11208,  4543,
          484,  4431,  3970,  3435,   562,  5922,   562,   880,   441,   769])

In [None]:
def play_midi(midi_path, sr=22050): # notebook only
    """Synthesize a MIDI source and return an inline audio player.

    Args:
        midi_path: One of
            * a ``pretty_midi.PrettyMIDI`` object, which is synthesized directly;
            * an object exposing ``dump_midi`` (e.g. a symusic score), which is first
              written to ``data/holder.mid``;
            * a path to a MIDI file on disk.
        sr: Sampling rate used both for synthesis and for playback.

    Returns:
        An ``IPython.display.Audio`` widget holding the synthesized audio.
    """
    if isinstance(midi_path, pretty_midi.PrettyMIDI):
        midi_data = midi_path
    else:
        if hasattr(midi_path, "dump_midi"):
            # Score objects are round-tripped through a scratch file. Create its
            # directory first: writing fails if "data/" does not exist yet.
            holder_path = "data/holder.mid"
            os.makedirs(os.path.dirname(holder_path), exist_ok=True)
            midi_path.dump_midi(holder_path)
            midi_path = holder_path
        midi_data = pretty_midi.PrettyMIDI(os.fspath(midi_path))
    audio_data = midi_data.synthesize(fs=sr)
    return ipd.Audio(audio_data, rate=sr)


model.eval()

# Unconditional generation is seeded with a single BOS token. The id is looked up from
# the tokenizer instead of being hard-coded as 1.
random_generation_input = torch.tensor([[tokenizer["BOS_None"]]])

generation_config = GenerationConfig(
    max_new_tokens=200,
    min_new_tokens=25,
    do_sample=True,
    temperature=0.9,
    pad_token_id=tokenizer.pad_token_id,
)

# Move the prompt to whichever device the model is on, so the cell also runs without
# a GPU, and avoid binding a variable named `input`, which shadows the builtin.
x = model.generate(random_generation_input.to(model.device), generation_config=generation_config)

In [None]:
# Turn the first generated sequence back into a score, after moving it to the CPU.
song = tokenizer.decode(x[0].cpu())

In [None]:
# Write the decoded score to disk, print its summary, and play it back inline.
file_name = "model_output.mid"
song.dump_midi(file_name)
print(song)
# Last expression of the cell, so the returned audio player is displayed.
play_midi(file_name)

Score(ttype=Tick, tpq=8, begin=0, end=150, tracks=1, notes=122, time_sig=1, key_sig=0, markers=0, lyrics=0)


In [None]:
# metrics["accuracy"]

In [None]:
# Persist the trained model, the training metrics and the trainer state, then upload.
trainer.save_model()  # NOTE: no tokenizer was passed to the Trainer, so only the model files are saved here
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
trainer.push_to_hub()

Saving model checkpoint to hello
Configuration saved in hello/config.json
Configuration saved in hello/generation_config.json
Model weights saved in hello/model.safetensors


***** train metrics *****
  epoch                    =     2.2573
  total_flos               = 29343311GF
  train_loss               =     6.9658
  train_runtime            = 2:32:47.54
  train_samples_per_second =      5.236
  train_steps_per_second   =      0.109


Saving model checkpoint to hello
Configuration saved in hello/config.json
Configuration saved in hello/generation_config.json
Model weights saved in hello/model.safetensors
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.001780221439528523}]}


events.out.tfevents.1715956024.30fa686d4bca.5061.0:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/240M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/theglassofwater/hello/commit/7d8a19e630ffcba5b5b673c4edfd53dd2e7a235b', commit_message='End of training', commit_description='', oid='7d8a19e630ffcba5b5b673c4edfd53dd2e7a235b', pr_url=None, pr_revision=None, pr_num=None)