In [1]:
from filter import get_unique_tokens

# Placeholder; assigned later, once a tokenizer is available to map notes to ids.
MUSIC_TOKEN_IDS = None

# Music-note token strings as returned by filter.get_unique_tokens().
MUSIC_NOTES = get_unique_tokens()

In [3]:
def get_music_token_ids(tokenizer, notes=None):
    """Convert music tokens to token IDs.

    Args:
        tokenizer: Object exposing ``vocab_size``, ``get_added_vocab()`` and
            ``convert_tokens_to_ids()``. ``unk_token`` / ``unk_token_id`` are
            consulted when the tokenizer provides them.
        notes: Optional iterable of token strings to resolve. When omitted,
            the module-level ``MUSIC_NOTES`` is used.

    Returns:
        A list of ``int`` IDs, in input order, for every note the tokenizer
        resolves to a usable ID. Notes that resolve to ``None``, to the
        unknown-token ID, or to an ID outside ``[0, vocab_size + added)``
        are left out and summarised in a printed warning.
    """
    if notes is None:
        notes = MUSIC_NOTES

    ids = []
    missing_tokens = []

    # Get the actual vocabulary size that includes added tokens.
    # NOTE(review): if get_added_vocab() also lists tokens that already live
    # inside the base vocabulary, this sum over-counts and the upper bound used
    # below is looser than intended; len(tokenizer) may be tighter — confirm.
    vocab_size = tokenizer.vocab_size + len(tokenizer.get_added_vocab())
    print(f"\nTokenizer vocab_size: {tokenizer.vocab_size}, added tokens: {len(tokenizer.get_added_vocab())}, total: {vocab_size}")

    # An unknown string is mapped to the unknown-token ID rather than None, so
    # a plain range check would accept it as if it were a real music token.
    unk_id = getattr(tokenizer, "unk_token_id", None)
    unk_token = getattr(tokenizer, "unk_token", None)

    for note in notes:
        tid = tokenizer.convert_tokens_to_ids(note)
        # The unknown token itself legitimately maps to unk_id; anything else
        # landing there was not found in the vocabulary.
        fell_back_to_unk = unk_id is not None and tid == unk_id and note != unk_token
        if tid is None or fell_back_to_unk or tid < 0 or tid >= vocab_size:
            missing_tokens.append((note, tid))
        else:
            ids.append(int(tid))
    if missing_tokens:
        print(f"[WARN] Some MUSIC_NOTES missing in tokenizer: {missing_tokens[:10]} (showing up to 10), total of {len(missing_tokens)}")
    return ids

In [4]:
from load_model_tokenizer import load_model_and_tokenizer

# Load the model together with its tokenizer, asking for the training setup.
model, tokenizer = load_model_and_tokenizer(for_training=True)

Loaded model in trining mode.....
Trainable params: 884,736 / 161,671,680 (0.55%)


In [None]:
# Resolve every music note to its vocabulary id using the loaded tokenizer.
MUSIC_TOKEN_IDS = get_music_token_ids(tokenizer)

# Show a bounded preview rather than echoing the whole list, which holds
# tens of thousands of ids and would swamp the cell output.
MUSIC_TOKEN_IDS[:20]

In [None]:
# Preview the note vocabulary next to the ids that were resolved for it.
note_preview = MUSIC_NOTES[:20]
print("Sample MUSIC_NOTES (first 20):", note_preview)
id_count = len(MUSIC_TOKEN_IDS)
print("MUSIC_TOKEN_IDS length:", id_count)
id_preview = MUSIC_TOKEN_IDS[:50]
print("MUSIC_TOKEN_IDS sample (first 50):", id_preview)

# Report the ids of the special tokens; unk_token_id is read defensively
# because the attribute is fetched with a None fallback.
pad_id = tokenizer.pad_token_id
unk_id = getattr(tokenizer, "unk_token_id", None)
print("pad_token_id:", pad_id, "unk_token_id:", unk_id)
mask_id = tokenizer.convert_tokens_to_ids("<MASK>")
print("mask token id (from '<MASK>'):", mask_id)


Sample MUSIC_NOTES (first 20): ['A1.A2.E3.G3_q', 'A1.A2.G1.G2_1/3', 'A1.A2_0.75', 'A1.A2_1.25', 'A1.A2_1.5', 'A1.A2_1/12', 'A1.A2_1/3', 'A1.A2_2.5', 'A1.A2_2/3', 'A1.A2_3.25', 'A1.A2_4/3', 'A1.A2_5/12', 'A1.A2_5/3', 'A1.A2_e', 'A1.A2_h', 'A1.A2_q', 'A1.A2_s', 'A1.A3.A3.A4_q', 'A1.A3.C#3_s', 'A1.A3_1.75']
MUSIC_TOKEN_IDS length: 27812
MUSIC_TOKEN_IDS sample (first 50): [50274, 50275, 50276, 50277, 50278, 50279, 50280, 50281, 50282, 50283, 50284, 50285, 50286, 50287, 50288, 50289, 50290, 50291, 50292, 50293, 50294, 50295, 50296, 50297, 50298, 50299, 50300, 50301, 50302, 50303, 50304, 50305, 50306, 50307, 50308, 50309, 50310, 50311, 50312, 50313, 50314, 50315, 50316, 50317, 50318, 50319, 50320, 50321, 50322, 50323]
pad_token_id: 1 unk_token_id: 3
mask token id (from '<MASK>'): 50269


In [None]:
import test_2

# Invoke main() from the test_2 module imported above.
test_2.main()

In [1]:
# Import only the two test entry points this cell runs instead of
# wildcard-importing everything the module exports into the notebook namespace.
from test_loss_function import (
    test_enhanced_loss_function,
    test_tokenizer_space_detection,
)

# First check how the tokenizer handles spaces, then exercise the loss function.
test_tokenizer_space_detection()
test_enhanced_loss_function()


🔍 TESTING SPACE TOKEN DETECTION
----------------------------------------
'G4_1.5 B-4_e' -> ['G4_1.5', 'Ġ', 'B-4_e'] -> [76932, 1437, 55117]
'G4_1.5  B-4_e' -> ['G4_1.5', 'Ġ', 'Ġ', 'B-4_e'] -> [76932, 1437, 1437, 55117]
' G4_1.5' -> ['Ġ', 'G4_1.5'] -> [1437, 76932]
'G4_1.5 ' -> ['G4_1.5', 'Ġ'] -> [76932, 1437]
Space variant ' ': ID = 3
Space variant 'Ġ': ID = 1437
Space variant 'Ä': ID = 649
Space variant 'Ä ': ID = 3
Space variant '▁': ID = 3
🧪 TESTING ENHANCED LOSS FUNCTION


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Loaded model in eval mode.....
✅ Found 27812 music token IDs
Initialized MusicTokenEnforcementLoss with 27812 valid music token IDs
Found space token: 'Ġ' (ID: 1437)
Space token ID: 1437
Special token: <|startofpiece|> (ID: 50265)
Special token: <|endofpiece|> (ID: 50266)
Special token: <TRACKS> (ID: 50267)
Special token: <TRACKSEP> (ID: 50268)
Special token: <NAME= (ID: 50270)
Special token: <BPM= (ID: 50271)
Special token: <DURATION_BEATS= (ID: 50272)
Special token: <DURATION_MINUTES= (ID: 50273)
Found 52 rest tokens
Always allowed tokens: 2
✅ Initialized loss function

📋 TEST CASE 1: Normal music sequence with spaces
------------------------------------------------------------
Input:  <|startofpiece|><NAME=Test><TRACKS><MASK> <MASK> <MASK><|endofpiece|>
Target: <|startofpiece|><NAME=Test><TRACKS>G4_1.5 B-4_e E-4_1.5<|endofpiece|>
Expected: Should allow music notes and spaces between them
Input tokens: torch.Size([1, 13])
Target tokens: torch.Size([1, 13])
Found 3 mask positions

🔍 T

True

In [1]:
from test_loss_function_2 import test_loss

# Run the test_loss() check imported from test_loss_function_2.
# NOTE(review): the output recorded for this cell reports "Found 0 music token IDs"
# and "Tokenizer vocab size: 50265" while the model vocab size is 78092 — it looks
# like the music tokens were not resolved in this run; confirm before trusting it.
test_loss()

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Loaded model in eval mode.....
Found 0 music token IDs
Tokenizer vocab size: 50265
Initialized MusicTokenEnforcementLoss with 0 valid music token IDs
Model vocab size: 78092
Found space token: 'Ġ' (ID: 1437)
Space token ID: 1437
Special token: <|startofpiece|> (ID: 50265)
Special token: <|endofpiece|> (ID: 50266)
Special token: <TRACKS> (ID: 50267)
Special token: <TRACKSEP> (ID: 50268)
Special token: <NAME= (ID: 50270)
Special token: <BPM= (ID: 50271)
Special token: <DURATION_BEATS= (ID: 50272)
Special token: <DURATION_MINUTES= (ID: 50273)
Special token: <MASK> (ID: 50269)
Found 52 rest tokens
Always allowed tokens: 2
Batch size: 1, Sequence length: 47, Vocab size: 78092
TESTING LOSS FUNCTION
Input: <|startofpiece|><NAME=Symphony No. 41 in C, K.551, Jupiter><BPM=150.0><DURATION_BEATS=49962.0><DURATION_MINUTES=46.26><TRACKS><mask> <TRACKSEP> StringInstrument_3: <mask><|endofpiece|>
Input IDs: [[0, 50265, 50270, 104, 36935, 6119, 440, 4, 3492, 11, 230, 6, 229, 4, 38749, 6, 21217, 15698, 

In [2]:
from diagnose_tokens import test_tokens

# Run test_tokens() imported from diagnose_tokens. The recorded output lists the
# special-token ids, the first 20 music-token ids, the vocabulary sizes and one
# encode/decode round trip.
test_tokens()

Loaded model in eval mode.....
Special tokens:
  <MASK>: 50269
  <|startofpiece|>: 50265
  <|endofpiece|>: 50266
  <TRACKS>: 50267
  <TRACKSEP>: 50268

Music tokens (first 20):
  A1.A2.E3.G3_q: 50274
  A1.A2.G1.G2_1/3: 50275
  A1.A2_0.75: 50276
  A1.A2_1.25: 50277
  A1.A2_1.5: 50278
  A1.A2_1/12: 50279
  A1.A2_1/3: 50280
  A1.A2_2.5: 50281
  A1.A2_2/3: 50282
  A1.A2_3.25: 50283
  A1.A2_4/3: 50284
  A1.A2_5/12: 50285
  A1.A2_5/3: 50286
  A1.A2_e: 50287
  A1.A2_h: 50288
  A1.A2_q: 50289
  A1.A2_s: 50290
  A1.A3.A3.A4_q: 50291
  A1.A3.C#3_s: 50292
  A1.A3_1.75: 50293

Tokenizer vocab_size: 50265
Added vocab size: 27826
Total vocab size: 78091

Test encoding: <MASK> C4_q D4_e
Encoded: [0, 50269, 1437, 60485, 1437, 63632, 2]
Decoded: <s><MASK> C4_q D4_e</s>
