In [1]:
# pytorch
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F
from pl_model import Text_Mmamba_pl
import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
# others
from glob import glob
import numpy as np
import os
import json
from tqdm import tqdm
import math
import argparse
from transformers import T5EncoderModel, T5Tokenizer
from dataloader import *
import warnings
# Hide every FutureWarning-category warning so it does not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
# Exported to the process environment as the last statement of the import cell;
# presumably silences the HuggingFace tokenizers fork/parallelism warning once
# DataLoader workers are started — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def cal_torch_model_params(model):
    '''
    Count a model's parameters and report the totals in millions.

    :param model: object exposing ``parameters()``; every yielded item must
        provide ``numel()`` and a ``requires_grad`` attribute
    :return: dict with ``total_params`` (all parameters) and
        ``total_trainable_params`` (only those with ``requires_grad`` set),
        each being the raw element count divided by 1,000,000
    '''
    n_all = 0
    n_trainable = 0
    for param in model.parameters():
        size = param.numel()
        n_all += size
        # Only parameters that receive gradients count as trainable.
        if param.requires_grad:
            n_trainable += size
    return {'total_params': n_all/1000000, 'total_trainable_params': n_trainable/1000000}

In [3]:
from types import SimpleNamespace

# Run options, held in an attribute-style namespace (the file imports argparse
# but this notebook sets the values by hand). The comments below describe how
# the setup and data cells of this notebook read each field.
opt = SimpleNamespace(
    device='cuda',  # copied into `device` by the setup cell; the trainer itself hard-codes accelerator "gpu"
    project_name='new_project',  # run name: sub-folder of ckpt_save_path and config['training']['name']
    precision='bf16',  # trainer precision for a fresh run (the resume branch hard-codes 'bf16')
    root_path='/mnt/gestalt/home/lonian/datasets/Jamendo',  # passed to Jamendo_Dataset(root_path=...); machine-specific absolute path
    ckpt_save_path='./ckpts',  # parent directory under which the per-run folder is created
    
    model_type='mamba',  # 'transformer' -> self-attention on layers 0..23, 'hybrid' -> [0, 1, 2, 21, 22, 23], anything else -> none; 'mamba' also sets is_pure_mamba
    layer_num=24,  # stored as config['model']['layers']
    d_state=512,  # stored as config['model']['d_state']
    is_incontext=False,  # stored as config['model']['is_incontext'] and forwarded to the dataset
    
    batch=4,  # DataLoader batch size (config['training']['batch'])
    accumulation_step=32,  # trainer accumulate_grad_batches
    codec_layer=1,  # stored as config['model']['codec_layer'] and forwarded to the dataset
    
    is_continue=False,  # False: build a new config; True: load config.json located from `ckpt`
    ckpt=None  # checkpoint path; only read when is_continue is True
)

In [4]:
import os


def model_to_dict(model):
    """Describe a module tree as nested dicts.

    :param model: object exposing ``named_children()`` (yielding
        ``(name, layer)`` pairs) whose layers expose ``children()``
    :return: dict keyed by child name; a layer without children maps to its
        class name (str), a layer with children maps to the dict produced by
        recursing into it
    """
    model_dict = {}
    for name, layer in model.named_children():
        if list(layer.children()):
            # Container layer: record its sub-structure instead of its type name.
            model_dict[name] = model_to_dict(layer)
        else:
            # Leaf layer: record the type name only.
            model_dict[name] = layer.__class__.__name__
    return model_dict


def _make_trainer_params(precision, max_epochs, root_dir):
    """Build the keyword arguments for ``L.Trainer`` shared by fresh and resumed runs.

    :param precision: value for the trainer's ``precision`` argument
    :param max_epochs: value for the trainer's ``max_epochs`` argument
    :param root_dir: value for the trainer's ``default_root_dir`` argument
    :return: dict meant to be unpacked as ``L.Trainer(**params)``
    """
    return {
        "precision": precision,
        "accumulate_grad_batches": opt.accumulation_step,
        "devices": 1,
        "accelerator": "gpu",
        "max_epochs": max_epochs,
        "log_every_n_steps": 1,
        "default_root_dir": root_dir,
        # save_top_k=-1 together with every_n_train_steps=500: write a checkpoint
        # every 500 training steps and keep all of them.
        'callbacks': [L.pytorch.callbacks.ModelCheckpoint(every_n_train_steps=500, save_top_k=-1)],
    }


if not opt.is_continue:
    ########################################################################
    # Fresh run: derive every setting from `opt` and persist it next to the
    # checkpoints so a later resume can rebuild the same model.
    EPOCH = 500
    start_epoch = 1
    BATCH = opt.batch
    project_name = opt.project_name
    max_grad_norm = 1
    device = opt.device
    accumulation_step = opt.accumulation_step
    dataset_type = 'Jamendo'

    # Indices of the layers that use self-attention.
    # NOTE(review): the index lists are written for 24 layers and do not follow
    # opt.layer_num — confirm before changing layer_num.
    if opt.model_type == 'transformer':
        layers = list(range(0, 24))
    elif opt.model_type == 'hybrid':
        layers = [0, 1, 2, 21, 22, 23]
    else:
        layers = []
    # Only the exact 'mamba' type is flagged as pure mamba; any other
    # unrecognised type simply gets no self-attention layers.
    is_pure_mamba = opt.model_type == 'mamba'

    config = {}
    config['training'] = {
        'name': project_name,
        'dataset': dataset_type,
        'epoch': EPOCH,
        'batch': BATCH,
        'accumulation_step': accumulation_step,
        'precision': opt.precision,
    }
    config['model'] = {
        'layers': opt.layer_num,
        # presumably 1024 codec tokens plus one extra special token — TODO confirm
        'vocab_size': 1024+1,
        'codec_layer': opt.codec_layer,
        'd_model': 1024,
        'drop_p': 0.3,
        'd_state': opt.d_state,
        'num_heads': 8,
        'self_atten_layers': layers,
        "is_incontext": opt.is_incontext,
        'is_pure_mamba': is_pure_mamba,
    }
    config['optimizer'] = {
        'optim_lr': 1e-4,
        'weight_decay': 0.02,
        # json.dump stores this tuple as a list, so a resumed run reads a list back.
        'betas': (0.9, 0.999),
    }
    config['scheduler'] = {
        'warmup_duration': 100,
        # NOTE(review): this is EPOCH * BATCH // accumulation_step (500 * 4 // 32 = 62
        # with the default options) and does not involve the dataset length —
        # confirm this is the horizon the scheduler in pl_model expects.
        'T_max': EPOCH * BATCH // accumulation_step
    }
    ########################################################################
    # Checkpoint folder: <ckpt_save_path>/<project_name>
    os.makedirs(opt.ckpt_save_path, exist_ok=True)
    ckpt_folder = os.path.join(opt.ckpt_save_path, project_name)
    os.makedirs(ckpt_folder, exist_ok=True)

    model = Text_Mmamba_pl(config)
    trainer_params = _make_trainer_params(config['training']['precision'], EPOCH, ckpt_folder)

    # Parameter counts (in millions) are stored alongside the training settings.
    config['training']['model_size'] = cal_torch_model_params(model)

    with open(os.path.join(ckpt_folder, 'config.json'), 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    model_structure = model_to_dict(model)
    with open(os.path.join(ckpt_folder, 'model_structure.json'), "w") as f:
        json.dump(model_structure, f, indent=4)
else:
    # Resume: rebuild the model from the config.json saved by the original run.
    if opt.ckpt is None:
        raise ValueError(
            "opt.is_continue is True but opt.ckpt is None; "
            "set opt.ckpt to the checkpoint file to resume from"
        )

    # The run folder is the checkpoint path with its last four '/'-separated
    # components removed (presumably <run>/lightning_logs/version_N/checkpoints/<file>
    # — TODO confirm the layout).
    config_path = os.path.join(opt.ckpt.rsplit('/', 4)[0], 'config.json')
    # config.json is written as UTF-8 with ensure_ascii=False, so read it as UTF-8.
    with open(config_path, encoding='utf-8') as f:
        config = json.load(f)

    ckpt_folder = os.path.join(opt.ckpt_save_path, config['training']['name'])
    os.makedirs(ckpt_folder, exist_ok=True)

    model = Text_Mmamba_pl(config)
    # Precision is pinned to 'bf16' on resume rather than read from
    # config['training']['precision'] (kept from the original behaviour).
    trainer_params = _make_trainer_params('bf16', config['training']['epoch'], ckpt_folder)
# torch.backends.cuda.enable_flash_sdp(True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Jamendo training set and its shuffled loader; codec_layer, is_incontext and
# the batch size are taken from the run config, the data root from `opt`.
train_data = Jamendo_Dataset(
    root_path=opt.root_path,
    codec_layer=config['model']['codec_layer'],
    is_incontext=config['model']['is_incontext'],
)
train_loader = DataLoader(
    dataset=train_data,
    batch_size=config['training']['batch'],
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

In [6]:
# Sanity check: pull a single collated batch from the loader and dump it.
# The "data shape: ..." lines in the recorded output are not printed by this
# cell; presumably they come from the dataset code running in the worker
# processes — TODO confirm in dataloader.py.
batch = next(iter(train_loader))
print(batch)

data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1data shape: (9, 2588) 
25881 
2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1data shape: (9, 2588) 2588

1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)data shape: (9, 2588)
1
1  2588
2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588data shape: (9, 2588)data shape: (9, 2588)
data shape: (9, 2588)


111   258825882588


data shape: (9, 2588)data shape: (9, 2588)
data shape: (9, 2588)
1
1 1 25882588 

2588
data shape: (9, 2588)
data shape: (9, 2588)1
 12588 
data shape: (9, 2588)2588

1 2588
data shape: (9, 2588)
1 data shape: (9, 2588)2588

1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
data shape: (9, 2588)
1 2588
[tensor([[[1024,  698,  869,  ...,  920,  925,  901]],

        [[1024, 

In [7]:
# Pop1k7 MIDI training set and its shuffled loader. This rebinds `train_data`
# and `train_loader`, replacing the Jamendo objects built earlier.
train_data = Pop1k7_MIDI_Dataset(
    root_path="/home/yihsin/dataset/pop1k7_cne",
    codec_layer=config['model']['codec_layer'],
    is_incontext=config['model']['is_incontext'],
)
train_loader = DataLoader(
    dataset=train_data,
    batch_size=config['training']['batch'],
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

In [8]:
# Sanity check: pull a single collated batch from the Pop1k7 loader and dump it.
# NOTE(review): the recorded output of this cell is an AttributeError raised in
# a DataLoader worker — Pop1k7_MIDI_Dataset.__getitem__ reads self.root_path,
# which the instance does not have. The fix presumably belongs in the dataset's
# __init__ in dataloader.py (not visible here) — TODO confirm.
batch = next(iter(train_loader))
print(batch)

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/yihsin/miniforge3/envs/mmpy11/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/yihsin/miniforge3/envs/mmpy11/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yihsin/miniforge3/envs/mmpy11/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/home/yihsin/mamba/dataloader.py", line 107, in __getitem__
    path = os.path.join(self.root_path, self.pieces[idx])
                        ^^^^^^^^^^^^^^
AttributeError: 'Pop1k7_MIDI_Dataset' object has no attribute 'root_path'
