In [1]:
import math

import numpy as np

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# import lightning as L

import torch.distributed as dist
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp


import huggingface_hub
import os
from dotenv import load_dotenv

from transformers import AutoConfig, T5Config
from transformers import AutoTokenizer, T5TokenizerFast
from transformers import DataCollatorWithPadding, DataCollatorForSeq2Seq
from transformers import AutoModel, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import TrainingArguments, Seq2SeqTrainingArguments
from transformers import Trainer, Seq2SeqTrainer
from transformers import pipeline

import datasets
from datasets import load_dataset, load_from_disk


from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
from mamba_ssm import Mamba
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from einops import rearrange


import tqdm as notebook_tqdm
from tqdm.auto import tqdm



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

In [3]:
raw_datasets = load_dataset("wikitext", "wikitext-103-v1")
raw_datasets["train"]


Dataset({
    features: ['text'],
    num_rows: 1801350
})

In [4]:
print(raw_datasets)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [5]:

# def setup(rank, world_size):
#     os.environ['MASTER_ADDR'] = 'localhost'
#     os.environ['MASTER_PORT'] = '12355'
#     dist.init_process_group("nccl", rank=rank, world_size=world_size)

# def prepare(rank, world_size, batch_size=32, pin_memory=False, num_workers=0):
#     dataset = dataset  # REFERENCE YOUR DATASET HERE!!
#     sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=False, drop_last=False)
    
#     dataloader = DataLoader(dataset, batch_size=batch_size, pin_memory=pin_memory, num_workers=num_workers, drop_last=False, shuffle=False, sampler=sampler)
    
#     return dataloader

# def cleanup():
#     dist.destroy_process_group()

In [6]:
# def main(rank, world_size):
#     # setup the process groups
#     setup(rank, world_size)
#     # prepare the dataloader
#     dataloader = prepare(rank, world_size)
    
#     # instantiate the model(it's your own model) and move it to the right device
#     model = model.to(rank)  # REFERENCE YOUR MODEL HERE!!
    
#     # wrap the model with DDP
#     # device_ids tell DDP where is your model
#     # output_device tells DDP where to output, in our case, it is rank
#     # find_unused_parameters=True instructs DDP to find unused output of the forward() function of any module in the model
#     model = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=True)

In [7]:
## Distributed Data Parallel DDP with PyTorch Lightning -- EXAMPLE CODE for reference
# for epoch in epochs:
#     # if we are using DistributedSampler, we have to tell it which epoch this is
#     dataloader.sampler.set_epoch(epoch)       
    
#     for step, x in enumerate(dataloader):
#         optimizer.zero_grad(set_to_none=True)
        
#         pred = model(x)
#         label = x['label']
        
#         loss = loss_fn(pred, label)
#         loss.backward()
#         optimizer.step()
# cleanup()

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
cuda_count = torch.cuda.device_count()
torch.cuda.empty_cache()
cpu_cores = mp.cpu_count()
print(device, cuda_count, f'cpu:{cpu_cores}')

cuda 2 cpu:4


In [9]:
# batch, length, dim = 2, 64, 16
# x = torch.randn(batch, length, dim).to("cuda")
# model_block_indep = Mamba(
#     # This module uses roughly 3 * expand * d_model^2 parameters
#     d_model=dim, # Model dimension d_model
#     d_state=16,  # SSM state expansion factor
#     d_conv=4,    # Local convolution width
#     expand=2,    # Block expansion factor
# ).to("cuda")
# y = model_block_indep(x)
# assert y.shape == x.shape

In [10]:
# tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
# model = MambaLMHeadModel.from_pretrained("state-spaces/mamba-2.8b-slimpj", device="cuda", dtype=torch.bfloat16)
# tokens = tokenizer("Once upon a time, a cat named", return_tensors="pt")
# input_ids = tokens.input_ids.to(device="cuda")
# max_length = input_ids.shape[1] + 80
# fn = lambda: model.generate(
#         input_ids=input_ids, max_length=max_length, cg=True,
#         return_dict_in_generate=True, output_scores=True,
#         enable_timing=False, temperature=0.9, top_k=40, top_p=0.9,)
# out = fn()
# print(tokenizer.decode(out[0][0]))
    

In [11]:
# print(model_block_indep)  

In [5]:
#### MAMBA MODEL Stuff ####

# model_checkpoint = "state-spaces/mamba-2.8b"
modelHead_checkpoint_mamba = "state-spaces/mamba-130m"
# tokenizer_checkpoint_mamba = "EleutherAI/gpt-neox-20b"

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b", use_fast=True, num_proc=cpu_cores)
print(tokenizer.special_tokens_map)

# tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint_mamba)
model_mamba = MambaLMHeadModel.from_pretrained(modelHead_checkpoint_mamba, device=device, dtype=torch.float16).to(device)
# config = AutoConfig.from_pretrained("state-spaces/mamba-2.8b")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}


In [13]:

print(model_mamba.lm_head)
print(model_mamba.modules)



Linear(in_features=768, out_features=50280, bias=False)
<bound method Module.modules of MambaLMHeadModel(
  (backbone): MixerModel(
    (embedding): Embedding(50280, 768)
    (layers): ModuleList(
      (0-23): 24 x Block(
        (mixer): Mamba(
          (in_proj): Linear(in_features=768, out_features=3072, bias=False)
          (conv1d): Conv1d(1536, 1536, kernel_size=(4,), stride=(1,), padding=(3,), groups=1536)
          (act): SiLU()
          (x_proj): Linear(in_features=1536, out_features=80, bias=False)
          (dt_proj): Linear(in_features=48, out_features=1536, bias=True)
          (out_proj): Linear(in_features=1536, out_features=768, bias=False)
        )
        (norm): RMSNorm()
      )
    )
    (norm_f): RMSNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50280, bias=False)
)>


In [22]:
print(tokenizer.pad_token_id)

None


In [29]:
print(raw_datasets["train"][:5])

{'text': ['', ' = Valkyria Chronicles III = \n', '', ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forg

In [6]:
# watch -n 3 free -h
context_length = 256



def tokenize(element):
    # Tokenize the text and truncate to the desired max length
    return tokenizer(element["text"], max_length=context_length, truncation=True, padding=False)

train_tokenized_datasets = raw_datasets["train"].map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

# def tokenize(batch):
#     # Tokenize the batch of text and truncate to the desired max length
#     return tokenizer(batch["text"], max_length=context_length, truncation=True, return_tensors='pt', padding=False)

# train_tokenized_datasets = raw_datasets["train"].map(
#     tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
# )


# def tokenize(batch):
#     # Filter out empty strings to avoid issues during tokenization
#     batch_texts = [text for text in batch["text"] if text.strip()]
    
#     # Check if the batch is empty after filtering
#     if not batch_texts:
#         return {"input_ids": [], "attention_mask": []}

#     # Tokenize the filtered batch of text
#     return tokenizer(batch_texts, max_length=context_length, truncation=True, return_tensors='pt')

# train_tokenized_datasets = raw_datasets["train"].map(
#     tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
# )



# # Concatenate the rows of the dataset
# concatenated_text = ' '.join(raw_datasets["train"]["text"])

# # Determine an appropriate max_length
# context_length = 256  # Example, adjust based on model's context window and your hardware

# # Tokenize the concatenated text with the chosen max_length
# tokens = tokenizer(concatenated_text, return_tensors='pt', max_length=context_length, truncation=True)



Map:   0%|          | 4000/1801350 [00:00<03:56, 7599.49 examples/s]

Map: 100%|██████████| 1801350/1801350 [03:13<00:00, 9308.06 examples/s] 


In [None]:
# Save the train_tokenized_datasets to disk:
train_tokenized_datasets.save_to_disk("train_tokenized_datasets")
# Get size of this file:
print(os.path.getsize("/home/taylorbollman/tbprojects1/mamba-ssm-tb1/mamba_ssm/wikitext-103-v1-tokenized"))
# Load the tokenized dataset from disk:
train_tokenized_datasets = load_from_disk("/home/taylorbollman/tbprojects1/mamba-ssm-tb1/mamba_ssm/wikitext-103-v1-tokenized")



# Concatenate the rows of the dataset
# concatenated_text = ' '.join(raw_datasets["train"]["text"])

# # Tokenize the concatenated text
# tokens = tokenizer(concatenated_text, return_tensors='pt', truncation=True)

# # Prepare dataset for DataLoader
# sequence_length = 512  # This is an example length, adjust based on your model and memory constraints
# inputs = tokens.input_ids
# inputs = inputs[:, :sequence_length]  # Truncate to the desired sequence length

# # Create a PyTorch dataset and dataloader
# dataset = TensorDataset(inputs)
# dataloader = DataLoader(dataset, batch_size=8)  # Adjust batch size as needed

# Now you can iterate over dataloader in your training loop


4096


In [58]:
# raw_datasets["train"]["text"] = raw_datasets["train"]["text"].str.replace("\n", " ")
# raw_datasets["validation"]["text"] = raw_datasets["validation"]["text"].str.replace("\n", " ")
# raw_datasets["test"]["text"] = raw_datasets["test"]["text"].str.replace("\n", " ")
raw_datasets["train"]["text"][:10]

['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving

In [59]:
context_length = 1024
outputs = tokenizer(
    raw_datasets["train"][:10]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)
raw_datasets["train"]["text"]
print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 10
Input chunk lengths: [0, 10, 0, 178, 113, 118, 0, 7, 0, 233]
Chunk mapping: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [28]:
#### T5 MODEL Stuff ####

# model_checkpoint_t5 = "t5-small"

# tokenizer_t5 = AutoTokenizer.from_pretrained(model_checkpoint_t5)
# # config_t5 = AutoConfig.from_pretrained(model_checkpoint_t5, output_hidden_states=True)
# # print(config_t5)
# model_t5 = AutoModel.from_pretrained(model_checkpoint_t5).to(device)
# # print(model_t5.config)

In [29]:
## FOR DISTRIBUTED DATAPARALLEL (DDP) -- FOR USE IN SCRIPT 

# if __name__ == '__main__':
#     # suppose we have 3 gpus
#     world_size = torch.cuda.device_count()  # number of GPUs 
#     mp.spawn(
#         main,
#         args=(world_size),
#         nprocs=world_size
#     )