In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig
from transformers import BitsAndBytesConfig
from peft import LoraConfig
from datasets import load_dataset

MODEL_PATH="/shared/vsathia2/hf_models/relu-llama/"

In [4]:
# Load the model and tokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side='left')
tokenizer.eos_token = tokenizer.pad_token
# max_seq_length = 150
m = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="cuda:0",attn_implementation="eager")

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [56]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [88]:
m

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): ReLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
 

In [89]:
from datasets import load_dataset

In [45]:
data = load_dataset("wikitext","wikitext-2-raw-v1")
print(data)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [87]:
# View some sentences in the test split
print(data['test']['text'][:5])
for i in range(5):
    print(f"Length of sentence {i} = {len(data['test']['text'][i])}; Sentence - {data['test']['text'][i]}")

['', ' = Robert Boulter = \n', '', ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n', ' In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He

In [67]:
d_train = data["train"]
d_test = data["test"]

d_test_txt_list = [" \n" if s == "" else s for s in d_test["text"]]
print(len(d_test_txt_list))
d_test_str = "".join(d_test_txt_list)
d_test_str2 = "\n\n".join(d_test["text"])

4358


In [75]:
d_test_str[:101]

' \n = Robert Boulter = \n \n Robert Boulter is an English film , television and theatre actor . He had a'

In [76]:
d_test_str2[:101]

'\n\n = Robert Boulter = \n\n\n\n\n Robert Boulter is an English film , television and theatre actor . He had'

In [70]:
# Compute encodings on the test dataset
embeddings = tokenizer(d_test_str)
embeddings2 = tokenizer(d_test_str2)

In [72]:
# Check the embedding values for the given tokenizer
print(embeddings.input_ids[:10])
print(embeddings2.input_ids[:10])

[1, 259, 13, 353, 4755, 350, 5059, 357, 353, 29871]
[1, 29871, 13, 13, 353, 4755, 350, 5059, 357, 353]


In [86]:
# Decode the given embeddings
print(tokenizer.decode(embeddings.input_ids)[:1000])

<s>  
 = Robert Boulter = 
 
 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 
 In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared on 

In [85]:
print(tokenizer.decode(embeddings2.input_ids)[:1000])

<s> 

 = Robert Boulter = 




 Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . 


 In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared

In [47]:
# d_test["text"][:10]

In [48]:
# d_test_str[:1000]

In [20]:
tokens1 = tokenizer(d_test["text"])
tokens2 = tokenizer(d_test_str)

In [43]:
tokens1.keys()

dict_keys(['input_ids', 'attention_mask'])

In [41]:
tokens2['input_ids'][:5]

[1, 259, 13, 353, 4755]

In [28]:
# Tokenize the dataset
tokenized_dataset = data.map(lambda example: tokenizer(example["text"]), batched=True)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [90]:
# tokenized_dataset["test"]["input_ids"]

In [91]:
data

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [95]:
import os
import re
import torch
# from datasets import Dataset
# from datasets import load_dataset, load_from_disk


def wikitext_detokenize(string):
    # contractions
    string = string.replace("s '", "s'")
    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
    # number separators
    string = string.replace(" @-@ ", "-")
    string = string.replace(" @,@ ", ",")
    string = string.replace(" @.@ ", ".")
    # punctuation
    string = string.replace(" : ", ": ")
    string = string.replace(" ; ", "; ")
    string = string.replace(" . ", ". ")
    string = string.replace(" ! ", "! ")
    string = string.replace(" ? ", "? ")
    string = string.replace(" , ", ", ")
    # double brackets
    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
    # miscellaneous
    string = string.replace("= = = =", "====")
    string = string.replace("= = =", "===")
    string = string.replace("= =", "==")
    string = string.replace(" " + chr(176) + " ", chr(176))
    string = string.replace(" \n", "\n")
    string = string.replace("\n ", "\n")
    string = string.replace(" N ", " 1 ")
    string = string.replace(" 's", "'s")

    return string
    
    
def get_wikitext_test_data_loader(args, tokenizer, num_workers=0):
    
    data = load_from_disk("./data/wikitext/test")
    encodings = tokenizer("\n\n".join(
        [wikitext_detokenize(t) for t in data["text"]]
    ), return_tensors="pt")
    
    input_ids_list = []
    stride = args.seq_length
    # TODO: last stride is dropped
    for i in range(0, encodings.input_ids.size(1)-stride, stride):
        begin_loc = i
        end_loc = min(i+stride, encodings.input_ids.size(1))
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        input_ids_list.append(input_ids)
    input_ids = torch.cat(input_ids_list, 0)
    
    train_set = Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': torch.ones_like(input_ids),
        'idx': list(range(len(input_ids))),
    })
    
    train_set = train_set.map(lambda examples: {'text': examples['input_ids']}, batched=True)
    train_set.set_format(
        type='torch', columns=[
            'text', 'input_ids', 'attention_mask', 'idx',
        ])
    
    # TODO: let drop_last be False
    train_data_loader = torch.utils.data.DataLoader(train_set,
                                                    batch_size=args.batch_size,
                                                    shuffle=False,
                                                    num_workers=num_workers,
                                                    drop_last=True,
                                                    pin_memory=True,
                                                    collate_fn=None)
        
    return train_data_loader

In [133]:
dtest = data['test']

In [171]:
encodings2 =  tokenizer("\n\n".join(
        [wikitext_detokenize(t) for t in data["train"]["text"]]
    ), return_tensors="pt")

In [176]:
encodings2.input_ids.shape[1] / 393216.0

7.0602773030598955

In [141]:
tokenizer.model_max_length = tk2.model_max_length

In [142]:
encodings2 =  tokenizer("\n\n".join(
        [wikitext_detokenize(t) for t in dtest["text"]]
    ), return_tensors="pt")

In [143]:
encodings2.input_ids.shape

torch.Size([1, 328887])

In [None]:
tokens = encodings

In [98]:
encodings = encodings = tokenizer("\n\n".join(
        [wikitext_detokenize(t) for t in dtest["text"]]
    ), return_tensors="pt")

In [145]:
tokens = encodings.input_ids.to(0)

In [160]:
start=0
end=512
n_ctx=512
n_batch=512
num_batches = 1
batch_start = 0
batch_sz = min(end-batch_start,n_batch)
batch_sz

512

In [161]:
token_org = tokens[0][batch_start].item()
token_org

1

In [162]:
tokenizer.bos_token_id

1

In [164]:
outputs = m(tokens[:,batch_start:(batch_start+batch_sz)])

In [169]:
outputs.logits.shape

torch.Size([1, 512, 32000])

In [153]:
m

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): ReLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
 

In [181]:
m.model.layers[0].mlp

LlamaMLP(
  (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
  (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
  (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
  (act_fn): ReLU()
)

In [138]:
encodings.input_ids.shape

torch.Size([1, 328887])

In [111]:
input_ids_list = []
stride = 2048
    # TODO: last stride is dropped
for i in range(0, encodings.input_ids.size(1)-stride, stride):
    begin_loc = i
    end_loc = min(i+stride, encodings.input_ids.size(1))
    input_ids = encodings.input_ids[:, begin_loc:end_loc]
    input_ids_list.append(input_ids)
input_ids = torch.cat(input_ids_list, 0)

In [112]:
train_set = Dataset.from_dict({
        'input_ids': input_ids,
        'attention_mask': torch.ones_like(input_ids),
        'idx': list(range(len(input_ids))),
    })
    
train_set = train_set.map(lambda examples: {'text': examples['input_ids']}, batched=True)
train_set.set_format(
        type='torch', columns=[
            'text', 'input_ids', 'attention_mask', 'idx',
        ])

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

In [132]:
train_set['input_ids'].shape

torch.Size([160, 2048])

In [120]:
encodings['input_ids'].shape

torch.Size([1, 328887])

In [121]:
train_data_loader = torch.utils.data.DataLoader(train_set,
                                                    batch_size=args.batch_size,
                                                    shuffle=False,
                                                    num_workers=num_workers,
                                                    drop_last=True,
                                                    pin_memory=True,
                                                    collate_fn=None)

NameError: name 'args' is not defined

In [114]:
train_set.features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'idx': Value(dtype='int64', id=None),
 'text': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [107]:
tokenizer

LlamaTokenizerFast(name_or_path='/shared/vsathia2/hf_models/relu-llama/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [106]:
m.config

LlamaConfig {
  "_name_or_path": "/shared/vsathia2/hf_models/relu-llama/",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "relu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 32000
}

In [108]:
tk2 = AutoTokenizer.from_pretrained("/shared/vsathia2/hf_models/vanilla_llama/")

In [109]:
tk2

LlamaTokenizerFast(name_or_path='/shared/vsathia2/hf_models/vanilla_llama/', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [110]:
import sys
sys.maxsize

9223372036854775807

In [126]:
tokens = encodings.input_ids

In [131]:
len(tokens[0])

328887

In [129]:
len(tokens[0]) // 512

642

In [130]:
512*642

328704

In [128]:
tokens.shape // 512

TypeError: unsupported operand type(s) for //: 'torch.Size' and 'int'