In [1]:
from transformers import AutoTokenizer
from my_modeling_llama import LlamaForCausalLM
import torch
from my_utils import get_seq_train_batch
import numpy as np
import os
from contextlib import nullcontext
from torch.nn import CrossEntropyLoss

from peft import prepare_model_for_kbit_training

# Limit the GPUs visible to this process to physical devices 1, 2 and 3, so the
# 'cuda:0' used in later cells refers to the first of these.
# NOTE(review): this runs after `import torch` above; it only takes effect if
# CUDA has not been initialised yet — consider setting it before importing torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

In [2]:
device = 'cuda:0'
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

In [3]:
# Load the Llama-2-7B checkpoint through the project's own LlamaForCausalLM with
# 8-bit weights, letting `device_map='auto'` decide the device placement.
# Variants tried earlier (kept here for reference):
#   - same call with device_map={'': device} to pin everything to one device
#   - LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="auto")
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_8bit=True,
    device_map='auto',
    torch_dtype=torch.float16,
    cache_dir="/data/yuanhang/hf_cache",
)

# Run the quantised model through peft's k-bit preparation helper.
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# List every parameter together with its dtype to see which weights were quantised.
for param_name, param in model.named_parameters():
    print(param_name, param.dtype)

model.embed_tokens.weight torch.float32
model.layers.0.self_attn.q_proj.weight torch.int8
model.layers.0.self_attn.k_proj.weight torch.int8
model.layers.0.self_attn.v_proj.weight torch.int8
model.layers.0.self_attn.o_proj.weight torch.int8
model.layers.0.mlp.gate_proj.weight torch.int8
model.layers.0.mlp.down_proj.weight torch.int8
model.layers.0.mlp.up_proj.weight torch.int8
model.layers.0.input_layernorm.weight torch.float32
model.layers.0.post_attention_layernorm.weight torch.float32
model.layers.1.self_attn.q_proj.weight torch.int8
model.layers.1.self_attn.k_proj.weight torch.int8
model.layers.1.self_attn.v_proj.weight torch.int8
model.layers.1.self_attn.o_proj.weight torch.int8
model.layers.1.mlp.gate_proj.weight torch.int8
model.layers.1.mlp.down_proj.weight torch.int8
model.layers.1.mlp.up_proj.weight torch.int8
model.layers.1.input_layernorm.weight torch.float32
model.layers.1.post_attention_layernorm.weight torch.float32
model.layers.2.self_attn.q_proj.weight torch.int8
model.

In [5]:
# The notebook only runs inference: put the module in evaluation mode ...
model.eval()

# ... and turn off gradient tracking on every weight.
for weight in model.parameters():
    weight.requires_grad_(False)

In [15]:
# Show the configuration of the loaded model (sizes, special token ids,
# quantisation settings) for reference.
print(model.config)

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "quantization_config": {
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_8bit": true
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.29.1",
  "use_cache": true,
  "vocab_size": 32000
}



In [None]:
# Tokenizer for the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# `model.model.padding_idx` is a token *id* (an int), not a token string, so it
# has to be assigned through `pad_token_id`. Assigning it to `pad_token` would
# store the int where a token string is expected, and the tokenizer would then
# pad with whatever id that stringified value maps to (or reject it outright,
# depending on the transformers version).
tokenizer.pad_token_id = model.model.padding_idx
# NOTE(review): batched generation with a decoder-only model normally wants
# left padding; enable the line below before passing several prompts at once.
# tokenizer.padding_side = "left"

In [None]:
# Smoke test: tokenize a short string and inspect the returned encoding
# (input ids and attention mask as PyTorch tensors).
tokenizer("yangyy", return_tensors="pt", padding=True)

{'input_ids': tensor([[   1,  343,  574, 8071]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [None]:
def generate_sentence(question, input_parameter=None):
    """Generate a continuation for the given prompt(s) and print the decoded text.

    Relies on the notebook-level ``tokenizer``, ``model``, ``device`` and ``ctx``.

    Args:
        question: Prompt string, or list of prompt strings, passed to ``tokenizer``.
        input_parameter: Currently unused; kept so existing calls keep working.

    Returns:
        None. The prompt tensor shape and the decoded continuations are printed.
    """
    x = tokenizer(question, return_tensors="pt", padding=True)
    x = x.to(device)
    print(x.input_ids.shape)

    # Length of the (padded) prompt, used to strip it from the generated ids.
    prompt_len = x.input_ids.shape[1]

    # run generation
    with torch.no_grad():
        with ctx:
            # Pass the attention mask explicitly: prompts are tokenized with
            # padding=True, so in a batch of unequal-length prompts the padded
            # positions must be masked out rather than attended to.
            # Per the transformers docs, max_length bounds prompt + new tokens.
            y = model.generate(
                x.input_ids,
                attention_mask=x.attention_mask,
                max_length=512,
            )

            # Drop the leading prompt_len positions so only newly generated ids remain.
            y = y[:, prompt_len:]

            result = tokenizer.batch_decode(y, skip_special_tokens=True, clean_up_tokenization_spaces=False)

            print(result)
            print('---------------')
            print('===============================================================')

In [None]:
# Few-shot multiple-choice prompt: four solved examples followed by one open
# question whose answer the model is expected to complete.
question = [
    'Question: Aesthetics deals with objects that are_____. \n A: essential to our existence B: unimportant to most people C: not essential to our existence D: rarely viewed. \n Answer: C \n',
    'Question: For Socrates, an unexamined life is a tragedy because it results in grievous harm to _____. \n A: the state B: the justice system C: the body D: the soul. \n Answer: D \n',
    'Question: For Socrates, the soul is harmed by lack of _____. \n A: knowledge B: wealth C: community D: courage. \n Answer: A \n',
    'Question: According to Kant, nothing can be called “good” without qualification except _____. \n A: right action B: good consequences C: happiness D: a good will. \n Answer: D \n',
    'Question: Baier argues that genuine moral rules: \n A: must be for the good of human beings. B: make take into account the interests of all sentient beings. C: must take into account the interests of all living beings. D: are primarily directed toward promoting self-interest. \n Answer:',
    # "Question: Plato's view is that true beauty is _____. \n A: found in everyday objects B: nonexistent C: everywhere in the natural world D: not of this world. \n ",
]

# Concatenate the examples (in order, no separator) into a single prompt string.
question_str = "".join(question)

# Submit it as a batch containing that one prompt.
generate_sentence([question_str])

torch.Size([1, 264])




['A \nQuestion: According to Kant, the moral law is _____. \n A: a set of rules that are binding on all rational beings. B: a set of rules that are binding on all human beings. C: a set of rules that are binding on all living beings. D: a set of rules that are binding on all sentient beings. \n Answer: A \nQuestion: According to Kant, the moral law is _____. \n A: a set of rules that are binding on all rational beings. B: a set of rules that are binding on all human beings. C: a set of rules that are binding on all living beings. D: a set of rules that are binding on all sentient beings. \n Answer: A \nQuestion: According to Kant, the moral law is _____. \n A: a set of rules that are binding on all rational beings. B: a set of rules that are binding on all human beings. C: a set of rules that are binding on all living beings. D: a set of rules that are binding on all sentient beings. \n Answer']
---------------


In [None]:
# other_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", load_in_8bit=True, device_map='auto', torch_dtype=torch.float16, cache_dir="/data/yuanhang/hf_cache")
# other_model = prepare_model_for_kbit_training(other_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Sanity check: run the frozen model on a randomly positioned batch of the
# pre-tokenized training data and print the loss it reports.
import random
# Read-only memory-mapped view of the token file, interpreted as uint16 values.
train_data = np.memmap("./data/llama_openwebtext/train.bin", dtype=np.uint16, mode='r')

# NOTE(review): the loop body ends with an unconditional `break`, so only one
# of the 20 iterations ever runs — confirm whether that is intended.
for _ in range(20):
    with torch.no_grad():
        # Project helper from my_utils; starts from a random offset into the data.
        # NOTE(review): the positional arguments are undocumented here. 16 and 256
        # presumably are batch size and sequence length (they match the
        # [16, 256, 4096] shape printed below); the meaning of 128 and of the
        # final False is unknown — TODO confirm against my_utils.
        data_pointer, x, y, attention_mask, seg_length_list = get_seq_train_batch(train_data, [random.randint(0, 100000000)], 16, 256, 128, device, device_type, False)

        # Drop the leading dimension if it has size 1.
        x = x.squeeze(0)
        y = y.squeeze(0)
        attention_mask = attention_mask.squeeze(0)

        # print(x[0])
        # print(y[0])

        # Forward pass with labels so the model computes its own loss.
        output = model(x, attention_mask=attention_mask, labels=y)
        print(output.loss)

        # output = other_model(x, attention_mask=attention_mask, labels=y)
        # print(output.loss)
        
        # NOTE(review): a NaN loss only produces an empty line here.
        if torch.isnan(output.loss):
            print()
        print("---"*20)

        # `output_embeds` is an option of the project's custom model class; the
        # result is used as a tensor (its shape is printed alongside the inputs).
        out = model(x, output_embeds=True)
        print(out.shape, x)
        break




tensor(2.2721, device='cuda:0')
------------------------------------------------------------
torch.Size([16, 256, 4096]) tensor([[ 2890,   399, 24071,  ...,     0,     0,     0],
        [ 4123, 16823,   515,  ...,     0,     0,     0],
        [ 9358,   293, 10672,  ...,     0,     0,     0],
        ...,
        [  297,  4958,   310,  ...,     0,     0,     0],
        [ 1693,  4250,  3304,  ...,     0,     0,     0],
        [  338,   278,  1302,  ...,     0,     0,     0]], device='cuda:0')


In [None]:

# Recompute the loss by hand from the logits of the last forward pass so it can
# be compared with the value the model reported.
# NOTE(review): despite the `shift_` prefix no shift is applied here; this
# assumes `y` is already aligned position-by-position with the logits — confirm
# against get_seq_train_batch / the custom model's loss.
shift_logits = output.logits.view(-1, model.config.vocab_size)
# With device_map='auto' the logits can live on a different GPU than the labels;
# move the labels to the logits' device so the loss call cannot fail on a
# device mismatch (a no-op when they already share a device).
shift_labels = y.view(-1).to(shift_logits.device)

# Targets equal to -1 are excluded from the mean.
loss_fct = CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss = loss_fct(shift_logits, shift_labels)
print(loss)


tensor(1.4192, device='cuda:1')
tensor(1.4192, device='cuda:1')
