In [1]:
from transformers import AutoTokenizer
from my_modeling_llama import LlamaForCausalLM
import torch
from my_utils import get_seq_train_batch
import numpy as np
import os
from contextlib import nullcontext
from torch.nn import CrossEntropyLoss

from peft import prepare_model_for_kbit_training

In [2]:
device = 'cuda:1'
device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
dtype = 'bfloat16' if torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

In [5]:
# model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", load_in_8bit=True, device_map={'': device}, torch_dtype=torch.float16)
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", load_in_8bit=True, device_map='auto', torch_dtype=torch.float16)
# model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="auto")
# model = prepare_model_for_kbit_training(model)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

ConnectionError: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))

In [7]:
model.eval()

for p in model.parameters():
    p.requires_grad_(False)

In [8]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

tokenizer.pad_token = model.model.padding_idx
# tokenizer.padding_side = "left"

In [9]:
tokenizer("yangyy", return_tensors="pt", padding=True)

{'input_ids': tensor([[   1,  343,  574, 8071]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [10]:
def generate_sentence(question, input_parameter=None):
    x = tokenizer(question, return_tensors="pt", padding=True)
    x.to(device)
    print(x.input_ids.shape)
    # print(x)

    # run generation
    with torch.no_grad():
        with ctx:
            for k in range(1):
                # y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k, input_parameter=target_model_parameter)
                y = model.generate(x.input_ids, max_length=512)

                y = y[:, x.input_ids.shape[1]:]

                result = tokenizer.batch_decode(y, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                
                print(result)
                print('---------------')
            print('===============================================================')

In [36]:
question = [
    'Question: Aesthetics deals with objects that are_____. \n A: essential to our existence B: unimportant to most people C: not essential to our existence D: rarely viewed. \n Answer: C \n',
    'Question: For Socrates, an unexamined life is a tragedy because it results in grievous harm to _____. \n A: the state B: the justice system C: the body D: the soul. \n Answer: D \n',
    'Question: For Socrates, the soul is harmed by lack of _____. \n A: knowledge B: wealth C: community D: courage. \n Answer: A \n',
    'Question: According to Kant, nothing can be called “good” without qualification except _____. \n A: right action B: good consequences C: happiness D: a good will. \n Answer: D \n',
    'Question: Baier argues that genuine moral rules: \n A: must be for the good of human beings. B: make take into account the interests of all sentient beings. C: must take into account the interests of all living beings. D: are primarily directed toward promoting self-interest. \n Answer:',
    # "Question: Plato's view is that true beauty is _____. \n A: found in everyday objects B: nonexistent C: everywhere in the natural world D: not of this world. \n ",
    ]

question_str = ""
for q in question:
    question_str += q

generate_sentence([question_str])

torch.Size([1, 264])
['A \nQuestion: According to Kant, the moral law is _____. \n A: a set of rules that are binding on all rational beings B: a set of rules that are binding on all human beings C: a set of rules that are binding on all sentient beings D: a set of rules that are binding on all living beings. \n Answer: B \nQuestion: According to Kant, the moral law is _____. \n A: a set of rules that are binding on all rational beings B: a set of rules that are binding on all human beings C: a set of rules that are binding on all sentient beings D: a set of rules that are binding on all living beings. \n Answer: C \nQuestion: According to Kant, the moral law is _____. \n A: a set of rules that are binding on all rational beings B: a set of rules that are binding on all human beings C: a set of rules that are binding on all sentient beings D: a set of rules that are binding on all living beings. \n Answer: C \nQuestion: According to Kant']
---------------


In [33]:
# quan_model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", load_in_8bit=True, device_map='auto', torch_dtype=torch.float16)
# quan_model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSN

In [12]:
train_data = np.memmap("./data/llama_openwebtext/train.bin", dtype=np.uint16, mode='r')

In [35]:
import random
for _ in range(20):
    data_pointer, x, y, attention_mask, seg_length_list = get_seq_train_batch(train_data, [random.randint(0, 100000000)], 16, 256, 128, device, device_type, False)

    x = x.squeeze(0)
    y = y.squeeze(0)
    attention_mask = attention_mask.squeeze(0)

    # print(x[0])
    # print(y[0])

    output = model(x, attention_mask=attention_mask, labels=y)
    print(output.loss)

    # output = quan_model(x, attention_mask=attention_mask, labels=y)
    # print(output.loss)
    
    if torch.isnan(output.loss):
        print()
    print("---"*20)


tensor(2.2320, device='cuda:1')
tensor(2.2461, device='cuda:1', dtype=torch.float16, grad_fn=<ToCopyBackward0>)
------------------------------------------------------------
tensor(2.3094, device='cuda:1')
tensor(nan, device='cuda:1', dtype=torch.float16, grad_fn=<ToCopyBackward0>)


In [26]:

shift_logits = output.logits.view(-1, model.config.vocab_size)
shift_labels = y.view(-1)

loss_fct = CrossEntropyLoss(ignore_index=-1, reduction='mean')
loss = loss_fct(shift_logits, shift_labels)
print(loss)


tensor(1.4192, device='cuda:1')
tensor(1.4192, device='cuda:1')
