<a href="https://colab.research.google.com/github/weedge/doraemon-nb/blob/main/Fine_tune_Llama3_8B_with_bnb4bit%2BLoRA%2BORPO_cn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qqq -U transformers datasets accelerate peft trl bitsandbytes wandb --progress-bar off

In [1]:
#@title Show current memory stats
import torch
gpu_stats = torch.cuda.get_device_properties(0)
print(gpu_stats)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

_CudaDeviceProperties(name='NVIDIA L4', major=8, minor=9, total_memory=22699MB, multi_processor_count=58)
GPU = NVIDIA L4. Max memory = 22.168 GB.
0.0 GB of memory reserved.


In [2]:
import gc
import os

import torch
import wandb
from datasets import load_dataset
from google.colab import userdata
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

#dataset
dataset_name = "wenbopan/Chinese-dpo-pairs"

# Model
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "OrpoLlama-3-8B-chat-cn"

# Defined in the secrets tab in Google Colab
wb_token = userdata.get('WANDB_API_KEY')
wandb.login(key=wb_token)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")


# Set torch dtype and attention implementation
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

[34m[1mwandb[0m: Currently logged in as: [33mweege007[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Using cuda device


In [3]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
print(bnb_config)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
print(peft_config)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
print(tokenizer)


# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)
print(model)
print(model.config)

model, tokenizer = setup_chat_format(model, tokenizer)
print("--"*10)
print(tokenizer)
print(model)
print(model.config)

model = prepare_model_for_kbit_training(model)
print("--"*10)
print(model)
print(model.config)


BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=16, target_modules={'up_proj', 'gate_proj', 'q_proj', 'down_proj', 'o_proj', 'k_proj', 'v_proj'}, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=Fals

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PreTrainedTokenizerFast(name_or_path='meta-llama/Meta-Llama-3-8B', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|reserved_special_token_2|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|r

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): 

In [4]:
print(tokenizer.special_tokens_map)


{'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}


# datasets

In [5]:
dataset = load_dataset(dataset_name, split="all")
print(dataset)

dataset = dataset.select(range(1000)) # Only use 1000 samples for quick demo
print(dataset)
print(dataset[0])


def chatml_format(example):
    message = {"role": "user", "content": example['system']+example['prompt']}
    # Format instruction
    prompt = tokenizer.apply_chat_template([message], tokenize=False, add_generation_prompt=True)
    # Format chosen answer
    chosen = example['chosen']+tokenizer.eos_token
    # Format rejected answer
    rejected = example['rejected']+tokenizer.eos_token

    return {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
    }

ds = dataset.map(
    chatml_format,
    num_proc= os.cpu_count(),
    #batched=True,
    remove_columns=dataset.column_names,
    desc="Running tokenizer on dataset",
)
print(ds)


Dataset({
    features: ['prompt', 'system', 'chosen', 'rejected', 'source', 'id'],
    num_rows: 10735
})
Dataset({
    features: ['prompt', 'system', 'chosen', 'rejected', 'source', 'id'],
    num_rows: 1000
})
{'prompt': '任务定义：您将获得一个亚马逊食品产品的评论以及其极性（积极或消极）。您的任务是回答“True”，如果指定的句子及其极性匹配；否则，回答“False”。\n问题：我买了这些很长时间，因为我认为它们是美国原产的，所以对这个产品感觉更好。我购买了几袋（条状、包装的苹果等），现在意识到原产地是中国。考虑到所有关于包装的FDA法规，是时候要求原产地在包装的正面显著显示（比如20号字体 - 而不是微不足道的写法，没有人会注意到！）！因此，我将不再购买Dogswell产品。另外，生皮更糟糕，因为狗可以摄入更高剂量的任何用于处理皮革的毒素（而且你可以肯定有很多！）。我甚至拿起一包名为（类似）“U.S.A.生皮”的产品，背面微小的字体写着“中国制造”。为什么我们的立法者不采取一些有益的行动，打击这种行为？这最多是狡猾，最坏的情况下是故意欺骗！亚马逊，请为我们的“毛孩子”找到不会让它们生病或致命的产品！\n极性：积极\n\n解决方案：False\n\n问题：您想从花生酱中去除糖而不是脂肪。老实说，与之相比，这个味道很糟糕。花生中大约50％的脂肪是单不饱和脂肪。不饱和脂肪可以帮助降低血液中的LDL胆固醇（“坏”胆固醇）水平，而不影响HDL胆固醇（“好”胆固醇）。研究表明，食用更高比例的单不饱和脂肪的人降低了患心脏病、哮喘、阿尔茨海默病、乳腺癌甚至抑郁症的风险。只需购买天然种类，避免添加剂。\n极性：积极\n\n解决方案：False\n\n问题：我在意识到我得到了多少之前就买了这个，也在知道杂货店有这个产品之前！那是我的错。这些种子很棒，只是我冰箱里有太多了！\n极性：消极\n\n解决方案：', 'system': '', 'chosen': '错误\n\n问题：我订购了这些牛肉棒作为肉干的健康替代品，但很失望地发现它们含有添加糖。我尽量避免摄入添

  self.pid = os.fork()


Running tokenizer on dataset (num_proc=16):   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})


In [12]:
dataset = ds.train_test_split(test_size=0.01)
print(dataset)
print(dataset['train'][0])
print(dataset['test'][0])

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 990
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 10
    })
})
{'prompt': '<|im_start|>user\n如何在iPhone上使用扬声器而不是耳机播放音频。<|im_end|>\n<|im_start|>assistant\n', 'chosen': '在iPhone上进行WebRTC通话时，要在扬声器模式下播放音频，您可以按照以下步骤操作：\n\n1. 在WebRTC通话期间，点击屏幕左下角的“...”按钮。\n2. 点击屏幕左下角的“设置”或齿轮图标。\n3. 点击“音频”或看起来像扬声器的音频图标。\n4. 点击“扬声器”以启用扬声器模式。\n\n要通过耳机播放音频，请将耳机连接到iPhone并按照上述相同步骤操作。当您点击“音频”时，根据您使用的耳机类型选择“有线耳机”或“无线耳机”。\n\n请记住，这些选项的可用性取决于您使用的具体WebRTC应用程序。一些应用程序可能不支持所有音频输出选项，因此建议查看应用程序的设置或文档以获取更多信息。<|im_end|>', 'rejected': '要在iPhone的扬声器或耳机上播放音频，您可以按照以下步骤操作：\n\n1. 确保您的iPhone没有静音。从屏幕底部向上滑动以打开控制中心。确保静音按钮（带有小扬声器图标的标签）未被突出显示。\n2. 要在扬声器和耳机之间切换，请在控制中心上按“扬声器”按钮，直到扬声器指示灯亮起，或者直到耳机图标出现，表示音频将通过您的耳机或耳机播放。\n3. 如果您连接了AirPods或其他蓝牙耳机，您还可以使用AirPlay控件将音频路由到您的耳机。从屏幕底部向上滑动以打开控制中心，点击显示在扬声器图标旁边的“AirPlay”名称，并选择“AirPods”或您的蓝牙耳机的名称。<|im_end|>'}
{'prompt': '<|im_start|>user\n作为北约成员，为什么俄罗斯要向乌

In [13]:
messages = [
    {"role": "user", "content": "如何在iPhone上使用扬声器而不是耳机播放音频。"},
 ]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
tokenized_chat =tokenized_chat.to(device)

outputs = model.generate(tokenized_chat, max_new_tokens=128, temperature = 0.1, )
print(tokenizer.decode(outputs[0]))


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


<|im_start|>user
如何在iPhone上使用扬声器而不是耳机播放音频。<|im_end|>
<|im_start|>assistant
se
se
se
se
se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se

se




# train

In [14]:
orpo_args = ORPOConfig(
    learning_rate=8e-6,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=512,
    beta=0.1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="wandb",
    output_dir="./results/",
)
print(orpo_args)

trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model(new_model)

ORPOConfig(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
beta=0.1,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
dataset_num_proc=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_dropout=True,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=0.2,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_para



Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
25,4.0286,3.326884,25.2753,0.396,0.198,-0.407815,-0.425965,0.6,0.01815,-4.259655,-4.078152,-1.245992,-1.236749,3.254554,-0.723303,0.117421
50,3.2358,2.799535,25.2741,0.396,0.198,-0.331678,-0.350586,0.6,0.018907,-3.505855,-3.316782,-1.143289,-1.132343,2.728468,-0.710665,0.130391
75,2.0937,1.666203,25.2264,0.396,0.198,-0.16662,-0.208618,0.7,0.041998,-2.086183,-1.666199,-1.144567,-1.100869,1.599432,-0.667708,0.43363
100,1.7785,1.566601,25.283,0.396,0.198,-0.153742,-0.194709,0.7,0.040967,-1.947091,-1.537416,-1.135252,-1.093328,1.500856,-0.657451,0.436391


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
25,4.0286,3.326884,25.2753,0.396,0.198,-0.407815,-0.425965,0.6,0.01815,-4.259655,-4.078152,-1.245992,-1.236749,3.254554,-0.723303,0.117421
50,3.2358,2.799535,25.2741,0.396,0.198,-0.331678,-0.350586,0.6,0.018907,-3.505855,-3.316782,-1.143289,-1.132343,2.728468,-0.710665,0.130391
75,2.0937,1.666203,25.2264,0.396,0.198,-0.16662,-0.208618,0.7,0.041998,-2.086183,-1.666199,-1.144567,-1.100869,1.599432,-0.667708,0.43363
100,1.7785,1.566601,25.283,0.396,0.198,-0.153742,-0.194709,0.7,0.040967,-1.947091,-1.537416,-1.135252,-1.093328,1.500856,-0.657451,0.436391




# merge

In [17]:
#@title Flush memory
del trainer, model
gc.collect()
gc.collect()
torch.cuda.empty_cache()

In [1]:
import torch
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L4. Max memory = 22.168 GB.
0.0 GB of memory reserved.


In [3]:
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)
fp16_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
fp16_model, tokenizer = setup_chat_format(fp16_model, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(fp16_model, new_model)
model = model.merge_and_unload()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
print(model.config)
print(tokenizer.special_tokens_map)

LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128256,
  "eos_token_id": 128257,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128257,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 128258
}

{'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}


In [63]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/weege007/OrpoLlama-3-8B-chat-cn/commit/f3071aec4bb5d5b66928e0304312d48881c4d13b', commit_message='Upload tokenizer', commit_description='', oid='f3071aec4bb5d5b66928e0304312d48881c4d13b', pr_url=None, pr_revision=None, pr_num=None)

# inference

In [3]:
tokenizer = AutoTokenizer.from_pretrained("weege007/OrpoLlama-3-8B-chat-cn")
print(tokenizer)
print(tokenizer.special_tokens_map)


# bnb-4bit model
model = AutoModelForCausalLM.from_pretrained(
    "weege007/OrpoLlama-3-8B-chat-cn",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config= BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
    ),
)
print(model)
print(model.config)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


PreTrainedTokenizerFast(name_or_path='weege007/OrpoLlama-3-8B-chat-cn', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|reserved_special_token_2|>", rstrip=False, lstrip=

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128258, 4096, padding_idx=128257)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [5]:
messages = [
    {"role": "user", "content": "如何在iPhone上使用扬声器而不是耳机播放音频。"},
 ]
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
tokenized_chat =tokenized_chat.to(device)

outputs = model.generate(tokenized_chat, max_new_tokens=128, temperature = 0.1, )
print(tokenizer.decode(outputs[0]))


<|im_start|>user
如何在iPhone上使用扬声器而不是耳机播放音频。<|im_end|>
<|im_start|>assistant
如何在iPhone上使用扬声器而不是耳机播放音频。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。。
