##### *数据预处理*

##### *将文本拆分为 tokens，采用Llama*

In [1]:
from transformers import AutoTokenizer
import torch
import jieba

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# linly-ai/chinese-llama-2-7b
# 'NousResearch/Llama-2-7b-hf'
# 'hfl/llama-3-chinese-8b-instruct-v3'
tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf')

In [7]:
sentence = '在以后的一段时光中，读者朋友们将走过我在过去的一年中走过的精神历程，坦率地说，我不知道你们将在这条黑暗诡异的迷途上看到什么，我很不安。'

encoded_input = tokenizer(sentence)
print(encoded_input)

'''
input_ids 是句子中每个 token 对应的索引。
attention_mask 指示是否应关注该 token。
token_type_ids 在有多个序列时标识一个 token 属于哪个序列。
'''

{'input_ids': [1, 29871, 30505, 30651, 30822, 30210, 30287, 31559, 30594, 30867, 30275, 30214, 235, 178, 190, 30767, 233, 159, 142, 31373, 31381, 30998, 235, 184, 179, 31138, 30672, 30505, 31138, 31475, 30210, 30287, 30470, 30275, 235, 184, 179, 31138, 30210, 234, 181, 193, 30648, 232, 145, 137, 31101, 30214, 232, 160, 169, 234, 145, 138, 30533, 31639, 30214, 30672, 30413, 31043, 30397, 30919, 31381, 30998, 30505, 30810, 31217, 236, 190, 148, 233, 157, 154, 235, 178, 164, 232, 191, 133, 30210, 235, 194, 186, 236, 131, 151, 30429, 31811, 30780, 231, 190, 131, 31882, 30214, 30672, 232, 193, 139, 30413, 30670, 30267], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


'\ninput_ids 是句子中每个 token 对应的索引。\nattention_mask 指示是否应关注该 token。\ntoken_type_ids 在有多个序列时标识一个 token 属于哪个序列。\n'

In [8]:
# 显示解码信息
tokenizer.decode(encoded_input["input_ids"])

'<s> 在以后的一段时光中，读者朋友们将走过我在过去的一年中走过的精神历程，坦率地说，我不知道你们将在这条黑暗诡异的迷途上看到什么，我很不安。'

##### *配置模型参数*

In [5]:
# 模型配置
from transformers import AutoConfig

hidden_size = 256
# 中间层取 8/3 倍，按 128 向上取整
intermediate_size = (int(hidden_size * 8/3 / 128) + 1) * 128

config = AutoConfig.for_model(
    # 模型的类型，这里是"llama"
    model_type="llama",
    # 隐藏层的大小
    hidden_size=hidden_size,
    # 中间层的大小
    intermediate_size=intermediate_size,
    # 在自注意力机制中，每个注意力头的数量
    num_attention_heads=16,
    # 模型中隐藏层的数量（即Transformer编码器的堆叠次数）
    num_hidden_layers=4,
    num_key_value_heads=8                  # 分为 8 组
)

In [6]:
config

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 768,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 4,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 32000
}

##### *模型实例化*

In [7]:
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_config(
    config,
    torch_dtype = torch.float32   # 全精度训练
)

In [8]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 256)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=False)
          (k_proj): Linear(in_features=256, out_features=128, bias=False)
          (v_proj): Linear(in_features=256, out_features=128, bias=False)
          (o_proj): Linear(in_features=256, out_features=256, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=256, out_features=768, bias=False)
          (up_proj): Linear(in_features=256, out_features=768, bias=False)
          (down_proj): Linear(in_features=768, out_features=256, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=256,

##### *打印参数*

In [9]:
print("Layer Name & Parameters")
print("----------------------------")
total_params = 0
for name, parameter in model.named_parameters():
    param_size = parameter.size()
    param_count = torch.prod(torch.tensor(param_size)).item()
    total_params += param_count
    print(f"{name:50} | Size: {str(param_size):30} | Count: {str(param_count):20}")
print("----------------------------")
print(f"Total Parameters: {total_params} ({total_params / 1000000:.1f} M)")

Layer Name & Parameters
----------------------------
model.embed_tokens.weight                          | Size: torch.Size([32000, 256])       | Count: 8192000             
model.layers.0.self_attn.q_proj.weight             | Size: torch.Size([256, 256])         | Count: 65536               
model.layers.0.self_attn.k_proj.weight             | Size: torch.Size([128, 256])         | Count: 32768               
model.layers.0.self_attn.v_proj.weight             | Size: torch.Size([128, 256])         | Count: 32768               
model.layers.0.self_attn.o_proj.weight             | Size: torch.Size([256, 256])         | Count: 65536               
model.layers.0.mlp.gate_proj.weight                | Size: torch.Size([768, 256])         | Count: 196608              
model.layers.0.mlp.up_proj.weight                  | Size: torch.Size([768, 256])         | Count: 196608              
model.layers.0.mlp.down_proj.weight                | Size: torch.Size([256, 768])         | Count: 196608  

##### *未训练模型推理*

In [30]:
def inference(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    input_text: str = "三体",
    max_new_tokens: int = 16
):
    inputs = tokenizer(input_text, return_tensors="pt")
    # print(inputs)
    outputs = model.generate(
        **inputs,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=40,
        top_p=0.95,
        temperature=0.8
    )
    # print(outputs)
    generated_text = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True
    )
    # print(outputs)
    print(generated_text)

inference(model , tokenizer)

三体吻堭泪，他们恩我


##### *模型参数初始化*

In [27]:
for name, param in model.named_parameters():
    if 'weight' in name and param.dim() > 1:
        torch.nn.init.kaiming_uniform_(param, mode='fan_in', nonlinearity='leaky_relu')
    elif 'bias' in name:
        # 一般偏置项可以初始化为0
        torch.nn.init.constant_(param, 0)

##### *读入数据*

In [28]:
from datasets import load_dataset
path = 'D:\\pro_of_program\\practical-training-projects\\checkdata'
ds_train = load_dataset(path , split='train[:80%]')    # 取80进行训练
ds_val = load_dataset(path , split='train[80%:]')

print(ds_train)
print(ds_val)

Dataset({
    features: ['text'],
    num_rows: 210970
})
Dataset({
    features: ['text'],
    num_rows: 52743
})


In [29]:
print(ds_train[:4])

{'text': ['三体》终于能与科幻朋友们见面了，用连载的方式事先谁都没有想到，也是无奈之举。之前就题材问题与编辑们仔细商讨过，感觉没有什么问题，但没想到今年是文革三十周年这事儿，单行本一时出不了，也只能这样了。', '其实这本书不是文革题材的，文革内容在其中只占不到十分之一，但却是一个漂荡在故事中挥之不去的精神幽灵。', '本书虽不是《球状闪电》的续集，但可以看做那个故事所发生的世界在其后的延续，那个物理学家在故事中出现但已不重要，其他的人则永远消失了，林云真的死了，虽然我有时在想，如果她活下来，最后是不是这个主人公的样子？', '这是一个暂名为《地球往事》的系列的第一部，可以看做一个更长的故事的开始。']}


##### *token化*

In [30]:
def process(sentences):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained('hfl/llama-3-chinese-8b-instruct-v3')

    max_token = 4096 # 最长token数
    '''
    sentences['text'] = List[str]
    add_special_tokens = False 表示让tokenizer不要加上特殊 token
    llama不需要举手标记
    '''
    text = tokenizer(sentences['text'] , add_special_tokens = False)
    input_text = text['input_ids']
    check_input , check_mask = [] , []
    for iids in input_text:
        # 直接截断 + 尾部结束标志
        temp = iids[-max_token + 1 : ] + [tokenizer.eos_token_id]
        check_input.append(temp)
        check_mask.append([1] * len(temp)) # attention_mask保持于数据的一致

    return {
        "input_ids" : check_input,
        "attention_mask" : check_mask
    }

In [31]:
ds_train = ds_train.shuffle()
ds_train = ds_train.map(
    process,
    batched=True,
    num_proc=8,
    remove_columns=ds_train.column_names,
    desc='Running tokenizer on train_set: '
)

ds_val = ds_val.map(
    process,
    batched=True,
    num_proc=8,
    remove_columns=ds_val.column_names,
    desc='Running tokenizer on val_set: '
)

print(ds_train)
print(ds_val)

Running tokenizer on train_set:  (num_proc=8):   0%|          | 0/210970 [00:00<?, ? examples/s]

Running tokenizer on val_set:  (num_proc=8):   0%|          | 0/52743 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 210970
})
Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 52743
})


In [34]:
print(ds_train[0])

{'input_ids': [262, 44689, 124669, 30926, 14382, 103478, 25333, 106625, 61786, 13, 10447, 241, 99, 61786, 70349, 113280, 104083, 103698, 125653, 104105, 100823, 116693, 103001, 109, 21043, 17885, 103, 67178, 9554, 103054, 11571, 89151, 116693, 103001, 109, 93233, 106837, 3922, 86348, 116693, 52332, 72368, 115056, 107711, 58850, 35147, 128009], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


##### *参数配置*

In [40]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir='D:\\pro_of_program\\practical-training-projects\\tf', # 输出路径
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    eval_steps=1000,
    per_device_train_batch_size=4, # 训练集batch_size
    gradient_accumulation_steps=1, # 梯度累计步大小，省显存，但小模型没必要，用 1 收敛比较快
    learning_rate=1e-4,
    lr_scheduler_type='cosine', # 学习率调度策略，LLM 训练一般都用余弦
    logging_steps=50, # 打印间隔
    report_to="tensorboard",
    bf16=torch.cuda.is_bf16_supported(),        # 尝试配置 bf16
    fp16=not torch.cuda.is_bf16_supported(),    # bf16 不行就上 fp16
    num_train_epochs=1,
    save_steps=1000, # 检查点保存步骤间隔
    save_total_limit=2, # output_dir 内留存的检查点最大数目
    seed=5024
)

In [43]:
from transformers import Trainer
from transformers import DataCollatorForLanguageModeling

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
torch.cuda.device_count()
# 把输入偏移一位当作预测目标
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,                    # 模型实例
    args=args,             # 训练参数
    train_dataset=ds_train,         # 训练集
    eval_dataset=ds_val,            # 验证集（评估集）
    tokenizer=tokenizer,            # 分词器
    data_collator=data_collator   # data collator
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
trainer

In [None]:
trainer.train().to(device)

In [None]:
model_path = 'D:\\pro_of_program\\practical-training-projects\\model\\TinyLlamaModel'
model.save_pretrained(model_path)

In [4]:
path = 'D:\\pro_of_program\\practical-training-projects\\model\\TinyLlamaModel'
# tokenizer = tokenizer.from_pretrained(path)
model = model.from_pretrained(path)
inference(model , tokenizer , "孙悟空来啦" , max_new_tokens = 256)

NameError: name 'model' is not defined

In [37]:
torch.cuda.device_count()

1

In [4]:
from transformers import pipeline
text = '我比现在年轻十岁的时候，获得了一个游手好闲的职业'
generator = pipeline('text-generation' , model = 'D:\\pro_of_program\\practical-training-projects\\model\\chat_Tiny_Llama')
res = generator(text)
print(res)

ValueError: Could not load model D:\pro_of_program\practical-training-projects\model\chat_Tiny_Llama with any of the following classes: (<class 'transformers.models.auto.modeling_auto.AutoModelForCausalLM'>, <class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForCausalLM'>). See the original errors:

while loading with AutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "D:\Python 3.11\Lib\site-packages\transformers\pipelines\base.py", line 283, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Python 3.11\Lib\site-packages\transformers\models\auto\auto_factory.py", line 566, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, ElectraConfig, ErnieConfig, FalconConfig, FuyuConfig, GemmaConfig, GitConfig, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, JambaConfig, JetMoeConfig, LlamaConfig, MambaConfig, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, OlmoConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig.

while loading with TFAutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "D:\Python 3.11\Lib\site-packages\transformers\pipelines\base.py", line 283, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\Python 3.11\Lib\site-packages\transformers\models\auto\auto_factory.py", line 566, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: TFAutoModelForCausalLM.
Model type should be one of BertConfig, CamembertConfig, CTRLConfig, GPT2Config, GPT2Config, GPTJConfig, OpenAIGPTConfig, OPTConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoFormerConfig, TransfoXLConfig, XGLMConfig, XLMConfig, XLMRobertaConfig, XLNetConfig.




In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_id = 'D:\\pro_of_program\\practical-training-projects\\model\\chat_Tiny_Llama'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, trust_remote_code=True).to(device)

txt = '续写：孙悟空攻击三体飞船'

encode_ids = tokenizer([txt])
input_ids, attention_mask = torch.LongTensor(encode_ids['input_ids']), torch.LongTensor(encode_ids['attention_mask'])

outs = model.my_generate(
    input_ids=input_ids.to(device),
    attention_mask=attention_mask.to(device),
    max_seq_len=256,
    search_type='beam',
)

outs_txt = tokenizer.batch_decode(outs.cpu().numpy(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(outs_txt[0])

《西游记》是中国古代神话中的一部经典作品，讲述了孙悟空和他的伙伴们历经九九八十一难，历经千辛万苦，最终取得真经的故事。在这个过程中，他们经历了无数的磨难和考验，但他们从未放弃。孙悟空是一个勇敢的战士，他用自己的智慧和勇气，克服了无数的难关，最终取得了真经。
而在《三体飞船》中，孙悟空则是一个聪明、机智的外星人，他善于运用各种科技手段，与三体文明进行了一场惊心动魄的战斗。在这场激烈的战斗中，孙悟空展现了自己的勇气和智慧，战胜了三体文明的强大力量。
这就是《西游记》中的情节，它是一部充满智慧、勇气和智慧的经典作品。它不仅展示了孙悟空的勇气，更展示了他与三体文明之间的深厚友谊。
