In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer,AutoModelForCausalLM,DataCollatorForLanguageModeling,TrainingArguments,Trainer
import torch



In [2]:
data = pd.read_excel("augumentation.xlsx")

In [3]:
ds = Dataset.from_pandas(data)

In [4]:
ds = ds.select([i for i,x in enumerate(data["description"]) if len(x)>=50])

In [5]:
model_name = "bigscience/bloomz-1b7"
tokenizer_name = "bigscience/bloomz-1b7"

In [26]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,trust_remote_code=True,padding='max_length')

def preprocess(examples):
    if tokenizer.eos_token is not None:
        contents = [e + tokenizer.eos_token for e in examples["description"]]
    else:
        contents = examples["description"]
    return tokenizer(contents,max_length = 200)

tokenized_ds = ds.map(preprocess,batched=True,remove_columns = ds.column_names)

Map:   0%|          | 0/19884 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
base_model = AutoModelForCausalLM.from_pretrained(model_name,trust_remote_code=True, low_cpu_mem_usage=True, 
                                             torch_dtype=torch.half, device_map="auto", load_in_4bit=True, bnb_4bit_compute_dtype=torch.half,
                                             bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)

In [8]:
base_model.enable_input_require_grads()

In [9]:
for name,value in base_model.named_parameters():
    print(name)

transformer.word_embeddings.weight
transformer.word_embeddings_layernorm.weight
transformer.word_embeddings_layernorm.bias
transformer.h.0.input_layernorm.weight
transformer.h.0.input_layernorm.bias
transformer.h.0.self_attention.query_key_value.weight
transformer.h.0.self_attention.query_key_value.bias
transformer.h.0.self_attention.dense.weight
transformer.h.0.self_attention.dense.bias
transformer.h.0.post_attention_layernorm.weight
transformer.h.0.post_attention_layernorm.bias
transformer.h.0.mlp.dense_h_to_4h.weight
transformer.h.0.mlp.dense_h_to_4h.bias
transformer.h.0.mlp.dense_4h_to_h.weight
transformer.h.0.mlp.dense_4h_to_h.bias
transformer.h.1.input_layernorm.weight
transformer.h.1.input_layernorm.bias
transformer.h.1.self_attention.query_key_value.weight
transformer.h.1.self_attention.query_key_value.bias
transformer.h.1.self_attention.dense.weight
transformer.h.1.self_attention.dense.bias
transformer.h.1.post_attention_layernorm.weight
transformer.h.1.post_attention_layernor

In [10]:
from peft import LoraConfig, TaskType, get_peft_model

In [11]:
lora_config = LoraConfig(
        r=8,
        target_modules="transformer.*query_key_value",
        lora_dropout=0.05,
        task_type=TaskType.CAUSAL_LM
    )

In [12]:
model = get_peft_model(base_model,lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,723,981,824 || trainable%: 0.09123437254985815


In [101]:
dft = LoraConfig(task_type=TaskType.CAUSAL_LM)

In [103]:
dft.lora_dropout

0.0

In [13]:
train_args = TrainingArguments(
    output_dir="test_for_casual",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    logging_steps=10,
    num_train_epochs=1,
    gradient_checkpointing=True,
    save_total_limit=1
    # learning_rate=5e-6
)

In [14]:
trainer = Trainer(
    args = train_args,
    model = model,
    train_dataset = tokenized_ds,
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)
)

In [15]:
trainer.train()

  0%|          | 0/621 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 5.6726, 'learning_rate': 4.919484702093398e-05, 'epoch': 0.02}
{'loss': 5.5004, 'learning_rate': 4.8389694041867956e-05, 'epoch': 0.03}
{'loss': 5.4406, 'learning_rate': 4.7584541062801933e-05, 'epoch': 0.05}
{'loss': 5.3874, 'learning_rate': 4.677938808373592e-05, 'epoch': 0.06}
{'loss': 5.3237, 'learning_rate': 4.597423510466989e-05, 'epoch': 0.08}
{'loss': 5.2642, 'learning_rate': 4.5169082125603865e-05, 'epoch': 0.1}
{'loss': 5.242, 'learning_rate': 4.436392914653785e-05, 'epoch': 0.11}
{'loss': 5.1922, 'learning_rate': 4.355877616747182e-05, 'epoch': 0.13}
{'loss': 5.1743, 'learning_rate': 4.27536231884058e-05, 'epoch': 0.14}
{'loss': 5.1141, 'learning_rate': 4.194847020933977e-05, 'epoch': 0.16}
{'loss': 5.1702, 'learning_rate': 4.1143317230273756e-05, 'epoch': 0.18}
{'loss': 5.0816, 'learning_rate': 4.0338164251207733e-05, 'epoch': 0.19}
{'loss': 5.1149, 'learning_rate': 3.9533011272141704e-05, 'epoch': 0.21}
{'loss': 5.0579, 'learning_rate': 3.872785829307569e-05, 'epo

Checkpoint destination directory test_for_casual\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 4.9603, 'learning_rate': 9.742351046698874e-06, 'epoch': 0.8}




{'loss': 4.9515, 'learning_rate': 8.93719806763285e-06, 'epoch': 0.82}
{'loss': 4.9885, 'learning_rate': 8.132045088566828e-06, 'epoch': 0.84}
{'loss': 4.9012, 'learning_rate': 7.326892109500806e-06, 'epoch': 0.85}
{'loss': 4.9433, 'learning_rate': 6.521739130434783e-06, 'epoch': 0.87}
{'loss': 4.9495, 'learning_rate': 5.71658615136876e-06, 'epoch': 0.89}
{'loss': 4.9086, 'learning_rate': 4.911433172302738e-06, 'epoch': 0.9}
{'loss': 4.8918, 'learning_rate': 4.106280193236716e-06, 'epoch': 0.92}
{'loss': 4.9231, 'learning_rate': 3.3011272141706927e-06, 'epoch': 0.93}
{'loss': 4.9267, 'learning_rate': 2.49597423510467e-06, 'epoch': 0.95}
{'loss': 4.9328, 'learning_rate': 1.6908212560386474e-06, 'epoch': 0.97}
{'loss': 4.9796, 'learning_rate': 8.856682769726248e-07, 'epoch': 0.98}
{'loss': 4.9031, 'learning_rate': 8.051529790660226e-08, 'epoch': 1.0}
{'train_runtime': 1741.7308, 'train_samples_per_second': 11.416, 'train_steps_per_second': 0.357, 'train_loss': 5.033601180366848, 'epoch':

TrainOutput(global_step=621, training_loss=5.033601180366848, metrics={'train_runtime': 1741.7308, 'train_samples_per_second': 11.416, 'train_steps_per_second': 0.357, 'train_loss': 5.033601180366848, 'epoch': 1.0})

In [17]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model.get_base_model(), tokenizer=tokenizer)

In [28]:
base_model = model.get_base_model()

In [67]:
ipt = tokenizer("在不死教堂钟楼守护苏醒之钟的法术生物石像鬼的头盔。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=200,no_repeat_ngram_size=1)[0])

'在不死教堂钟楼守护苏醒之钟的法术生物石像鬼的头盔。法力值为零的他们，在失去生命后无法恢复到原本的状态下使用此物会有所伤害 。</s>'

In [68]:
ipt = tokenizer("来历不明的异端咒术，能催发猛毒雾。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=200,no_repeat_ngram_size=1)[0])

'来历不明的异端咒术，能催发猛毒雾。在施展后可以产生一种巨大的力量和火焰的混合体——火之魔力！这股力量的来源是来自一个古老的传说：一位被诅咒的人用他的灵魂创造了世界；然后他将其释放到空中并变成一团燃烧物而亡了</s>'

In [66]:
ipt = tokenizer("来历不明的异端咒术，能够投出带有毒气的火球。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=200,no_repeat_ngram_size=1,do_sample=False)[0])

'来历不明的异端咒术，能够投出带有毒气的火球。据说这把武器是来自一位神秘的魔法师的手中而制成；虽然他并没有留下任何痕迹但他的传说却流传了下来</s>'

In [70]:
ipt = tokenizer("虚铠本身拥有的直剑，剑身有热度，具有火之力。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=200,no_repeat_ngram_size=1,do_sample=False)[0])

'虚铠本身拥有的直剑，剑身有热度，具有火之力。虽然是铁质制成但并不坚硬、不耐用；相反地却非常轻巧且容易移动使用 。</s>'

In [72]:
ipt = tokenizer("神秘咒术师独创的魔法，能够像奇迹那样投掷枪形火焰",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=200,no_repeat_ngram_size=1,do_sample=False)[0])

'神秘咒术师独创的魔法，能够像奇迹那样投掷枪形火焰。据说这把火能将敌人烧成灰烬或燃烧到死为止！</s>'

In [78]:
ipt = tokenizer("将火焰纳入连同自身在内，周遭多人的体内。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=200,no_repeat_ngram_size=1,do_sample=False)[0])

'将火焰纳入连同自身在内，周遭多人的体内。在火中燃烧的灵魂会变成灰烬而离开身体；当被烧伤时可以恢复生命值和力量等属性（但无法获得任何经验）</s>'

In [86]:
ipt = tokenizer("伊扎里斯禁术，将火焰纳入多人的体内让他们暂时提升攻击力，但血量将持续减少。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=256,do_sample=False,no_repeat_ngram_size=1)[0])

'伊扎里斯禁术，将火焰纳入多人的体内让他们暂时提升攻击力，但血量将持续减少。这门魔法是许多古代神灵的遗产之一——他们相信只有拥有火的力量才能达到真正的力量和智慧</s>'

In [87]:
ipt = tokenizer("伊扎里斯咒术中最为可怖的一个。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=256,do_sample=False,no_repeat_ngram_size=1)[0])

'伊扎里斯咒术中最为可怖的一个。它以一种非常强大的力量，将人变成一个巨大的铁块或石头；然后会像被施了魔法一样地失去所有意识和生命力！</s>'

In [88]:
ipt = tokenizer("伊扎里斯咒术中最为可怖的一个。将自己的生命化为火焰钻入敌人身体内部。",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=256,do_sample=False,no_repeat_ngram_size=1)[0])

'伊扎里斯咒术中最为可怖的一个。将自己的生命化为火焰钻入敌人身体内部。虽然这招威力巨大，但会消耗大量魔力值和灵魂数来恢复力量与精神的平衡度；因此对那些需要使用魔法的人来说是绝对不可取的手段之一</s>'

In [93]:
ipt = tokenizer("能散发出持续侵蚀血量的白雾",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=256,do_sample=False,no_repeat_ngram_size=1)[0])

'能散发出持续侵蚀血量的白雾。据说，只要有这种药剂的灵魂就会被腐蚀而亡；因此它是一种非常强大的武器！</s>'

In [98]:
ipt = tokenizer("无头骑士的长枪",return_tensors="pt").to(base_model.device)
tokenizer.decode(model.generate(**ipt,max_length=256,do_sample=False,no_repeat_ngram_size=1)[0])

'无头骑士的长枪。长剑的刀刃是巨大的，但武器本身却非常轻巧、坚固有力；这把武器的威力很大而攻击范围也大到无法想象的程度！</s>'