In [16]:
%pip install trl bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install -U bitsandbytes transformers trl

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
os._exit(0)  # 强制重启内核

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from trl import SFTTrainer
from datasets import load_dataset, Dataset
import torch
import os
import random
import spacy
import json
from tqdm import tqdm

In [12]:
torch.cuda.is_available()

True

In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [6]:
random.seed(10)
torch.cuda.empty_cache()
local_dir = '/kaggle/working'
base_model = "meta-llama/Llama-2-7b-hf"

In [7]:
hf_access_token = "hf_aaa"
os.environ["HF_ACCESS_TOKEN"] = hf_access_token

from huggingface_hub import login

# 通过代码输入 Token
login(token=hf_access_token)

In [40]:
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,                # 使用4位量化
#     bnb_4bit_quant_type="nf4",       # 使用NF4量化类型
#     bnb_4bit_compute_dtype=torch.float16,  # 计算使用float16
#     bnb_4bit_use_double_quant=True   # 使用双重量化以进一步节省内存
# )


compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [41]:
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     quantization_config=quant_config,
#     device_map={"": 0}
# )
# model.config.use_cache = False
# model.config.pretraining_tp = 1
torch.cuda.empty_cache()               # 清理GPU缓存


model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [42]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [17]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
)



In [31]:
training_params = TrainingArguments(
    output_dir=os.path.join(local_dir, "results_100rows1"),
    num_train_epochs=8,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=2,
    evaluation_strategy="steps",  # 可选："no", "steps", "epoch"
    logging_steps=2,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    # label_names=["labels"]
)




In [21]:
with open(os.path.join('/kaggle/input/interview-100rows1', 'train_data_sorted_len_100.txt'), 'r') as fr_td:
    train_data = json.load(fr_td)
    print(len(train_data), type(train_data[0]))

# 4. 预处理数据（示例：仅选择 "text" 字段）
from copy import deepcopy
def tokenize_function(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True, max_length=2048, return_tensors="pt")
    tokenized["labels"] = deepcopy(tokenized["input_ids"]) #.copy()  # 因果LM标签
    return tokenized
train_data_tmp = Dataset.from_dict({"text": train_data})



xxx = train_data_tmp.map(tokenize_function, batched=True)


def to_device(example):
    return {
        'input_ids': torch.tensor(example['input_ids']).to(device),
        'attention_mask': torch.tensor(example['attention_mask']).to(device),
        'labels': torch.tensor(example['labels']).to(device),
    }
# 使用map应用转换
xxx = xxx.map(to_device, batched=False)


print(len(xxx), type(xxx), xxx[0])


100 <class 'str'>


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

100 <class 'datasets.arrow_dataset.Dataset'> {'text': '<s>[INST]You are a financial news editor\n\nWrite a 22-word brief on Cryptocurrency and Blockchain. [/INST] Lesser-Known Bitcoin Indicator Signals Onset of Major Bull Run\nThe "reserve-risk" indicator has been historically reliable.\ncreated at: 2023-04-18T10:04:44.147Z </s>', 'input_ids': [1, 1, 518, 25580, 29962, 3492, 526, 263, 18161, 9763, 6920, 13, 13, 6113, 263, 29871, 29906, 29906, 29899, 1742, 11473, 373, 315, 4641, 542, 10880, 322, 15658, 14153, 29889, 518, 29914, 25580, 29962, 365, 16136, 29899, 29968, 21369, 18531, 1111, 262, 1894, 20485, 9954, 1338, 1551, 842, 310, 11019, 20293, 7525, 13, 1576, 376, 690, 7143, 29899, 3780, 29895, 29908, 27717, 756, 1063, 3603, 1711, 23279, 29889, 13, 11600, 472, 29901, 29871, 29906, 29900, 29906, 29941, 29899, 29900, 29946, 29899, 29896, 29947, 29911, 29896, 29900, 29901, 29900, 29946, 29901, 29946, 29946, 29889, 29896, 29946, 29955, 29999, 29871, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [32]:
trainer = SFTTrainer(
    model=model,
    train_dataset=xxx,
    eval_dataset=xxx,
    peft_config=peft_params,
    # dataset_text_field="text",
    # max_seq_length=None,
    # tokenizer=tokenizer,
    args=training_params,
    # packing=False,
)

Truncating train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [33]:
trainer.train()

Step,Training Loss
2,2.5617
4,1.9781
6,1.679
8,1.4175
10,1.1058
12,0.9133
14,0.8256
16,0.7798
18,0.801
20,0.6803


TrainOutput(global_step=96, training_loss=0.5786462817341089, metrics={'train_runtime': 1174.5002, 'train_samples_per_second': 0.681, 'train_steps_per_second': 0.082, 'total_flos': 6133002165288960.0, 'train_loss': 0.5786462817341089})

In [None]:
/kaggle/input/interview

In [87]:
input_texts = [    "Generate a 30-word article related to Finance and Business News",
                    "Compose a news article about Bitcoin or cryptocurrency, with around 40 words.",
                    "Draft a news article on the topic of Bitcoin or cryptocurrency, about 50 words.",
                    "Write a 40-word brief on Finance and Business News.",
                    "Create a 30-word summary on Bitcoin or cryptocurrency.",
                    "Produce a 40-word analysis of Finance and Business News.",
                    "Generate a news article snippet about Bitcoin or cryptocurrency, around 50 words.",
                    "Write a 30-word commentary on Finance and Business News."]



In [88]:
# raw_model inference

raw_outputs_list = []

for input_text in input_texts:
    inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    
    # 将模型设置为评估模式
    model.eval()
    
    # 使用模型生成输出
    with torch.no_grad():
        raw_outputs = model.generate(
            **inputs,
            max_length=2048,  # 设置生成文本的最大长度
            num_return_sequences=1,  # 设置生成的文本数量
            do_sample=True,  # 是否使用采样
            top_p=0.96,  # 核采样的累积概率阈值
            temperature=0.8,  # 控制生成文本的多样性
        )
    generated_text = tokenizer.decode(raw_outputs[0], skip_special_tokens=True)
    raw_outputs_list.append(generated_text)


In [89]:
# 解码生成的输出
for i in raw_outputs_list:
    print('raw_model:', i)

raw_model: Generate a 30-word article related to Finance and Business News [2023]
 Einzeln
Generate a 30-word article related to Finance and Business News [2023]
Created by [created at 2023-04-18T18:07:10.313Z] (30 words) 
raw_model: Compose a news article about Bitcoin or cryptocurrency, with around 40 words. obviously written - generated.
The number of Bitcoin (BTC) lost in the past week has reached 500,000 BTC, according to a recent report. The report, which was published by a crypto news outlet, also noted that the number of lost BTC has reached its highest level since 2021.
created at: 2023-04-19T11:43:56.855Z 
raw_model: Draft a news article on the topic of Bitcoin or cryptocurrency, about 50 words. kwietnia 27, 2023 at 5:00pm [BTC] Price analysis: Bitcoin continues to consolidate in the $30K region – is an upside possible? 
BTC's price continues to consolidate around the $30K region. Can Bitcoin break the resistance and continue the uptrend?BTC's price continues to consolidate a

In [92]:
# peft model inference

from peft import PeftModel

qlora_finetune_model = os.path.join(local_dir, 'results_100rows1/checkpoint-96')
# 加载分词器
peft_tokenizer = AutoTokenizer.from_pretrained(qlora_finetune_model)

# 加载基础模型
# base_model_load = AutoModelForCausalLM.from_pretrained(base_model)
# raw_model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     load_in_8bit=True,  # 或者使用 load_in_4bit=True
#     device_map="auto"
# )
peft_tokenizer.pad_token = tokenizer.eos_token
# 加载微调后的模型
peftmodel = PeftModel.from_pretrained(model, qlora_finetune_model)

peft_outputs_list = []

for input_text in input_texts:
    
    # 将输入文本编码为模型输入格式
    inputs = peft_tokenizer(input_text, return_tensors="pt", padding=True).to(device)
    
    # 将模型设置为评估模式
    peftmodel.eval()
    
    # 使用模型生成输出
    with torch.no_grad():
        peft_outputs = peftmodel.generate(
            **inputs, # inputs.input_ids,
            max_length=2048,  # 设置生成文本的最大长度
            num_return_sequences=1,  # 设置生成的文本数量
            do_sample=True,  # 是否使用采样
            top_p=0.95,  # 核采样的累积概率阈值
            temperature=0.8,  # 控制生成文本的多样性
        )
        generated_text = peft_tokenizer.decode(peft_outputs[0], skip_special_tokens=True)
        peft_outputs_list.append(generated_text)


In [93]:
# 解码生成的输出
for i in peft_outputs_list:

    print('peft_model:', i)

peft_model: Generate a 30-word article related to Finance and Business News [Se also Finance and Business]
 surely created with writing skills. Generate a 30-word article related to Finance and Business News [Se also Finance and Business] created by [created at 2023-04-20T01:11:31.546Z]
The article was created automatically and may not have been edited. News of finance and business. The article was created automatically and may not have been edited. A federal court has granted a partial stay of an injunction barring enforcement of a Biden administration rule on contracting with businesses owned by women. (benzinga.com)
created at: 2023-04-20T01:11:31.546Z 
peft_model: Compose a news article about Bitcoin or cryptocurrency, with around 40 words. nobody: [High] Bitcoin's price holds above $29K as Fed rate decision looms
Bitcoin (BTC) remained rangebound around $29,000 ahead of a highly anticipated U.S. Federal Reserve interest rate decision.
created at: 2023-05-04T16:04:33.473Z 
peft_mod