# 初始化模型

In [1]:
from unsloth import FastLanguageModel
import torch
# Loader settings. `max_seq_length` is reused further down when the trainer is built.
max_seq_length = 2048  # Context length; original note: any value works, RoPE scaling is handled internally.
dtype = None           # None = auto-detect; original note: Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True    # 4-bit quantization to reduce memory usage; may be set to False.

# Fetch the base checkpoint together with its tokenizer.
base_model_name = "meta-llama/Meta-Llama-3.1-8B"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

### Model parameter settings (LoRA adapter configuration)

# Projection layers that receive LoRA adapters: attention (q/k/v/o) and MLP (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with the PEFT/LoRA configuration.
# NOTE(review): lora_alpha (12) differs from the rank r (16) -- confirm this scaling is intentional.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # Any number > 0; suggested 8, 16, 32, 64, 128.
    lora_alpha=12,
    lora_dropout=0,                        # Any value is supported, but 0 is the optimized path.
    bias="none",                           # Any value is supported, but "none" is the optimized path.
    target_modules=lora_target_modules,
    # Original note: "unsloth" uses 30% less VRAM and fits 2x larger batch sizes;
    # use True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,                      # Rank-stabilized LoRA is supported but disabled here.
    loftq_config=None,                     # LoftQ is supported but not used here.
)

### Load the dataset
from datasets import load_dataset
from unsloth import standardize_sharegpt, to_sharegpt

# Read the local JSONL training file.
raw_dataset = load_dataset(
    "json",
    data_files="/home/jovyan/train.jsonl",
    split="train",
)

## Dataset consolidation
# Fold the `instruction` and `input` columns into a single prompt and convert the rows
# to ShareGPT-style conversations; the `[[...]]` part wraps the input section of the prompt.
sharegpt_dataset = to_sharegpt(
    raw_dataset,
    merged_prompt="{instruction}[[\nYour input is:\n{input}]]",
    output_column_name="output",
    conversation_extension=3,  # Select more to handle longer conversations.
)

# Normalize the conversations, then show the first record as a sanity check.
dataset = standardize_sharegpt(sharegpt_dataset)
print(dataset[0])


### Chat template settings
from unsloth import apply_chat_template

# Prompt layout with <|im_start|>/<|im_end|> role markers; {SYSTEM}, {INPUT} and {OUTPUT}
# are the placeholders handed to apply_chat_template.
chat_template = (
    "<|im_start|>system\n"
    "{SYSTEM}<|im_end|>\n"
    "<|im_start|>user\n"
    "{INPUT}<|im_end|>\n"
    "<|im_start|>assistant\n"
    "{OUTPUT}<|im_end|>"
)

# Apply the template to the dataset; the recorded output shows a new `text` field per record.
dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
    # Optional custom system prompt, currently disabled:
    #default_system_message = "你是中企动力的客服",
)
print(dataset[0])

## Training parameter setup
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe bf16 support once; fp16 is used only when bf16 is unavailable.
bf16_available = is_bfloat16_supported()

# Optimizer / schedule hyperparameters for the short fine-tuning run.
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=50,
    # num_train_epochs = 1, # For longer training runs!
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=bf16_available,
    fp16=not bf16_available,
    logging_steps=1,
    seed=3407,
)

# Supervised fine-tuning on the templated `text` column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Original note: packing can make training 5x faster for short sequences.
    args=training_args,
)

## Start training
trainer_stats = trainer.train()


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 SUPER. Max memory: 11.994 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Generating train split: 0 examples [00:00, ? examples/s]

Merging columns:   0%|          | 0/30 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/30 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/30 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/30 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/30 [00:00<?, ? examples/s]

Extending conversations:   0%|          | 0/30 [00:00<?, ? examples/s]

Standardizing format:   0%|          | 0/30 [00:00<?, ? examples/s]

Unsloth: We automatically added an EOS token to stop endless generations.


{'conversations': [{'content': '中企动力和中企：\nYour input is:\n中企和中企动力是一家公司吗？', 'role': 'user'}, {'content': '‘中企动力”简称‘中企”，是一家成立于1999年的老牌互联网公司', 'role': 'assistant'}, {'content': '中企动力售后联系方式：\nYour input is:\n中企的售后怎么联系？', 'role': 'user'}, {'content': '售后服务：400-660-5555\n应急响应专线转 (9)', 'role': 'assistant'}, {'content': '中企Saas商城有以下合作客户：\nYour input is:\n中企商城产品有哪些服务过的客户？', 'role': 'user'}, {'content': '湖南乐秀商贸有限公司、扬州老扬城食品有限公司、海门市创佳纺织品有限公司、珠海市斗门永生隆贸易有限公司、惠州市长布科技农业、扬州兴扬农产品...', 'role': 'assistant'}]}


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

{'conversations': [{'content': '中企动力和中企：\nYour input is:\n中企和中企动力是一家公司吗？', 'role': 'user'}, {'content': '‘中企动力”简称‘中企”，是一家成立于1999年的老牌互联网公司', 'role': 'assistant'}, {'content': '中企动力售后联系方式：\nYour input is:\n中企的售后怎么联系？', 'role': 'user'}, {'content': '售后服务：400-660-5555\n应急响应专线转 (9)', 'role': 'assistant'}, {'content': '中企Saas商城有以下合作客户：\nYour input is:\n中企商城产品有哪些服务过的客户？', 'role': 'user'}, {'content': '湖南乐秀商贸有限公司、扬州老扬城食品有限公司、海门市创佳纺织品有限公司、珠海市斗门永生隆贸易有限公司、惠州市长布科技农业、扬州兴扬农产品...', 'role': 'assistant'}], 'text': '<|begin_of_text|><|im_start|>system\nBelow are some instructions that describe some tasks. Write responses that appropriately complete each request.<|im_end|>\n<|im_start|>user\n中企动力和中企：\nYour input is:\n中企和中企动力是一家公司吗？<|im_end|>\n<|im_start|>assistant\n‘中企动力”简称‘中企”，是一家成立于1999年的老牌互联网公司<|im_end|><|end_of_text|>\n<|im_start|>user\n中企动力售后联系方式：\nYour input is:\n中企的售后怎么联系？<|im_end|>\n<|im_start|>assistant\n售后服务：400-660-5555\n应急响应专线转 (9)<|im_end|><|end_of_text|>\n<|im_start|>user\n中企Saas商城有以下合作客户：\

Map (num_proc=2):   0%|          | 0/30 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 30 | Num Epochs = 17
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss


# 模型测试

In [11]:
from transformers import TextStreamer

# Switch the model into Unsloth's inference mode (original note: native 2x faster inference).
FastLanguageModel.for_inference(model)

# Single-turn test conversation; edit the content to try other prompts.
messages = [
    {"role": "user", "content": "你好"},
]

# Render the conversation with the tokenizer's chat template, then move the tensor to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
)
input_ids = input_ids.to("cuda")

# Stream the generated text as it is produced, skipping the echoed prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids,
    streamer=text_streamer,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)

你好，我是中企动力的客服助手，有什么可以帮你的吗？<|im_end|><|end_of_text|>


# 保存模型，保存成 GGUF 格式

In [12]:
# Local saving: write the model and the tokenizer into the same directory.
lora_output_dir = "lora_model"
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

# GGUF export; per the recorded output this writes ./model/unsloth.Q8_0.gguf
# and an Ollama Modelfile at model/Modelfile.
gguf_output_dir = "model"
model.save_pretrained_gguf(gguf_output_dir, tokenizer)

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.59 out of 15.53 RAM for saving.


 22%|██████████████████▍                                                                 | 7/32 [00:00<00:02, 12.38it/s]We will save to Disk and not RAM now.
100%|███████████████████████████████████████████████████████████████████████████████████| 32/32 [00:37<00:00,  1.17s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at model into q8_0 GGUF format.
The output location will be ./model/unsloth.Q8_0.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> Q8_0, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: ./model/unsloth.Q8_0.gguf
Unsloth: Saved Ollama Modelfile to model/Modelfile


# 通过 Ollama 进行文件导入导出

In [13]:
import subprocess
import time

# Launch the Ollama server as a background child process and keep its handle.
ollama_server = subprocess.Popen(["ollama", "serve"])

# Wait for a few seconds for Ollama to load!
time.sleep(3)

# Show the Modelfile text stored on the tokenizer.
print(tokenizer._ollama_modelfile)

2024/09/11 10:17:00 routes.go:1125: INFO server config env="map[CUDA_VISIBLE_DEVICES:0 GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://127.0.0.1:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/home/jovyan/.ollama/models OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://*] OLLAMA_RUNNERS_DIR: OLLAMA_SCHED_SPREAD:false OLLAMA_TMPDIR: ROCR_VISIBLE_DEVICES:]"
time=2024-09-11T10:17:00.456Z level=INFO source=images.go:753 msg="total blobs: 28"
time=2024-09-11T10:17:00.506Z level=INFO source=images.go:760 msg="

FROM {__FILE_LOCATION__}

TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>{{ end }}{{ if .Prompt }}
<|im_start|>user
{{ .Prompt }}<|im_end|>{{ end }}
<|im_start|>assistant
{{ .Response }}<|im_end|><|end_of_text|>"""

PARAMETER stop "<|python_tag|>"
PARAMETER stop "<|finetune_right_pad_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"
PARAMETER stop "<|eom_id|>"
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_of_text|>"
PARAMETER stop "<|reserved_special_token_"
SYSTEM "Below are some instructions that describe some tasks. Write responses that appropriately complete each request."


In [None]:
!ollama create weitiao3 -f ./model/Modelfile
!kill -9 $(pgrep -f ollama)
print("结束")