In [1]:
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

from peft import (
    LoraConfig, 
    get_peft_model, 
    prepare_model_for_kbit_training,
    TaskType
    )

from datasets import load_dataset
import torch


In [2]:
# 4-bit quantization settings applied when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    # Storage format of the quantized weights.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # Dtype used for computation on top of the 4-bit weights.
    bnb_4bit_compute_dtype=torch.float16,
)

In [3]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# If HF_TOKEN is set in the environment it is used directly, so the notebook
# can run unattended (Restart & Run All). When it is unset, `token=None`
# keeps the original behaviour: the interactive login prompt is shown.
login(token=os.environ.get("HF_TOKEN"))

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
checkpoint = "huggyllama/llama-7b"

# Load the tokenizer and the base model; the weights are quantized on load
# according to `bnb_config`, and `device_map="auto"` lets accelerate decide
# where to place them.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map="auto",
)

# Quantized models need to be prepared before adapters are attached. This is
# the step the PEFT k-bit training recipe documents; the helper was imported
# at the top of the notebook but never called.
model = prepare_model_for_kbit_training(model)

# LoRA adapter configuration for causal language modelling. No
# `target_modules` is given, so PEFT's default for this architecture is used.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243


In [5]:
# Upper bound on tokens per example. Without truncation a single very long
# example can exhaust GPU memory. 2048 is LLaMA-7B's context length.
MAX_SEQ_LENGTH = 2048
# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 42

# The collator pads batches, which requires a pad token; reuse EOS for it.
# Set before any tokenizer use so the whole cell sees one consistent tokenizer.
tokenizer.pad_token_id = tokenizer.eos_token_id


def tokenize_pairs(batch):
    """Tokenize a batch of (instruction, output) pairs.

    Args:
        batch: mapping with "instructions" and "outputs" columns, as passed
            by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer's encoding of each instruction/output pair, truncated
        to at most MAX_SEQ_LENGTH tokens.
    """
    return tokenizer(
        batch["instructions"],
        batch["outputs"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )


raw_dataset = load_dataset("AlanRobotics/saiga")
tokenized_dataset = raw_dataset.map(tokenize_pairs, batched=True)
# Drop the raw text columns; only the tokenizer outputs are kept.
tokenized_dataset = tokenized_dataset.remove_columns(['instructions', 'outputs'])
# Hold out 10% of the "train" split for evaluation.
dataset = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

# mlm=False selects causal-LM style batches rather than masked-LM ones.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [6]:
import os

# SECURITY: a Hugging Face access token used to be hardcoded in this cell.
# Any token that was committed or shared with the notebook must be treated as
# leaked and revoked in the Hub account settings.
# The token is now read from the HF_TOKEN environment variable. When it is
# unset, `hub_token=None` makes the Trainer fall back to the credentials
# cached by the earlier `login()` call.
training_args = TrainingArguments(
    output_dir="llama",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Effective train batch per device = 2 * 4 = 8 examples per optimizer step.
    gradient_accumulation_steps=4,
    warmup_steps=2,
    logging_steps=100,
    save_steps=100,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    ddp_find_unused_parameters=False,
    push_to_hub=True,
    hub_token=os.environ.get("HF_TOKEN"),
)

In [7]:
# Pick the two splits produced by the train/test split of the tokenized data.
train_split = dataset["train"]
eval_split = dataset["test"]

# Assemble the Trainer from the adapter-wrapped model, the training
# arguments, the language-modelling collator and the two splits.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=collator,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell. The training
# loss is reported every `logging_steps` steps (see the table below).
trainer.train()

Step,Training Loss
100,1.1736
200,1.0979
300,1.0703
400,1.0767
500,1.0689
600,1.0534
700,1.0554
800,1.0591
900,1.0478
1000,1.0422


In [None]:
# Save the fine-tuned model to the local "llama" directory.
# NOTE(review): `model` is the object returned by `get_peft_model`, so this
# presumably writes only the LoRA adapter weights, not the base model — confirm.
model.save_pretrained("llama")

INFERENCE

In [8]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, GPT2Tokenizer, AdamW, AutoModelForCausalLM, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
from datasets import Dataset, load_dataset
import numpy as np
import json
import os


# Load the model from a training checkpoint directory and attach the adapter
# stored in the same directory.
# NOTE(review): the same path is used for both the base model and the adapter;
# confirm that loading the base model from the checkpoint directory is intended.
checkpoint = "llama/checkpoint-2000"
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = PeftModel.from_pretrained(model, checkpoint)
print(model.get_memory_footprint())

# Publish the model and tokenizer to the Hub.
# The token placeholder used to be written as `token=#`, which comments out
# the rest of the line, leaves the parenthesis unclosed and is a SyntaxError.
# The token is now read from HF_TOKEN; when that is unset, `token=None` falls
# back to the credentials cached by `login()`.
# NOTE: `tokenizer` is not created in this cell; it must already exist in the
# kernel from the training section above.
HUB_REPO_ID = 'r1char9/Llama-7b'
hub_token = os.environ.get("HF_TOKEN")
model.push_to_hub(HUB_REPO_ID, token=hub_token)
tokenizer.push_to_hub(HUB_REPO_ID, token=hub_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

26987225344


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/r1char9/Llama-7b/commit/049319f771842e9f1b0896c1c7ddee4823e4d0b9', commit_message='Upload tokenizer', commit_description='', oid='049319f771842e9f1b0896c1c7ddee4823e4d0b9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/r1char9/Llama-7b', endpoint='https://huggingface.co', repo_type='model', repo_id='r1char9/Llama-7b'), pr_revision=None, pr_num=None)

In [8]:
checkpoint = 'r1char9/Llama-7b'

# `torch_dtype` is a model-loading option. It used to be passed to the
# tokenizer, where it has no effect, so the model was loaded at full precision.
# Half precision is only requested when a GPU is available; on CPU the model
# stays in float32, matching the previous behaviour.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=model_dtype,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Token id that stops generation.
# NOTE(review): 13 is presumably the newline token in the LLaMA vocabulary,
# so the reply ends at the first line break — confirm against the tokenizer.
STOP_TOKEN_ID = 13


def create_query(prompt):
    """Generate a completion for `prompt` and print the decoded text.

    Args:
        prompt: text to continue, e.g. a "user: ...\\nbot:" dialogue turn.

    Returns:
        None. The decoded sequence (prompt plus completion, special tokens
        removed) is printed.
    """
    # Inputs must live on the same device as the model's weights.
    tokenized_sentence = tokenizer(prompt, return_tensors='pt').to(model.device)
    res = model.generate(
        **tokenized_sentence,
        max_new_tokens=256,
        eos_token_id=STOP_TOKEN_ID,
    )
    print(tokenizer.decode(res[0], skip_special_tokens=True))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
sentence = """user: Кто такой Илон Рив Маск?
bot:"""
create_query(sentence)

user: Кто такой Илон Рив Маск?
bot: Илон Рив Маск - американский предприниматель, миллиардер и филантроп. Он основал компанию "Технологическая индустрия Маск" (Tesla), которая занимается производством электромобилей и солнечных батарей. Также он является основателем и CEO компании SpaceX, которая занимается разработкой и запуском ракет-носителей и космических аппаратов.

