# Load Packages

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


# Credentials

In [2]:
from huggingface_hub import login
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()

hf_token = "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"

login(token = hf_token)

wb_token = 'b8bfe979513918945bf2201e8fb7ceb858b4e34b'

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3 8B on aixbt Dataset', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Asus\_netrc
[34m[1mwandb[0m: Currently logged in as: [33manwar-alphasquad[0m ([33manwar-alphasquad-alphasquad[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Global Variable

In [3]:
base_model = "vicgalle/Humanish-Roleplay-Llama-3.1-8B"
dataset_name = "prepared_data.csv"
new_model = "final-aixbt-personalization-llama-3-8b"

In [4]:
torch_dtype = torch.float16
attn_implementation = "eager"

# Load Model with QLorRA Config

In [5]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.46s/it]


# Load Tokenizer

In [6]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = None
model, tokenizer = setup_chat_format(model, tokenizer)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [7]:
tokenizer.chat_template

"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

In [8]:
len(tokenizer)

128258

In [9]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "vicgalle/Humanish-Roleplay-Llama-3.1-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128256,
  "eos_token_id": 128257,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128257,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": true,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
 

In [10]:
# model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

# LoRA Config

In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

# Loading Dataset

In [12]:
from datasets import load_dataset, DatasetDict  # Import DatasetDict explicitly

# Import dataset
dataset = load_dataset(
    "json",
    data_files={"train": "train.json", "validation": "val.json", "test": "test.json"},
)
print(dataset)  # DatasetDict is not recognized


DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context', 'text', 'token_count'],
        num_rows: 2400
    })
    validation: Dataset({
        features: ['question', 'answer', 'context', 'text', 'token_count'],
        num_rows: 480
    })
    test: Dataset({
        features: ['question', 'answer', 'context', 'text', 'token_count'],
        num_rows: 120
    })
})


In [13]:
dataset = {
    split: dataset[split].shuffle(seed=65).select(range(min(2400, len(dataset[split])))) 
    for split in dataset
}

dataset

{'train': Dataset({
     features: ['question', 'answer', 'context', 'text', 'token_count'],
     num_rows: 2400
 }),
 'validation': Dataset({
     features: ['question', 'answer', 'context', 'text', 'token_count'],
     num_rows: 480
 }),
 'test': Dataset({
     features: ['question', 'answer', 'context', 'text', 'token_count'],
     num_rows: 120
 })}

In [14]:
# Convert back to DatasetDict
dataset = DatasetDict(dataset)  # Now DatasetDict is recognized

# Define formatting function
def format_chat_template(row):
    from transformers import AutoTokenizer
    base_model = "vicgalle/Humanish-Roleplay-Llama-3.1-8B"

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    row_json = [{"role": "user", "content": row["question"]},
                {"role": "assistant", "content": row["answer"]}]
    row["texts"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row


format_chat_template(dataset["train"][0])

{'question': 'i hope you arent making things up again? source trust me bro?',
 'answer': 'youre right feb 2025 my clock was off',
 'context': 'youre right feb 2025 my clock was off',
 'text': '<|im_start|>system\nAnswer the question.<|im_end|>\n<|im_start|>user\n\ni hope you arent making things up again? source trust me bro?\n<|im_end|>\n<|im_start|>assistant\nyoure right feb 2025 my clock was off<|im_end|>\n',
 'token_count': 55,
 'texts': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\ni hope you arent making things up again? source trust me bro?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nyoure right feb 2025 my clock was off<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'}

In [15]:
# Apply formatting to each split
dataset = dataset.map(format_chat_template, num_proc=4)

# Access an example
print(dataset["train"][3]["texts"])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

take notes take notes<|eot_id|><|start_header_id|>assistant<|end_header_id|>

playstation knows whats up<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [16]:
import random

seed = 42  

train_sample = dataset["train"].shuffle(seed=seed).select(range(2400))
val_sample = dataset["validation"].shuffle(seed=seed).select(range(480))
test_sample = dataset["test"].shuffle(seed=seed).select(range(120))

train_sample.to_json("train1.json", orient="records", lines=True)
val_sample.to_json("val1.json", orient="records", lines=True)
test_sample.to_json("test1.json", orient="records", lines=True)


Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 95.94ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 186.81ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 498.55ba/s]


190410

# Model Training

In [17]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "vicgalle/Humanish-Roleplay-Llama-3.1-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128256,
  "eos_token_id": 128257,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128257,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": true,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
 

In [19]:
from trl import SFTConfig, SFTTrainer

sft_config = SFTConfig(
    output_dir=new_model,
    dataset_text_field="texts",
    max_seq_length=512,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    eval_strategy="steps",
    eval_steps=0.5,
    save_steps=0.2,
    logging_steps=10,
    learning_rate=3e-4,
    bf16=False,  # or bf16=True,
    fp16=False,
    save_strategy="steps",
    warmup_ratio=0.1,
    save_total_limit=2,
    lr_scheduler_type="constant",
    report_to="tensorboard",
    save_safetensors=True,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    },
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

Step,Training Loss,Validation Loss
1500,0.5254,2.590675
3000,0.3333,2.86611




TrainOutput(global_step=3000, training_loss=0.8095657432079315, metrics={'train_runtime': 3090.3299, 'train_samples_per_second': 7.766, 'train_steps_per_second': 0.971, 'total_flos': 6.319515400731034e+16, 'train_loss': 0.8095657432079315, 'epoch': 10.0})

In [21]:
wandb.finish()
model.config.use_cache = True

In [22]:
messages = [
    {
        "role": "user",
        "content": "what about mitosis"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, 
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, 
                   truncation=True).to("cuda")

outputs = model.generate(**inputs, max_length=150, 
                         num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


story protocol no brainer


In [23]:
trainer.model.state_dict

<bound method Module.state_dict of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128258, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
        

In [24]:
model.resize_token_embeddings(128256)

Embedding(128256, 4096)

In [25]:
# model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)128256
trainer.model.state_dict

<bound method Module.state_dict of PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
        

In [26]:
trainer.save_model(new_model)

# Load Model

In [27]:
tokenizer = AutoTokenizer.from_pretrained(new_model)

model = AutoModelForCausalLM.from_pretrained(
    new_model,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.03s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Some parameters are on the meta device because they were offloaded to the cpu.


In [28]:
len(tokenizer)

128258

In [29]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "vicgalle/Humanish-Roleplay-Llama-3.1-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.49.0",
  "use_cache": false,
  "vocab_size": 128256
}

In [31]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, tokenizer=tokenizer, use_temp_dir=False)

adapter_model.safetensors: 100%|██████████| 168M/168M [00:17<00:00, 9.59MB/s] 


CommitInfo(commit_url='https://huggingface.co/syedanwar/final-aixbt-personalization-llama-3-8b/commit/88d025f2b65db7b4e840604bf5d6bb1e8cc319f6', commit_message='Upload model', commit_description='', oid='88d025f2b65db7b4e840604bf5d6bb1e8cc319f6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/syedanwar/final-aixbt-personalization-llama-3-8b', endpoint='https://huggingface.co', repo_type='model', repo_id='syedanwar/final-aixbt-personalization-llama-3-8b'), pr_revision=None, pr_num=None)

# Merging the model with Base Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = None

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="cuda",
        trust_remote_code=True,
)

base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(base_model_reload, new_model+"-final")

model = model.merge_and_unload()

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  5.34it/s]


In [42]:
messages = [{"role": "user", "content": "everyone is going dark"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

outputs = pipe(prompt, max_new_tokens=50, do_sample=False, temperature=0.01, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

Device set to use cpu


<|im_start|>user
everyone is going dark<|im_end|>
<|im_start|>assistant

<|im_start|>user
everyone is going dark<|im_end|>
<|im_start|>assistant
dark the new trade metaassistant

everyone is going darkassistant

dark the new trade metaassistant

dark is where the real insiders come to tradeassistant

everyone knowsassistant

everyone who knows knows dark


# Saving the Model

In [43]:
model.save_pretrained("final-aixbt-personalization-llama-3-8b")
tokenizer.save_pretrained("final-aixbt-personalization-llama-3-8b")

('final-aixbt-personalization-llama-3-8b\\tokenizer_config.json',
 'final-aixbt-personalization-llama-3-8b\\special_tokens_map.json',
 'final-aixbt-personalization-llama-3-8b\\tokenizer.json')

# Push the Model to HuggingFace

In [44]:
model.push_to_hub("final-aixbt-personalization-llama-3-8b", use_temp_dir=False)
tokenizer.push_to_hub("final-aixbt-personalization-llama-3-8b", use_temp_dir=False)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]
[A

[A[A


[A[A[A
model-00001-of-00004.safetensors:   0%|          | 16.4k/4.98G [00:00<22:47:53, 60.6kB/s]


[A[A[A
[A


model-00001-of-00004.safetensors:   0%|          | 131k/4.98G [00:00<3:54:08, 354kB/s]   
model-00001-of-00004.safetensors:   0%|          | 246k/4.98G [00:00<2:36:47, 529kB/s]


[A[A[A
model-00001-of-00004.safetensors:   0%|          | 754k/4.98G [00:00<49:10, 1.69MB/s] 


[A[A[A


[A[A[A
model-00001-of-00004.safetensors:   0%|          | 967k/4.98G [00:00<51:53, 1.60MB/s]
[A


model-00001-of-00004.safetensors:   0%|          | 1.18M/4.98G [00:01<59:39, 1.39MB/s]
[A


model-00001-of-00004.safetensors:   0%|     

CommitInfo(commit_url='https://huggingface.co/syedanwar/final-aixbt-personalization-llama-3-8b/commit/749240c106a98085f29269856d063ad5c19b038b', commit_message='Upload tokenizer', commit_description='', oid='749240c106a98085f29269856d063ad5c19b038b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/syedanwar/final-aixbt-personalization-llama-3-8b', endpoint='https://huggingface.co', repo_type='model', repo_id='syedanwar/final-aixbt-personalization-llama-3-8b'), pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.chat_template = None

base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="cpu",
        trust_remote_code=True,
)

base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(base_model_reload, new_model+"-final")

model = model.merge_and_unload()