In [1]:
import torch
from transformers import (LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments,
AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList, BitsAndBytesConfig, pipeline)
import torch
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from dotenv import load_dotenv
import os
from torchtune.modules.peft import LoRALinear
from trl import SFTTrainer, SFTConfig

import error: No module named 'triton'


In [14]:
load_dotenv()
hugging_face_token = os.getenv("HUGGING_FACE_TOKEN")

token index for:

normal: [8416]

dictionary dga: [36771, 294, 6885] -> dictionary/d/ga

random dga: [11719, 294, 6885] -> random/d/ga

In [15]:
# load the model
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Some models don't have a dedicated padding token, so using the EOS token as a pad token is a common practice
tokenizer.padding_side = "right" # configures the tokenizer to add padding to the right side of the sequences

# quantization_config
quant_config = BitsAndBytesConfig(load_in_4bit=True,  # model is quantizied to 4bit format
                                  bnb_4bit_quant_type="nf4",  # using normalized float 4 method to quantizied
                                  bnb_4bit_compute_dtype="float16",  # computing with 16 bit float
                                  bnb_4bit_use_double_quant=False)  # no double quantizie

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config = quant_config,
    token=hugging_face_token
)

model.config.use_cache = False # not using KV cache as it might require more memory
model.config.pretraining_tp = 1 # not using tensor parallelism

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# making model generate fixed output by modifying the output token choice
class RestrictLogitsProcessor(LogitsProcessor):
    def __init__(self, allowed_tokens):
        self.allowed_tokens = allowed_tokens  # allowed token
    
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # making all token except allowed token to -inf
        restricted_scores = torch.full_like(scores, float('-inf'))
        restricted_scores[:, self.allowed_tokens] = scores[:, self.allowed_tokens]
        return restricted_scores

# restrict the model word choice to only these 3 token are allowed to generate
allowed_words = ["normal", "random", "dictionary", "<|eot_id|>"]
allowed_tokens = []
for word in allowed_words:
    token_ids = tokenizer.encode(word, add_special_tokens=False)
    allowed_tokens.extend(token_ids)
logits_processor = LogitsProcessorList()
logits_processor.append(RestrictLogitsProcessor(allowed_tokens))

In [17]:
tokenizer

PreTrainedTokenizerFast(name_or_path='meta-llama/Llama-3.2-3B-Instruct', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>', 'pad_token': '<|eot_id|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|r

In [18]:
from langchain_core.prompts import PromptTemplate

def text_for_input(user_prompt=None, system_prompt=None):
    user_prompt_template="""\
<|begin_of_text|>\
<|start_header_id|>user<|end_header_id|>\
{user_prompt} \
<|eot_id|>\
<|start_header_id|>assistant<|end_header_id|>\
"""
    system_prompt_template="""\
<|begin_of_text|>\
<|start_header_id|>system<|end_header_id|>\
{system_prompt} \
<|eot_id|>\
<|start_header_id|>user<|end_header_id|>\
{user_prompt}\
<|eot_id|>\
<|start_header_id|>assistant<|end_header_id|>\
"""
    if user_prompt != None and system_prompt == None:
        prompt_object = PromptTemplate(template=user_prompt_template)
        return prompt_object.format(user_prompt=user_prompt)
    elif user_prompt != None and system_prompt != None:
        prompt_object = PromptTemplate(template=system_prompt_template)
        return prompt_object.format(system_prompt=system_prompt, user_prompt=user_prompt)
    return "no input"

In [19]:
system_for_fine_tune="""\
You are a classification assistant tasked with categorizing a given domain name into one of three labels: normal, dictionary, or random.
Definitions:
Normal: A legitimate, standard domain name (e.g., google.com, meta.com).
Dictionary: A domain generated algorithmically using dictionary words. Examples:
    Simple: daylikesheltersoildistrict.com
    More advanced: availablesolutions.com
Random: A domain generated algorithmically as a random string of characters (e.g., wwmwqiqsicmgicwg.org).
Instructions:
Your response must contain only one of the following labels:
    normal for legitimate domains
    dictionary for dictionary-based domain generation algorithms domains
    random for randomly generated domain generation algorithms domains
Do not include any additional text or commentary.
"""

In [8]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e

In [20]:
# load the data
from datasets import load_dataset
dataset_name = "data/balanced_record_without_llm-2.csv"
dataset = load_dataset("csv", data_files=dataset_name)

prompt_template = text_for_input(user_prompt="{domain}", system_prompt=system_for_fine_tune) + "{label}<|eot_id|>"
prompt_template = PromptTemplate(template=prompt_template)
def new_output(example):
    if example["default/class"] == 0:
        return{"new_class": "normal"}
    elif example["default/class"] == 1:
        return{"new_class": "random"}
    else:
        return{"new_class": "dictionary"}

def create_prompt(example):
    return {
        "prompt": prompt_template.format(
            domain=example["default/domain"],
            label=example["new_class"]
        )
    }

def tokenize_and_mask(example):
    full_text = example["prompt"]
    # we only want to know the part that assistant output
    assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
    marker_index = full_text.find(assistant_marker)
    if marker_index == -1:
        raise ValueError("Assistant marker not found in prompt.")
    
    prompt_part = full_text[:marker_index + len(assistant_marker)]
    
    tokenized = tokenizer(full_text, truncation=True, max_length=200, padding="max_length")
    
    prompt_tokenized = tokenizer(prompt_part, truncation=True, max_length=200)
    prompt_length = len(prompt_tokenized["input_ids"])
    
    labels = tokenized["input_ids"].copy()
    labels[:prompt_length] = [-100] * prompt_length
    tokenized["labels"] = labels
    return tokenized

# map is the apply in pd
new_dataset = dataset.map(new_output)
new_dataset = new_dataset.map(create_prompt)
new_dataset = new_dataset.map(tokenize_and_mask)
new_dataset = new_dataset.remove_columns(['default/domain', 'default/label', 'default/class', 'new_class',])
# train test splite
splited_dataset = new_dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = splited_dataset["train"]
test_dataset = splited_dataset["test"]

In [21]:
train_dataset

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 960
})

In [22]:
prompt=train_dataset['prompt'][0]
input_ids=train_dataset['input_ids'][0]
attention_mask=train_dataset['attention_mask'][0]
labels=train_dataset['labels'][0]

print('this is a demo input, where the original prompt is:\n', prompt)
print('-----------------\n and its tokenization format is:\n',tokenizer(prompt))
print('-----------------\n then its input became:\n',input_ids)
print('-----------------\n then its output became:\n',labels)
print('-----------------\n then its attention:\n',attention_mask)

this is a demo input, where the original prompt is:
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a classification assistant tasked with categorizing a given domain name into one of three labels: normal, dictionary, or random.
Definitions:
Normal: A legitimate, standard domain name (e.g., google.com, meta.com).
Dictionary: A domain generated algorithmically using dictionary words. Examples:
    Simple: daylikesheltersoildistrict.com
    More advanced: availablesolutions.com
Random: A domain generated algorithmically as a random string of characters (e.g., wwmwqiqsicmgicwg.org).
Instructions:
Your response must contain only one of the following labels:
    normal for legitimate domains
    dictionary for dictionary-based domain generation algorithms domains
    random for randomly generated domain generation algorithms domains
Do not include any additional text or commentary.
 <|eot_id|><|start_header_id|>user<|end_header_id|>medicineconsultquitavoid.com<|eot_id|><

In [23]:
# lora
lora_config = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,  # text classification
    target_modules=["v_proj", "embed_tokens"]
)

# training_arguments = TrainingArguments(
#     output_dir="./results", # directory to store the model predictions and checkpoints
#     num_train_epochs=5, # number of epochs
#     per_device_train_batch_size=4, # batch size for training per GPU or core CPU
#     per_device_eval_batch_size=4, # batch size for evaluating per GPU or core CPU
#     gradient_accumulation_steps=1, # number of updates steps to accumulate the gradients for, before performing a backward/update pass
#     optim="paged_adamw_32bit", # the optimizer to use
#     save_steps=50, # number of updates steps before two checkpoint saves
#     logging_steps=50, # number of update steps between two logs
#     learning_rate=2e-4, # 0.0002 initial learning rate for [`AdamW`] optimizer
#     lr_scheduler_type="constant", # the scheduler type to use
#     warmup_ratio=0.03, # ratio of total training steps used for a linear warmup from 0 to `learning_rate`
#     weight_decay=0.001, # the weight decay to apply to all layers except all bias and LayerNorm weights in [`AdamW`] optimizer
#     fp16=False, # whether to use fp16 16-bit (mixed) precision training instead of 32-bit training
#     bf16=False, # whether to use bfp16 16-bit (mixed) precision training instead of 32-bit training
#     max_grad_norm=0.3, # maximum gradient norm (for gradient clipping)
#     max_steps=-1, # if set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`    group_by_length=True, # Whether or not to group together samples of roughly the same length in the training dataset
#     report_to="tensorboard", # the list of integrations to report the results and logs to - [wandb, mlflow, comet_ml, neptune...]
# )

# training argument from: src: https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py
training_arguments = SFTConfig(
    output_dir="./llama-results-with-1200records-ds", # directory to store the model predictions and checkpoints
    num_train_epochs=5, # number of epochs
    per_device_train_batch_size=2, # batch size for training per GPU or core CPU
    per_device_eval_batch_size=2, # batch size for evaluating per GPU or core CPU
    gradient_accumulation_steps=1, # number of updates steps to accumulate the gradients for, before performing a backward/update pass
    optim="paged_adamw_32bit", # the optimizer to use
    save_steps=10, # number of updates steps before two checkpoint saves
    save_total_limit=5,
    logging_steps=50, # number of update steps between two logs
    learning_rate=2e-4, # 0.0002 initial learning rate for [`AdamW`] optimizer
    lr_scheduler_type="constant", # the scheduler type to use
    warmup_ratio=0.03, # ratio of total training steps used for a linear warmup from 0 to `learning_rate`
    weight_decay=0.001, # the weight decay to apply to all layers except all bias and LayerNorm weights in [`AdamW`] optimizer
    # fp16=True, # whether to use fp16 16-bit (mixed) precision training instead of 32-bit training
    # bf16=False, # whether to use bfp16 16-bit (mixed) precision training instead of 32-bit training
    max_grad_norm=0.3, # maximum gradient norm (for gradient clipping)
    max_steps=-1, # if set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`    group_by_length=True, # Whether or not to group together samples of roughly the same length in the training dataset
    report_to="tensorboard", # the list of integrations to report the results and logs to - [wandb, mlflow, comet_ml, neptune...]
    max_seq_length=200, # max
)

trainer = SFTTrainer(
    model=model, # model to train
    train_dataset=train_dataset, # the training dataset
    eval_dataset=test_dataset, # the evaluation dataset
    peft_config=lora_config, # from LoRA Configuration
    tokenizer=tokenizer, # model tokenizer
    args=training_arguments, # the training parameters
)

  trainer = SFTTrainer(


In [24]:
total_params = sum(p.numel() for p in trainer.model.parameters())
print("Total parameters:", total_params)

trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
print("Trainable parameters:", trainable_params)

Total parameters: 1805431808
Trainable parameters: 1968128


In [26]:
trainer.train('./llama-results-with-1200records-ds/checkpoint-2390')

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss
2400,0.2422




TrainOutput(global_step=2400, training_loss=0.001009326974550883, metrics={'train_runtime': 13.2216, 'train_samples_per_second': 363.041, 'train_steps_per_second': 181.521, 'total_flos': 1.62473213952e+16, 'train_loss': 0.001009326974550883, 'epoch': 5.0})

In [27]:
# Save trained model
trainer.save_model("./fine_tuned_model-llama")
tokenizer.save_pretrained("./fine_tuned_model-llama")



('./fine_tuned_model-llama\\tokenizer_config.json',
 './fine_tuned_model-llama\\special_tokens_map.json',
 './fine_tuned_model-llama\\tokenizer.json')

In [28]:
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

Evaluation results: {'eval_loss': 0.24886220693588257, 'eval_runtime': 57.5681, 'eval_samples_per_second': 4.169, 'eval_steps_per_second': 2.084, 'epoch': 5.0}


In [3]:
# load the model
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Some models don't have a dedicated padding token, so using the EOS token as a pad token is a common practice
tokenizer.padding_side = "right" # configures the tokenizer to add padding to the right side of the sequences

# quantization_config
quant_config = BitsAndBytesConfig(load_in_4bit=True,  # model is quantizied to 4bit format
                                  bnb_4bit_quant_type="nf4",  # using normalized float 4 method to quantizied
                                  bnb_4bit_compute_dtype="float16",  # computing with 16 bit float
                                  bnb_4bit_use_double_quant=False)  # no double quantizie

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config = quant_config,
)
model = PeftModel.from_pretrained(model, "./fine_tuned_model-llama")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]