In [8]:
import os
import re

import pandas as pd
import torch
from datasets import load_dataset
from IPython.display import clear_output 
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from transformers import (LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments,
AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList, BitsAndBytesConfig, pipeline)

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Base checkpoint that the fine-tuned adapter is attached to.
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Tokenizer: reuse the EOS token for padding (common when a model ships no
# dedicated pad token) and append padding on the right of each sequence.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantization: 4-bit NF4 weights, float16 compute, no double quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=False,
)

# Load the quantized base model and wrap it with the adapter saved on disk.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=quant_config,
    ),
    "./fine_tuned_model-llama",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
from langchain_core.prompts import PromptTemplate

def text_for_input(user_prompt=None, system_prompt=None):
    """Render a single-turn chat prompt that stops right after the assistant header.

    Args:
        user_prompt: Text placed in the user turn. When it is None the
            function returns the literal string "no input".
        system_prompt: Optional text for a leading system turn. When it is
            None the prompt contains only the user turn.

    Returns:
        The rendered prompt string, or "no input" when no user prompt was given.
    """
    # The pieces are joined with no newlines between them. The single space
    # after the user text (user-only layout) and after the system text
    # (system layout) is kept exactly as before.
    # NOTE(review): presumably the adapter was fine-tuned on this exact
    # layout -- confirm before changing any whitespace here.
    user_prompt_template = (
        "<|begin_of_text|>"
        "<|start_header_id|>user<|end_header_id|>"
        "{user_prompt} "
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>"
    )
    system_prompt_template = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>"
        "{system_prompt} "
        "<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>"
        "{user_prompt}"
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>"
    )

    if user_prompt is None:
        return "no input"
    # str.format inserts the values verbatim, so a value such as "{domain}"
    # survives as a placeholder for a later formatting pass.
    if system_prompt is None:
        return user_prompt_template.format(user_prompt=user_prompt)
    return system_prompt_template.format(system_prompt=system_prompt, user_prompt=user_prompt)

# System prompt used for every example: it defines the three labels
# (normal, dictionary, random) and tells the model to answer with the label only.
# The text is inserted into a format-style template further down, so it must
# not contain literal curly braces.
system_for_fine_tune="""\
You are a classification assistant tasked with categorizing a given domain name into one of three labels: normal, dictionary, or random.
Definitions:
Normal: A legitimate, standard domain name (e.g., google.com, meta.com).
Dictionary: A domain generated algorithmically using dictionary words. Examples:
    Simple: daylikesheltersoildistrict.com
    More advanced: availablesolutions.com
Random: A domain generated algorithmically as a random string of characters (e.g., wwmwqiqsicmgicwg.org).
Instructions:
Your response must contain only one of the following labels:
    normal for legitimate domains
    dictionary for dictionary-based domain generation algorithms domains
    random for randomly generated domain generation algorithms domains
Do not include any additional text or commentary.
"""
# Read the labelled domain records from the CSV file.
dataset_name = "data/balanced_record_without_llm-2.csv"
dataset = load_dataset("csv", data_files=dataset_name)

# Full example template: the chat prompt with a {domain} slot in the user turn,
# followed by the expected {label} and the end-of-turn token.
prompt_template = PromptTemplate(
    template=text_for_input(user_prompt="{domain}", system_prompt=system_for_fine_tune)
    + "{label}<|eot_id|>"
)
def new_output(example):
    """Translate the numeric "default/class" field into a text label.

    0 maps to "normal", 1 maps to "random", and every other value maps to
    "dictionary". The label is returned under the "new_class" key.
    """
    class_id = example["default/class"]
    label = "normal" if class_id == 0 else ("random" if class_id == 1 else "dictionary")
    return {"new_class": label}

def create_prompt(example):
    """Fill the module-level prompt template with this row's domain and label.

    Reads "default/domain" and "new_class" from the example and returns the
    rendered text under the "prompt" key.
    """
    rendered = prompt_template.format(
        domain=example["default/domain"],
        label=example["new_class"],
    )
    return {"prompt": rendered}

def tokenize_and_mask(example, max_length=200):
    """Tokenize a full prompt and build labels that cover only the assistant reply.

    Args:
        example: Mapping with a "prompt" string that contains the assistant
            header marker.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" list. A label equals the
        input id for tokens after the assistant header and is -100 for the
        prompt tokens and for padding positions.

    Raises:
        ValueError: If the assistant header marker is missing from the prompt.
    """
    full_text = example["prompt"]
    # Everything up to and including this marker is prompt, not answer.
    assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
    marker_index = full_text.find(assistant_marker)
    if marker_index == -1:
        raise ValueError("Assistant marker not found in prompt.")

    prompt_part = full_text[:marker_index + len(assistant_marker)]

    tokenized = tokenizer(full_text, truncation=True, max_length=max_length, padding="max_length")

    # Tokenize the prompt alone (no padding) to learn how many leading tokens to hide.
    prompt_tokenized = tokenizer(prompt_part, truncation=True, max_length=max_length)
    prompt_length = len(prompt_tokenized["input_ids"])

    # Padding must be hidden via the attention mask, not by comparing ids:
    # the pad token is the EOS token here, so an id comparison would also hide
    # the genuine end-of-turn token that closes the answer.
    labels = [
        token_id if position >= prompt_length and attended == 1 else -100
        for position, (token_id, attended) in enumerate(
            zip(tokenized["input_ids"], tokenized["attention_mask"])
        )
    ]
    tokenized["labels"] = labels
    return tokenized

def mask_answer(prompt_text):
    """Cut a prompt right after its first assistant header, dropping the answer.

    Text without the assistant header is returned unchanged.
    """
    assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
    head, found_marker, _answer = prompt_text.partition(assistant_marker)
    if not found_marker:
        return prompt_text
    return head + found_marker
# (model, tokenizer, pipeline) for the most recently used model. Only one entry
# is kept so that a replaced model is not held alive by the cache.
_cached_pipeline = None


def _get_generation_pipeline():
    """Return a text-generation pipeline for the current model/tokenizer, building it only when they change."""
    global _cached_pipeline
    if (
        _cached_pipeline is None
        or _cached_pipeline[0] is not model
        or _cached_pipeline[1] is not tokenizer
    ):
        _cached_pipeline = (
            model,
            tokenizer,
            pipeline(task="text-generation", model=model, tokenizer=tokenizer),
        )
    return _cached_pipeline[2]


def predict_fn(example):
    """Generate the label for one example and return it under the "predict" key.

    Args:
        example: Mapping with a "test_input" prompt that ends at the assistant header.

    Returns:
        {"predict": text}, where text is the run of non-whitespace characters
        that follows the assistant header in the generated output, or 'error'
        when nothing matches.
    """
    test_input = example['test_input']
    # Allow one token more than the tokenized prompt length.
    max_length = len(tokenizer(test_input)['input_ids']) + 1
    # Reuse one pipeline across examples instead of rebuilding it per row;
    # the per-example settings are passed at call time.
    pipe = _get_generation_pipeline()
    result = pipe(test_input, max_length=max_length, logits_processor=logits_processor)

    match = re.search(r"<\|start_header_id\|>assistant<\|end_header_id\|>(\S+)", result[0]['generated_text'])
    clear_output(wait=True)

    return {"predict": match.group(1) if match else 'error'}


In [None]:
# Dataset.map plays the role of DataFrame.apply: each step adds derived columns
# (text label, rendered prompt, token ids and labels, answer-free test prompt).
new_dataset = (
    dataset
    .map(new_output)
    .map(create_prompt)
    .map(tokenize_and_mask)
    .map(lambda row: {"test_input": mask_answer(row["prompt"])})
)

# Train/test split: hold out 20% of the rows, with a fixed seed for reproducibility.
splited_dataset = new_dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = splited_dataset["train"], splited_dataset["test"]

In [11]:
# Constrain generation to a fixed vocabulary by rewriting the next-token scores.
class RestrictLogitsProcessor(LogitsProcessor):
    """Logits processor that leaves only a fixed set of token ids selectable."""

    def __init__(self, allowed_tokens):
        # Token ids whose scores are passed through unchanged.
        self.allowed_tokens = allowed_tokens

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return a copy of `scores` with every non-allowed column set to -inf."""
        keep = torch.zeros_like(scores, dtype=torch.bool)
        keep[:, self.allowed_tokens] = True
        return scores.masked_fill(~keep, float('-inf'))

# Only the tokens that spell the three labels, plus the end-of-turn token,
# may be generated.
allowed_words = ["normal", "random", "dictionary", "<|eot_id|>"]
allowed_tokens = [
    token_id
    for allowed_word in allowed_words
    for token_id in tokenizer.encode(allowed_word, add_special_tokens=False)
]
logits_processor = LogitsProcessorList([RestrictLogitsProcessor(allowed_tokens)])

In [10]:
# Run the model over the prepared dataset in batches and rewrite the CSV
# checkpoint after every batch, so an interrupted run resumes where it stopped.
batch_size = 100
output_csv = "dataset_results.csv"
# NOTE(review): the metrics cells read result files from ./data/ -- confirm
# where this checkpoint is expected to end up.

if os.path.exists(output_csv):
    processed_df = pd.read_csv(output_csv)
    processed_count = len(processed_df)
    print(f"Resuming from {processed_count} samples.")
else:
    processed_df = pd.DataFrame()
    processed_count = 0
    print("Starting from scratch.")

# predict_fn reads the "test_input" column. That column exists only on the
# mapped dataset (new_dataset); the raw CSV rows in `dataset` do not have it.
# NOTE(review): this scores every row. If the adapter was fine-tuned on part of
# this file, the held-out split from the earlier cell may be the right input --
# confirm.
test_dataset = new_dataset["train"]

# Skip the rows that are already present in the checkpoint file.
remaining_dataset = test_dataset.select(range(processed_count, len(test_dataset)))

for i in range(0, len(remaining_dataset), batch_size):
    batch = remaining_dataset.select(range(i, min(i + batch_size, len(remaining_dataset))))

    batch_processed = batch.map(predict_fn)

    batch_df = batch_processed.to_pandas()
    # Avoid concatenating onto an empty frame on the first batch of a fresh run.
    if processed_df.empty:
        processed_df = batch_df
    else:
        processed_df = pd.concat([processed_df, batch_df], ignore_index=True)

    processed_df.to_csv(output_csv, index=False)
    current_total = processed_count + i + len(batch)
    print(f"Saved up to sample {current_total}")
    clear_output(wait=True)

print("All predictions processed and saved.")


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

mbaForCausalLM'].


Saved up to sample 100


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The model 'PeftMZambaForCausalLM'].
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM',

Saved up to sample 16600


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

Saved up to sample 16700


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

Saved up to sample 16800


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

Saved up to sample 16900


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

Saved up to sample 17000


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

Saved up to sample 17100


Map:   0%|          | 0/23 [00:00<?, ? examples/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausa

Saved up to sample 17123
All predictions processed and saved.


In [15]:
# Score the predictions saved for the LLM-generated dataset.
df = pd.read_csv("./data/dataset_results.csv")
y_true, y_pred = df['new_class'], df['predict']

print("Classification Report:")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# Overall accuracy plus weighted-average precision / recall / F1
# (zero_division=0 is passed to each of the three weighted metrics).
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

for title, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{title}: {value:.4f}")

Classification Report:
              precision    recall  f1-score   support

  dictionary     1.0000    0.1838    0.3105     17123
      normal     0.0000    0.0000    0.0000         0
      random     0.0000    0.0000    0.0000         0

    accuracy                         0.1838     17123
   macro avg     0.3333    0.0613    0.1035     17123
weighted avg     1.0000    0.1838    0.3105     17123

Accuracy: 0.1838
Precision: 1.0000
Recall: 0.1838
F1 Score: 0.3105


In [13]:
# Score the predictions saved for the original dataset.
df = pd.read_csv("./data/test_dataset_results_llama.csv")
y_true = df['new_class']
y_pred = df['predict']
print("Classification Report:")
# zero_division=0 matches the other metrics cell: a label that appears on only
# one side (for example an 'error' prediction) scores 0 without raising an
# UndefinedMetricWarning. The reported numbers are unchanged.
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Classification Report:
              precision    recall  f1-score   support

  dictionary     0.9605    0.9359    0.9481        78
      normal     0.9639    0.8511    0.9040        94
      random     0.8272    0.9853    0.8993        68

    accuracy                         0.9167       240
   macro avg     0.9172    0.9241    0.9171       240
weighted avg     0.9240    0.9167    0.9170       240

Accuracy: 0.9167
Precision: 0.9240
Recall: 0.9167
F1 Score: 0.9170
