## Uploading the Cleaned Dataset to Hugging Face with a Train/Test Split

In [None]:
!pip install huggingface_hub

from huggingface_hub import login
# Log in using your Hugging Face token
login(token="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")



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned dataset from the Colab filesystem
dataset_file = "/content/CFA-CPA_merged_augmented_v2.csv"  # Path to your cleaned dataset
df = pd.read_csv(dataset_file)

# Hold out 20% of the rows as the test split; the fixed random_state keeps
# the partition identical across runs
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Destination paths for the two splits
train_file = "/content/train.csv"
test_file = "/content/test.csv"

# Write each split as CSV without the pandas index column
for split_df, split_path in ((train_df, train_file), (test_df, test_file)):
    split_df.to_csv(split_path, index=False)

In [None]:
from huggingface_hub import create_repo, upload_file

# Target dataset repository on the Hugging Face Hub
username = "Ali-PYT"  # Replace with your username
dataset_name = "Llama-3.2_CQA"  # Replace with your dataset name
repo_id = f"{username}/{dataset_name}"

# exist_ok=True: do not fail when the repository already exists
create_repo(repo_id, repo_type="dataset", exist_ok=True)

# Upload the train split first, then the test split, each to the repo root
for local_path, remote_name in ((train_file, "train.csv"), (test_file, "test.csv")):
    upload_file(
        path_or_fileobj=local_path,
        path_in_repo=remote_name,
        repo_id=repo_id,
        repo_type="dataset",
    )

print(f"Train and test splits uploaded to: https://huggingface.co/datasets/{repo_id}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train and test splits uploaded to: https://huggingface.co/datasets/Ali-PYT/Llama-3.2_CQA


## Loading Base Llama 3.2 3B

In [None]:
%%capture
# %%capture (which must stay on the first line of the cell) hides the pip output.
# Install the released Unsloth package first.
!pip install unsloth
# Also get the latest nightly Unsloth!
# The release is removed and replaced by the current GitHub source; --no-deps
# leaves the already-installed dependencies untouched.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch

# Settings shared with later cells (tokenization and the trainer reuse max_seq_length)
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Collect the loader arguments in one place, then load the model and its tokenizer
model_load_options = {
    "model_name": "unsloth/Llama-3.2-3B-Instruct",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.11.9: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
# Configure tokenizer padding settings.
# Only fall back to the EOS token when the tokenizer has no pad token of its
# own. If padding and EOS share one id, any step that masks padding out of the
# labels also masks the genuine end-of-sequence token, so the model cannot
# learn where a response ends.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                   "gate_proj", "up_proj", "down_proj",],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
)

Unsloth 2024.11.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


## Dataset Prep

In [None]:
# Prompt template for one training example. The {instruction}, {input} and
# {output} placeholders are filled in by formatting_prompts_func via str.format;
# the fixed "Let's solve this step by step:" line precedes the target output.
cot_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Provide step-by-step reasoning before giving your final response.

### Instruction:
{instruction}

### Input:
{input}

### Response:
Let's solve this step by step:
{output}"""

def formatting_prompts_func(examples):
    """Render a batch of examples with ``cot_prompt`` and tokenize them.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" columns (the form passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output for the rendered texts, padded to the longest
        text in the batch and truncated to ``max_seq_length`` tokens.
    """
    # Every training text must end with the EOS token; without it the model
    # never sees where a response stops and tends to keep generating.
    eos_token = tokenizer.eos_token or ""

    texts = []
    for instruction, input_text, output in zip(
        examples["instruction"],
        examples["input"],
        examples["output"],
    ):
        # Missing (None) fields are rendered as empty strings
        fields = {
            "instruction": instruction,
            "input": input_text,
            "output": output,
        }
        fields = {key: ("" if value is None else value) for key, value in fields.items()}
        texts.append(cot_prompt.format(**fields) + eos_token)

    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_seq_length,
        return_tensors=None,  # keep plain Python lists
    )

In [None]:
# Prepare dataset
from datasets import load_dataset

# Fetch the train split that was uploaded to the Hub
raw_train = load_dataset("Ali-PYT/Llama-3.2_CQA", split="train")

# Tokenize in batches and drop the original text columns so that only the
# tokenizer outputs remain
dataset = raw_train.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=raw_train.column_names,
)

train.csv:   0%|          | 0.00/657k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/183k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/999 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

## Training



In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Mixed precision: use bf16 when the GPU supports it, fp16 otherwise
bf16_supported = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    # Effective batch size = 2 per device x 4 accumulation steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule
    max_steps=200,
    warmup_steps=5,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Precision
    fp16=not bf16_supported,
    bf16=bf16_supported,
    # Logging / reproducibility
    logging_steps=1,
    report_to="none",
    seed=3407,
)

# Supervised fine-tuning trainer over the tokenized dataset
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    args=training_args,
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
)

max_steps is given, it will override any value given in num_train_epochs


We also use Unsloth's `train_on_responses_only` function so that the loss is computed only on the model's responses; the prompt tokens (instruction and input) are ignored.

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Both markers must be literal text that occurs in every formatted example.
# The instruction marker is just the section header: template placeholders such
# as {instruction} are substituted when the prompt is formatted and therefore
# never appear in the tokenized text.
trainer = train_on_responses_only(
    trainer,
    instruction_part="### Instruction:\n",
    response_part="Let's solve this step by step:\n",
)

In [None]:
#@title Show current memory stats
BYTES_PER_GIB = 1024 ** 3

# Properties of GPU 0 and the peak memory reserved before training starts
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
4.768 GB of memory reserved.


In [None]:
# Run fine-tuning; the returned stats (metrics such as train_runtime) are read
# by the final memory/time report cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 999 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 200
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,0.721
2,0.6773
3,1.2911
4,1.398
5,0.673
6,0.8944
7,1.3892
8,0.7004
9,0.6356
10,0.4857


Step,Training Loss
1,0.721
2,0.6773
3,1.2911
4,1.398
5,0.673
6,0.8944
7,1.3892
8,0.7004
9,0.6356
10,0.4857


In [None]:
#@title Show final memory and time stats
train_seconds = trainer_stats.metrics['train_runtime']

# Peak reserved GPU memory overall, and the share added since the pre-training snapshot
used_memory = round(torch.cuda.max_memory_reserved() / (1024 ** 3), 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the GPU's total memory
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{train_seconds} seconds used for training.")
print(f"{round(train_seconds/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

446.5262 seconds used for training.
7.44 minutes used for training.
Peak reserved memory = 6.531 GB.
Peak reserved memory for training = 3.896 GB.
Peak reserved memory % of max memory = 44.284 %.
Peak reserved memory for training % of max memory = 26.417 %.


### GGUF / llama.cpp Conversion

In [None]:
# Export the model and tokenizer to a 16-bit (f16) GGUF in the "model" directory.
# The constant condition is a manual on/off switch for this export.
if True:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")