In [None]:
# Environment setup notes. These are reference strings, not executed commands:
# run the pip lines in a terminal inside the project's virtual environment.
"""
pip install transformers==4.44.2
pip install accelerate==0.33.0
pip install peft==0.14.0
pip install datasets
pip install sentencepiece
pip install bitsandbytes
pip install hf-xet==1.2.0
"""

# Transcript of the one-time Hugging Face login needed to download the gated
# Llama checkpoint. This must be a RAW string: the Windows paths contain
# backslashes, and `\U` (as in `C:\Users`) inside a normal string literal is an
# invalid unicode escape that makes the whole cell a SyntaxError.
r"""
(.venv) PS D:\claimpkg\claimpkg-clone> huggingface-cli login
⚠️  Warning: 'huggingface-cli login' is deprecated. Use 'hf auth login' instead.

    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token can be pasted using 'Right-Click'.
Enter your token (input will not be visible):
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `Llama-3.2-1B-TNG-token` has been saved to C:\Users\tungq\.cache\huggingface\stored_tokens
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\tungq\.cache\huggingface\token
Login successful.
The current active token is: `Llama-3.2-1B-TNG-token`
"""

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

# Hub identifier of the base checkpoint used by the rest of the notebook.
MODEL_ID = "meta-llama/Llama-3.2-1B"

# Tokenizer first; the end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# "auto" leaves device placement and weight dtype to the libraries.
load_kwargs = {"device_map": "auto", "torch_dtype": "auto"}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)

# Keep the model config consistent with the tokenizer's padding choice.
model.config.pad_token_id = tokenizer.eos_token_id

In [4]:
# Smoke test: check that the freshly loaded model can generate text end to end.

text = "Claim: Barack Obama was born in Hawaii."

# Encode as PyTorch tensors, then move them to the device the model lives on.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(model.device)

# Generate at most 20 new tokens and show the decoded first (only) sequence.
out = model.generate(**inputs, max_new_tokens=20)
decoded = tokenizer.decode(out[0], skip_special_tokens=True)
print(decoded)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Claim: Barack Obama was born in Hawaii. It's a lie.
By: The Daily Caller | August 25, 2012
A


In [5]:
from peft import LoraConfig, get_peft_model

# LoRA adapter configuration applied to the q/k/v/o projection modules.
# `task_type="CAUSAL_LM"` tells PEFT which task the adapter is trained for, so
# `get_peft_model` returns its causal-LM wrapper instead of the generic one.
lora = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)

# NOTE: this rebinds `model` to the adapter-wrapped model. Re-running this cell
# without first reloading the base model would wrap an already wrapped model.
model = get_peft_model(model, lora)
model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 1,242,630,144 || trainable%: 0.5485


In [6]:
import os
import pickle

# Directory holding the pickled fine-tuning splits: the `resources` folder one
# level above the current working directory.
DATA_DIR = os.path.join(os.getcwd(), '..', 'resources')

TRAIN_FILE = 'finetune_train_data.pickle'
TEST_FILE = 'finetune_test_data.pickle'
VALID_FILE = 'finetune_validation_data.pickle'

TRAIN_FILE_PATH = os.path.join(DATA_DIR, TRAIN_FILE)
TEST_FILE_PATH = os.path.join(DATA_DIR, TEST_FILE)
VALID_FILE_PATH = os.path.join(DATA_DIR, VALID_FILE)


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Raises whatever `open` / `pickle.load` raise (e.g. FileNotFoundError when
    the file is missing), so a bad path fails loudly instead of leaving a
    `None` placeholder behind.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing.
    # Only point this at files produced by this project / a trusted source.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load the three splits.
train_data = _load_pickle(TRAIN_FILE_PATH)
test_data = _load_pickle(TEST_FILE_PATH)
valid_data = _load_pickle(VALID_FILE_PATH)

In [7]:
from datasets import Dataset

# Wrap each loaded split in a `datasets.Dataset` (train, validation, test order).
train_ds, val_ds, test_ds = (
    Dataset.from_list(records)
    for records in (train_data, valid_data, test_data)
)

def format_fn(example):
    """Add the model prompt and the target text to a single example.

    Reads `example["input"]` and `example["output"]`, writes two new keys on
    the same mapping and returns it:
      - "prompt": the claim wrapped in the instruction template
      - "labels": the raw target text (a copy of "output")
    """
    claim = example['input']
    target = example["output"]
    example["prompt"] = f"Claim: {claim}\nGenerate pseudo-subgraph:\n"
    example["labels"] = target
    return example

# Add the "prompt" / "labels" columns to every split (one example at a time).
train_ds, val_ds, test_ds = (
    split.map(format_fn)
    for split in (train_ds, val_ds, test_ds)
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
def tokenize_fn(batch, max_length=512, tok=None):
    """
    Tokenize a batch of prompt/target pairs for causal LM fine-tuning.

    Each training sequence is `prompt + target + EOS`. The returned `labels`
    copy `input_ids` but use -100 (ignored by the loss) on:
      * every padding position (attention_mask == 0), and
      * the prompt tokens,
    so the loss is computed only on the target tokens and the single EOS that
    terminates them.

    Args:
        batch: mapping with "prompt" and "labels" columns, each a list of str.
        max_length: sequences are truncated / padded to exactly this many
            tokens (default 512, chosen to limit memory use).
        tok: tokenizer to use; defaults to the notebook-level `tokenizer`.

    Returns:
        The tokenizer output (input_ids, attention_mask, ...) with an added
        "labels" entry: one list of ints per example.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer loaded with the model

    prompts = list(batch["prompt"])
    targets = list(batch["labels"])
    if not prompts:
        return {"input_ids": [], "attention_mask": [], "labels": []}

    # Append EOS so the model is explicitly trained to stop after the target.
    # Padding can no longer play that role because pad positions are masked.
    eos = tok.eos_token or ""
    full_texts = [prompt + target + eos for prompt, target in zip(prompts, targets)]

    # Token count of each prompt on its own (no padding, no truncation) tells
    # us how many leading real tokens to exclude from the loss.
    # NOTE(review): assumes tokenizing the prompt alone yields the same tokens
    # as the prompt prefix of the full text (no merge across the boundary).
    prompt_lengths = [
        len(ids)
        for ids in tok(prompts, truncation=False, padding=False)["input_ids"]
    ]

    tokenized = tok(
        full_texts,
        truncation=True,
        padding="max_length",  # uniform length so batches stack directly
        max_length=max_length,
        return_tensors=None,  # plain lists; Trainer converts to tensors
    )

    labels = []
    for ids, mask, prompt_len in zip(
        tokenized["input_ids"], tokenized["attention_mask"], prompt_lengths
    ):
        mask = list(mask)
        # Index of the first real token: 0 with right padding, >0 with left
        # padding. Locating it keeps the prompt mask correct for either side.
        first_real = mask.index(1) if 1 in mask else len(mask)
        prompt_end = min(first_real + prompt_len, len(ids))
        labels.append([
            token_id if (keep == 1 and pos >= prompt_end) else -100
            for pos, (token_id, keep) in enumerate(zip(ids, mask))
        ])

    tokenized["labels"] = labels

    return tokenized

# Tokenize the train and validation splits in batches, dropping every original
# column so only the tokenizer outputs and labels remain.
train_tok, val_tok = (
    split.map(tokenize_fn, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Training hyperparameters. Checkpoints and logs are written under "output".
args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step per device
    num_train_epochs=2,
    fp16=True,
    logging_steps=20,
    save_steps=200,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the supported name in the pinned transformers 4.44.2.
    eval_strategy="epoch",
)

# Fine-tune on the tokenized train split and evaluate on the validation split
# at the end of every epoch.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
)

trainer.train()




  0%|          | 0/624 [00:00<?, ?it/s]



In [None]:
# Record the installed transformers version next to the results; the setup
# notes at the top of the notebook pin transformers==4.44.2.
import transformers


# Bare last expression so the notebook displays the version string.
transformers.__version__

'4.44.2'