## Fine-tuning using dora

Model: deepseek
Dataset: financialPhraseBank - from finbert 
Techniques: dora
- data augmentation (to consider if data is too little)
Using ollama

Load data

In [1]:
%tb
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    LlamaTokenizerFast,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    default_data_collator,
AutoConfig
)
import torch


import os
import pandas as pd

data_dir = (
    "/home/m/mehrad/brikiyou/scratch/ift6289/IFT6289-project/data/"
    "FinancialPhraseBank-v1.0"
)
out_dir = "data/sentiment_data"

os.makedirs(out_dir, exist_ok=True)

train_files = [
    "Sentences_75Agree_utf8.txt",
    "Sentences_AllAgree_utf8.txt",
    "augmented_Sentences_66Agree_utf8.txt",
    "augmented_Sentences_AllAgree_utf8.txt",
    "augmented_Sentences_75Agree_utf8.txt"
]
val_file = "Sentences_66Agree_utf8.txt"
test_file = "Sentences_50Agree_utf8.txt"

def load_fpbank(path):
    return pd.read_csv(
        path,
        sep="\t",
        names=["text", "label"],
        encoding="utf-8"
    )

train_dfs = []
for fname in train_files:
    full = os.path.join(data_dir, fname)
    train_dfs.append(load_fpbank(full))
train_df = pd.concat(train_dfs, ignore_index=True)

valid_df = load_fpbank(os.path.join(data_dir, val_file))
test_df  = load_fpbank(os.path.join(data_dir, test_file))

train_path = os.path.join(out_dir, "train.csv")
valid_path = os.path.join(out_dir, "validation.csv")
test_path  = os.path.join(out_dir, "test.csv")

train_df.to_csv(train_path, index=False)
valid_df.to_csv(valid_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"Wrote {len(train_df)} train examples to {train_path}")
print(f"Wrote {len(valid_df)} validation examples to {valid_path}")
print(f"Wrote {len(test_df)} test examples to {test_path}")


No traceback available to show.


Wrote 35519 train examples to data/sentiment_data/train.csv
Wrote 4217 validation examples to data/sentiment_data/validation.csv
Wrote 4846 test examples to data/sentiment_data/test.csv


### TRAINING STARTS HERE

In [2]:
from huggingface_hub import notebook_login

In [3]:
"""
import wandb
wandb.login()
"""

'\nimport wandb\nwandb.login()\n'

In [4]:
%env HF_HUB_ENABLE_HF_TRANSFER=True

env: HF_HUB_ENABLE_HF_TRANSFER=True


In [5]:
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
cache_dir = "/home/m/mehrad/brikiyou/scratch/huggingface_cache"

import os
os.environ["TRITON_CACHE_DIR"] = "/home/m/mehrad/brikiyou/scratch/triton_cache"
os.environ["TRITON_HOME"]      = "/home/m/mehrad/brikiyou/scratch/triton_home"
os.environ["CC"]  = "/scinet/balam/rocky9/software/2023a/opt/cuda-12.3.1/gcc/12.3.0/bin/gcc"
os.environ["CXX"] = "/scinet/balam/rocky9/software/2023a/opt/cuda-12.3.1/gcc/12.3.0/bin/g++"
os.environ["HF_HOME"]            = "/home/m/mehrad/brikiyou/scratch/huggingface_cache"
os.environ["HF_HUB_CACHE"]       = os.path.join(os.environ["HF_HOME"], "hub")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(os.environ["HF_HOME"], "models")
os.environ["HF_DATASETS_CACHE"]  = os.path.join(os.environ["HF_HOME"], "datasets")

cache_dir = os.environ["HF_HOME"]

data_files = {
    "train": "data/sentiment_data/train.csv",
    "validation": "data/sentiment_data/validation.csv",
    "test": "data/sentiment_data/test.csv",
}
ds = load_dataset(
    "csv",
    data_files=data_files,
    cache_dir="/home/m/mehrad/brikiyou/scratch/huggingface_cache/datasets"
)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, 
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    #quantization_config=bnb_config,
    device_map="auto",
    #trust_remote_code=False
    id2label={i: l for i, l in enumerate(ds["train"].unique("label"))},
    label2id={l: i for i, l in enumerate(ds["train"].unique("label"))},

    torch_dtype=torch.bfloat16, # if A100 GPU Available     
    cache_dir=cache_dir
)

# Look at tokenizer - to adapt to dataset
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    trust_remote_code=True
)


Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 30] Read-only file system: '/home/m/mehrad/brikiyou/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-7B'
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Prepare for Dora fine-tuning

In [7]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable() # comment out if saving vram

In [8]:
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [9]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj","o_proj","gate_proj","up_proj","down_proj", # To look for for self attention & mlp without act_fn
                    "lora_magintude_vector"], # DoRa 
    lora_dropout=0.01, # To tune
    bias="none",
    task_type="CAUSAL_LM",
    use_dora=True,
)


In [10]:
model = get_peft_model(model, peft_config)

## Tokenizer

In [11]:
os.environ["HF_METRICS_CACHE"]  = os.path.join(os.environ["HF_HOME"], "evaluate")
hub_model_id = "brikiyou/fpbank-deepseek-8b-sentiment-analysis"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    config=AutoConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
    cache_dir=cache_dir,
)
,
    trust_remote_code=True,
    cache_dir=cache_dir,
    use_fast=True,
)

if tokenizer.pad_token_id == tokenizer.eos_token_id:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

model.resize_token_embeddings(len(tokenizer))


def tokenize_batch(batch):
    out = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )
    out["labels"] = batch["label"]
    return out

for split in ["train", "validation", "test"]:
    ds[split] = ds[split].map(
        tokenize_batch,
        batched=True,
        remove_columns=["text","label"],
        num_proc=4,
    )

data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

training_args = TrainingArguments(
    output_dir=out_dir,
    do_train=True,
    do_eval=True,
    eval_steps=100,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    push_to_hub=False, # put true
    hub_model_id=hub_model_id,
    hub_token="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"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    #processing_class=DataCollatorWithPadding,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model(out_dir)                
tokenizer.save_pretrained(out_dir)          
#trainer.push_to_hub()


Map (num_proc=4):   0%|          | 0/35519 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4217 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4846 [00:00<?, ? examples/s]

ValueError: Both `tokenizer` and `processing_class` are set for `Trainer.__init__`. Using `processing_class=<class 'transformers.data.data_collator.DataCollatorWithPadding'>` and ignoring deprecated `tokenizer=LlamaTokenizerFast(name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', vocab_size=151643, model_max_length=16384, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<｜User｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151645: AddedToken("<｜Assistant｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151646: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151647: AddedToken("<|EOT|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151648: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151649: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151665: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)`.