## Fine-tuning using dora

Model: deepseek
Dataset: financialPhraseBank - from finbert 
Techniques: dora


Loading data

In [10]:
%tb
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    LlamaTokenizerFast,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    default_data_collator,
AutoConfig,
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,

)
import torch

import csv
import os
import pandas as pd
data_dir = (
    "/home/m/mehrad/brikiyou/scratch/ift6289/IFT6289-project/data/"
    "FinancialPhraseBank-v1.0"
)
out_dir = "data/sentiment_data"

os.makedirs(out_dir, exist_ok=True)
train_files = [
    "Sentences_75Agree_utf8.txt",
    "Sentences_AllAgree_utf8.txt",
    #"augmented_Sentences_66Agree_utf8.txt",
    #"augmented_Sentences_AllAgree_utf8.txt",
    #"augmented_Sentences_75Agree_utf8.txt",
    #"Sentences_66Agree_utf8.txt",
    #"augmented_Sentences_66Agree_utf8.txt"
]

test_file = "Sentences_50Agree_utf8.txt"


def load_fpbank(path):
    return pd.read_csv(
        path,
        sep='@',
        engine='python',
        names=['text', 'sentiment'],
        on_bad_lines='skip'
    )


splits = {
    "train":   pd.concat([load_fpbank(os.path.join(data_dir, f))
                          for f in train_files], ignore_index=True),
    "test":    load_fpbank(os.path.join(data_dir, test_file)),
}

for name, df in splits.items():
    # select & order columns
    df = df[["sentiment", "text"]]
    # write clean comma-delimited CSV with no index
    df.to_csv(
        os.path.join(out_dir, f"{name}.csv"),
        index=False,
        quoting=csv.QUOTE_MINIMAL
    )
    print(f"Wrote {len(df)} {name} examples")


KeyboardInterrupt: 

Wrote 5717 train examples
Wrote 4846 test examples


### TRAINING STARTS HERE

In [2]:
from huggingface_hub import notebook_login

In [3]:
"""
import wandb
wandb.login()
"""

'\nimport wandb\nwandb.login()\n'

In [4]:
%env HF_HUB_ENABLE_HF_TRANSFER=True

env: HF_HUB_ENABLE_HF_TRANSFER=True


In [5]:
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
cache_dir = "/home/m/mehrad/brikiyou/scratch/huggingface_cache"

import os
os.environ["TRITON_CACHE_DIR"] = "/home/m/mehrad/brikiyou/scratch/triton_cache"
os.environ["TRITON_HOME"]      = "/home/m/mehrad/brikiyou/scratch/triton_home"
os.environ["CC"]  = "/scinet/balam/rocky9/software/2023a/opt/cuda-12.3.1/gcc/12.3.0/bin/gcc"
os.environ["CXX"] = "/scinet/balam/rocky9/software/2023a/opt/cuda-12.3.1/gcc/12.3.0/bin/g++"
os.environ["HF_HOME"]            = "/home/m/mehrad/brikiyou/scratch/huggingface_cache"
os.environ["HF_HUB_CACHE"]       = os.path.join(os.environ["HF_HOME"], "hub")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(os.environ["HF_HOME"], "models")
os.environ["HF_DATASETS_CACHE"]  = os.path.join(os.environ["HF_HOME"], "datasets")

cache_dir = os.environ["HF_HOME"]

data_files = {
    "train": "data/sentiment_data/train.csv",
    "test": "data/sentiment_data/test.csv",
}


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, 
)
config = AutoConfig.from_pretrained(
    model_id,
    trust_remote_code=True,
    num_labels=3,
    id2label={0: "negative", 1: "neutral", 2: "positive"},
    label2id={"negative": 0, "neutral": 1, "positive": 2},
    cache_dir=cache_dir,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    cache_dir=cache_dir
)

# Look at tokenizer - to adapt to dataset
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    trust_remote_code=True
)


Could not cache non-existence of file. Will ignore error and continue. Error: [Errno 30] Read-only file system: '/home/m/mehrad/brikiyou/.cache/huggingface/hub/models--deepseek-ai--DeepSeek-R1-Distill-Qwen-7B'
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Prepare for Dora fine-tuning

In [7]:
print(model)

Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
   

In [8]:
from peft import LoraConfig, TaskType, get_peft_model
import transformers
print(transformers.__version__)

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "q_proj","v_proj","k_proj","o_proj",
        "gate_proj","up_proj","down_proj",
        "lora_magnitude_vector"  # if using DoRa
    ],
    lora_dropout=0.01,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    use_dora=True,
)


tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.save_pretrained(cache_dir)
model.save_pretrained(cache_dir)



4.51.3


KeyboardInterrupt: 

In [None]:
model = get_peft_model(model, peft_config)


### Fine-tune the model

In [None]:
from transformers import default_data_collator
import pandas as pd
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer

label2id = {"negative": 0, "neutral": 1, "positive": 2}
def encode_labels(example):
    return {"labels": label2id[example["labels"]]}

df_train = (
    pd.read_csv("data/sentiment_data/train.csv")
      .dropna(subset=["text", "sentiment"])
)
df_test = (
    pd.read_csv("data/sentiment_data/test.csv")
      .dropna(subset=["text", "sentiment"])
)

ds_train = Dataset.from_pandas(df_train, preserve_index=False)
ds_test  = Dataset.from_pandas(df_test,  preserve_index=False)

ds = DatasetDict({
    "train": ds_train,
    "test":  ds_test,
})

print(ds)
label2id = {"negative": 0, "neutral": 1, "positive": 2}

def preprocess_batch(batch):
    texts = [str(t) for t in batch["text"]]
    enc = tokenizer(
        text=texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    enc["labels"] = [label2id[s] for s in batch["sentiment"]]
    return enc

for split in ["train","test"]:
    ds[split] = ds[split].map(
        preprocess_batch,
        batched=True,
        batch_size=500,
        remove_columns=["text", "sentiment"],
        num_proc=4,
    )
data_collator = default_data_collator


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": (preds == labels).mean()}

training_args = TrainingArguments(
    output_dir=out_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_steps=50,
    save_steps=200,
    learning_rate=5e-5,
    label_names=["labels"],          
    remove_unused_columns=False,     
    #evaluation_strategy="epoch",
    logging_strategy="epoch",  
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)


trainer.train()
trainer.save_model(f"{out_dir}/model/")
tokenizer.save_pretrained(out_dir)
