<a href="https://colab.research.google.com/github/superasymmetry/Desktop-Helper-app/blob/main/FineTuning_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers
!pip install peft
!pip install evaluate
!pip install -U bitsandbytes



In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import bitsandbytes
import evaluate
import torch
import numpy as np

# dataset

In [3]:
# Stream the SuperAGI/GUIDE dataset from the Hugging Face Hub.
# (An earlier version of this notebook loaded GLUE SST-2 here instead.)
dataset = load_dataset(path="SuperAGI/GUIDE", streaming=True)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/33 [00:00<?, ?it/s]

IterableDatasetDict({
    train: IterableDataset({
        features: ['previousAction', 'previousActionHistory', 'workflow', 'question', 'answer', 'cot', 'image'],
        num_shards: 33
    })
})

In [4]:
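# Label-balance check left over from the SST-2 version of this notebook.
# It stays disabled: the streamed GUIDE dataset has no 'label' column and does not
# support index access (running it produced the NotImplementedError shown below).
# np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])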

NotImplementedError: Subclasses of Dataset should implement __getitem__.

# model

In [None]:
# Load the Phi-3 vision checkpoint as a causal language model, plus its tokenizer.
# The checkpoint name is defined once and reused so model and tokenizer cannot drift apart.
model_checkpoint = 'microsoft/Phi-3-vision-128k-instruct'

model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    device_map="auto",             # let the library decide where to place the weights
    trust_remote_code=True,        # allow the modeling code shipped with the checkpoint to run
    torch_dtype="auto",            # use the dtype stored in the checkpoint
    _attn_implementation="eager",  # plain attention implementation
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# display the model's architecture (module tree); useful for looking up layer names
model

# preprocess data

In [None]:
# The tokenizer was already created together with the model; the call below is
# kept commented out from an earlier version of the notebook.
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add a pad token if the tokenizer does not define one, then resize the model's
# token embeddings to the new vocabulary size so the added token has an embedding
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# Text columns of the dataset, in the order in which they are concatenated.
_TEXT_COLUMNS = (
    "previousAction",
    "previousActionHistory",
    "workflow",
    "question",
    "answer",
    "cot",
)


def _combine_text_columns(examples):
    """Join the text columns of a batch into one space-separated string per example."""
    columns = [examples.get(name) for name in _TEXT_COLUMNS]
    present = [column for column in columns if column is not None]
    batch_size = len(present[0]) if present else 0

    combined = []
    for row in range(batch_size):
        parts = []
        for column in columns:
            # A missing column or a None cell contributes an empty string.
            value = None if column is None else column[row]
            parts.append("" if value is None else str(value))
        combined.append(" ".join(parts))
    return combined


# Function to process the text features and tokenize
def tokenize_function(examples, max_length=512):
    """Tokenize one batch of dataset rows.

    Intended for ``dataset.map(..., batched=True)``, where ``examples`` maps each
    column name to a list of values (one entry per row).

    Args:
        examples: batch mapping of column name -> list of values.
        max_length: maximum number of tokens kept per example.

    Returns:
        The tokenizer output for the combined text of each row, as plain Python
        lists (no tensor conversion, so rows may have different lengths). If the
        batch has an "image" column, the processed images are added under "image".
    """
    combined_text = _combine_text_columns(examples)

    # Truncate from the left so that the end of the combined text (answer and
    # chain of thought) survives when an example is longer than max_length.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        combined_text,
        truncation=True,
        max_length=max_length,
    )

    # Process the image column separately.
    if "image" in examples:
        image = examples["image"]
        # NOTE(review): img_processor is reached through the model's vision embedding
        # module; confirm it accepts the raw dataset images — the checkpoint's own
        # processor may be the intended preprocessing entry point.
        image_features = model.model.vision_embed_tokens.img_processor(image)
        tokenized_inputs['image'] = image_features

    return tokenized_inputs

In [None]:
# tokenize the dataset in batches; the dataset shown above only has a 'train'
# split, so there is no validation split to tokenize
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

In [None]:
# create data collator
# The model is a causal LM, so every batch must carry `labels` for the Trainer to
# get a loss. With mlm=False this collator pads the batch and derives the labels
# from input_ids, ignoring padding positions in the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# evaluation

In [None]:
# load the accuracy metric from the evaluate library (used by compute_metrics)
accuracy = evaluate.load("accuracy")

In [None]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    """Compute accuracy from a (predictions, labels) pair.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` holds per-class scores
           with the classes along axis 1, ``labels`` holds the reference class ids.

    Returns:
        The flat metric dictionary produced by ``accuracy.compute`` (metric name
        -> value), rather than that dictionary nested under another "accuracy" key.
    """
    predictions, labels = p
    # pick the highest-scoring class for every example
    predictions = np.argmax(predictions, axis=1)

    return accuracy.compute(predictions=predictions, references=labels)

# Apply untrained model to text

In [None]:
# define list of examples
text_list = ["a feel-good picture in the best sense of the term .", "resourceful and ingenious entertainment .", "it 's just incredibly dull .", "the movie 's biggest offense is its complete and utter lack of tension .",
             "impresses you with its open-endedness and surprises .", "unless you are in dire need of a diesel fix , there is no real reason to see it ."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text and place it on the same device as the model
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # the model is a causal LM, so there are no class logits to map through a
    # label table; generate a short greedy continuation instead
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
    # drop the echoed prompt tokens and decode only the newly generated ones
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True)

    print(text + " - " + completion)

# Train model

In [None]:
# LoRA configuration.
# task_type is CAUSAL_LM because the model is loaded with AutoModelForCausalLM,
# not as a sequence classifier.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,                          # rank of the LoRA update matrices
    lora_alpha=32,                # LoRA scaling factor
    lora_dropout=0.01,            # dropout applied inside the LoRA layers
    target_modules=["qkv_proj"],  # names of the modules that receive adapters
)

In [None]:
# show the LoRA configuration
peft_config

In [None]:
# wrap the model according to peft_config and report the trainable parameters
# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
# hyperparameters (consumed by TrainingArguments below)
lr = 1e-3  # learning rate
batch_size = 16  # per-device batch size, used for both training and evaluation
num_epochs = 5  # passed as num_train_epochs

In [None]:
# define training arguments
# Evaluation is left at its default (disabled), which avoids passing the deprecated
# `evaluation_strategy` keyword. `load_best_model_at_end` is not set: with no
# evaluation and no checkpoints saved there is no "best" model to reload.
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    save_strategy="no",  # do not write checkpoints during training
    report_to="none",    # no experiment-tracker integration
    max_steps=32,        # stop after 32 training steps
)

In [None]:
# create trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # NOTE(review): no eval_dataset is passed here, so compute_metrics is
    # presumably never called during this run — confirm whether it is still needed
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

# Generate prediction

In [None]:
model.to('cpu')
# switch to inference mode so dropout (including LoRA dropout) is disabled
model.eval()

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    # tokenize text and place it on the same device as the model
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    # the model is a causal LM, so there are no class logits to map through a
    # label table; generate a short greedy continuation instead
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False)
    # drop the echoed prompt tokens and decode only the newly generated ones
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True)

    print(text + " - " + completion)