In [1]:
!pip install -q datasets --progress-bar off

In [2]:
import pandas as pd
import pickle

In [3]:
# Name of this run; reused below to build the output directory and the log file path.
experiment_name = "zephyr-7b-instruct-4bit-qlora-fine-tuning"

In [4]:
from google.colab import drive
from IPython.display import Image, display

# Google Drive locations used by the notebook; everything hangs off the mount point.
mount_point = "/content/gdrive"
base_path = f"{mount_point}/MyDrive/Colab/SDG/data"

# TEST / DEV / TRAIN spreadsheets of the SDG classification dataset.
input_data_path = f"{base_path}/input/sdg_17_labels_classification_dataset_1020_texts_TEST_2023.12.11.xlsx"
input_data_path_validation = f"{base_path}/input/sdg_17_labels_classification_dataset_4760_texts_DEV_2023.12.11.xlsx"
input_data_path_train = f"{base_path}/input/sdg_17_labels_classification_dataset_4760_texts_TRAIN_2023.12.11.xlsx"

# Disabled alternatives kept for reference:
# top7_train_data_sim_to_each_test_data_filename = base_path + "/input/top7_train_data_sim_to_each_test_data_2023.12.11.pickle"
# top7_train_data_sim_to_each_test_data_filename = base_path + "/input/top7_train_data_sim_to_each_test_data_but_with_different_classes_with_2023.12.11_input_data.pickle"

# Per-experiment output directory and log file, both derived from experiment_name.
output_data_path = f"{base_path}/../Mistral/data/output/{experiment_name}"
log_filename = f"{base_path}/../Mistral/logs/{experiment_name}.log"

# Mount (or re-mount) Drive so the paths above resolve.
drive.mount(mount_point, force_remount=True)

Mounted at /content/gdrive


In [5]:
%%capture
# Upgrade the quantization / fine-tuning libraries; %%capture hides the installer output.
# NOTE(review): `-U` without version pins pulls whatever is latest at run time, so the
# keyword arguments passed to SFTTrainer / TrainingArguments further down may not be
# accepted by newer releases — consider pinning the versions this notebook was run with.
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl

In [6]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch #, wandb
from datasets import load_dataset
from trl import SFTTrainer


In [7]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
# secret_wandb = user_secrets.get_secret("wandb")

In [8]:
# !huggingface-cli login --token $secret_hf

In [9]:
# # Monitoring the LLM
# wandb.login(key = secret_wandb)
# run = wandb.init(
#     project='Fine tuning Zephyr 7B',
#     job_type="training",
#     anonymous="allow"
# )

In [10]:
# Identifier of the pretrained model that is loaded and fine-tuned below.
base_model = "HuggingFaceH4/zephyr-7b-beta"
# dataset_name = "THUDM/AgentInstruct"
# Name under which the fine-tuned model is saved after training.
new_model = "zephyr-7b-beta-SDG-classification-finetuned"

In [11]:
# Load the TRAIN split of the SDG dataset from the Excel file on Drive and preview it.
data = pd.read_excel(input_data_path_train)
data.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text_id,doi,text,sdg,labels_negative,labels_positive,agreement,text_length
0,0,2065,2944,,This Vitamin Reduces Mental Health Problems By...,0,,,,9
1,1,1325,1911,,'League Of Legends' unveils new Arena game mod...,0,,,,66
2,2,409,581,,Community remembers Maddi Kingsbury at public ...,0,,,,56


In [12]:
def apply_fine_tuning_template(data):
  """Render one training example as a Zephyr-style chat prompt.

  Args:
    data: a row-like object exposing ``text`` (the document to classify)
      and ``sdg`` (its gold label number) as attributes.

  Returns:
    A single string made of a system turn, a user turn (instructions, the
    label dictionary and the input text wrapped in triple single quotes)
    and an assistant turn holding the expected answer ``"SDG-<label>"``,
    followed by a trailing newline.
  """
  input_text, gold_label = data.text, data.sdg

  system_turn = (
      "<|system|>\n"
      "You are an helpful virtual assistant specialized in a multi-class single-label text classification task related to the domain of the Sustainable Development Goals.</s>\n"
  )

  # One dictionary entry per line, in the order the prompt presents them.
  label_lines = (
      '"SDG-1": "End poverty in all its forms everywhere."',
      '"SDG-2": "End hunger, achieve food security and improved nutrition and promote sustainable agriculture."',
      '"SDG-3": "Ensure healthy lives and promote well-being for all at all ages."',
      '"SDG-4": "Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all."',
      '"SDG-5": "Achieve gender equality and empower all women and girls."',
      '"SDG-6": "Ensure availability and sustainable management of water and sanitation for all."',
      '"SDG-7": "Ensure access to affordable, reliable, sustainable and modern energy for all."',
      '"SDG-8": "Promote sustained, inclusive and sustainable economic growth, full and productive employment and decent work for all."',
      '"SDG-9": "Build resilient infrastructure, promote inclusive and sustainable industrialization and foster innovation."',
      '"SDG-10": "Reduce inequality within and among countries."',
      '"SDG-11": "Make cities and human settlements inclusive, safe, resilient and sustainable."',
      '"SDG-12": "Ensure sustainable consumption and production patterns."',
      '"SDG-13": "Take urgent action to combat climate change and its impacts."',
      '"SDG-14": "Conserve and sustainably use the oceans, seas and marine resources for sustainable development."',
      '"SDG-15": "Protect, restore and promote sustainable use of terrestrial ecosystems, sustainably manage forests, combat desertification, halt and reverse land degradation, and halt biodiversity loss."',
      '"SDG-16": "Promote peaceful and inclusive societies for sustainable development, provide access to justice for all and build effective, accountable and inclusive institutions at all levels."',
      '"SDG-0": "Other."',
  )

  user_turn = (
      "<|user|>\n"
      "Classify the following input text within triple quotes according to the following Sustainable Development Goals (SDGs) dictionary of labels:\n\n"
      + "\n".join(label_lines)
      + "\n\nChoose ONLY ONE label for each input text.\n\n"
      + "DO NOT include the input text in your answer.\n\n"
      + "The input text is:\n'''\n"
      + f"{input_text}"
      + "\n'''</s>\n"
  )

  # The assistant turn carries the target the model is trained to produce.
  assistant_turn = f'<|assistant|>\n"SDG-{gold_label}"\n'

  return system_turn + user_turn + assistant_turn


In [13]:
# Render one full training prompt per row (axis=1 hands each row to the template function).
data['fine_tuning_prompt'] = data.apply(apply_fine_tuning_template, axis=1)

In [14]:
# Preview the first rows again to check that the fine_tuning_prompt column was added.
data.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text_id,doi,text,sdg,labels_negative,labels_positive,agreement,text_length,fine_tuning_prompt
0,0,2065,2944,,This Vitamin Reduces Mental Health Problems By...,0,,,,9,<|system|>\nYou are an helpful virtual assista...
1,1,1325,1911,,'League Of Legends' unveils new Arena game mod...,0,,,,66,<|system|>\nYou are an helpful virtual assista...
2,2,409,581,,Community remembers Maddi Kingsbury at public ...,0,,,,56,<|system|>\nYou are an helpful virtual assista...


In [15]:
# Show the complete rendered prompt of the first row as a sanity check of the template.
data.iloc[0]['fine_tuning_prompt']

'<|system|>\nYou are an helpful virtual assistant specialized in a multi-class single-label text classification task related to the domain of the Sustainable Development Goals.</s>\n<|user|>\nClassify the following input text within triple quotes according to the following Sustainable Development Goals (SDGs) dictionary of labels:\n\n"SDG-1": "End poverty in all its forms everywhere."\n"SDG-2": "End hunger, achieve food security and improved nutrition and promote sustainable agriculture."\n"SDG-3": "Ensure healthy lives and promote well-being for all at all ages."\n"SDG-4": "Ensure inclusive and equitable quality education and promote lifelong learning opportunities for all."\n"SDG-5": "Achieve gender equality and empower all women and girls."\n"SDG-6": "Ensure availability and sustainable management of water and sanitation for all."\n"SDG-7": "Ensure access to affordable, reliable, sustainable and modern energy for all."\n"SDG-8": "Promote sustained, inclusive and sustainable economic

In [16]:
from datasets import Dataset

# Function to convert the DataFrame into a format compatible with load_dataset
def convert_to_dataset(df):
    """Build a ``Dataset`` whose columns mirror those of ``df``.

    Each DataFrame column is turned into a plain Python list and the
    resulting column-name -> list mapping is passed to ``Dataset.from_dict``.
    """
    columns_as_lists = {name: df[name].tolist() for name in df.columns}
    return Dataset.from_dict(columns_as_lists)

In [17]:
# Keep only the prompt column: it is the text field the trainer is pointed at below.
dataset = convert_to_dataset(data[['fine_tuning_prompt']])

In [18]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['fine_tuning_prompt'],
    num_rows: 4760
})

In [19]:
#Importing the dataset
# dataset = load_dataset("THUDM/AgentInstruct", split="train")

In [20]:
# def format_prompt(sample):
#     intro = "Below is a conversation between a user and you."
#     end = "Instruction: Write a response appropriate to the conversation."

#     try:
#         formatted_conversations = "\n".join(
#             f"<{resp['from']}>: {resp['value']}"
#             for resp in sample["conversations"]
#         )

#         sample["text"] = f"{intro}\n\n{formatted_conversations}\n\n{end}"
#     except (TypeError, KeyError):
#         raise ValueError("Invalid format of the input sample.")
#     return sample

In [21]:
# dataset = dataset.map(
#     format_prompt,
#     remove_columns=["conversations"]
# )
# dataset["text"][100]

In [22]:
# Load the base model (Zephyr-7B) quantized to 4 bits (NF4) through bitsandbytes,
# with bfloat16 as the compute dtype and double quantization turned off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        #load_in_4bit=True,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # NOTE(review): trust_remote_code lets code shipped in the model repo run locally;
        # presumably not needed for this checkpoint — confirm and drop if so.
        trust_remote_code=True,
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): presumably makes the tokenizer append EOS to every encoded example — confirm
# for this tokenizer class, since the prompt template itself has no </s> after the answer.
tokenizer.add_eos_token = True
# Display the two flags to verify the tokenizer configuration.
tokenizer.add_bos_token, tokenizer.add_eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

(True, True)

In [23]:
# Prepare the quantized model for training, then attach LoRA adapters to it.
model = prepare_model_for_kbit_training(model)
# LoRA hyperparameters: rank 64, alpha 16, dropout 0.1, no bias terms trained.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    # NOTE(review): 'base_layer' looks like an attribute name of PEFT's own wrapper rather than
    # a module of the base model, so it probably matches nothing here — confirm, and check
    # whether other projection layers were meant to be targeted as well.
    target_modules=['up_proj', 'base_layer', 'down_proj']
)
model = get_peft_model(model, peft_config)

In [24]:
# Hyperparameters
# NOTE(review): output_dir is a path relative to the working directory, not the Drive
# output_data_path defined earlier — confirm checkpoints are meant to stay local.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=14, #4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    # NOTE(review): both mixed-precision flags are off although the model is loaded with a
    # bfloat16 compute dtype — confirm this is intentional for the target GPU.
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    # NOTE(review): a warmup ratio is set together with the plain "constant" scheduler;
    # warmup may have no effect unless "constant_with_warmup" is used — confirm.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    #report_to="wandb"
)


In [25]:
# Setting sft parameters
# The trainer reads the text to train on from the "fine_tuning_prompt" column of the
# dataset and does not pack several examples into one sequence (packing=False).
# NOTE(review): `model` was already wrapped with get_peft_model in an earlier cell and
# peft_config is passed here again — confirm the trainer does not expect an unwrapped model.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= 2048,
    dataset_text_field="fine_tuning_prompt",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

Map:   0%|          | 0/4760 [00:00<?, ? examples/s]

In [26]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()



Step,Training Loss


KeyboardInterrupt: 

In [None]:
# Save the fine-tuned model
# NOTE(review): new_model is a relative path, so this writes to the current working
# directory; output_data_path on Drive is defined earlier but not used here — confirm.
trainer.model.save_pretrained(new_model)
# wandb.finish()
# Turn use_cache back on (it was disabled before training) and switch the model to eval mode.
model.config.use_cache = True
model.eval()

In [None]:
# trainer.model.push_to_hub(new_model, use_temp_dir=False)

In [None]:
# logging.set_verbosity(logging.CRITICAL)

# prompt = "How to use Python online with DataCamp?"
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# result = pipe(prompt)
# print(result[0]['generated_text'])

In [None]:
# prompt = "What is Datacamp Career track?"
# result = pipe(prompt)
# print(result[0]['generated_text'])