In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy #ipywidgets
!pip install wandb -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━

In [1]:
import os
from datasets import load_dataset
import locale

from datetime import datetime

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from peft import LoraConfig, get_peft_model

locale.getpreferredencoding = lambda: "UTF-8"



def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""Given a target passaage or sentence find the topics and the topics are different words from the passage or sentence itself.

                  ### Target sentence:
                  {data_point["feedbacks"]}


                  ### Topics of the sentence:
                  {data_point["topics"]}
                  """
    return tokenize(full_prompt)

    
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )




fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

'/content'

In [2]:
train_dataset = load_dataset('csv', data_files = '/content/drive/MyDrive/mistral/data/laptope_mistral_train.csv')
train_dataset = train_dataset['train']
test_dataset = load_dataset('csv', data_files = '/content/drive/MyDrive/mistral/data/laptope_mistral_test.csv')
test_dataset = test_dataset['train']

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['feedbacks', 'topics'],
    num_rows: 2459
})
Dataset({
    features: ['feedbacks', 'topics'],
    num_rows: 433
})


In [4]:
base_model_id = "mistralai/Mistral-7B-v0.1"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token

def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [7]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = test_dataset.map(generate_and_tokenize_prompt)

In [8]:
print(tokenized_train_dataset)

Dataset({
    features: ['feedbacks', 'topics', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2459
})

In [9]:
eval_prompt = """Given a target passaage or sentence find the topics and the topics are different words from the passage or sentence itself.


          ### Target sentence:
          Keyboard is working fine but mouse is not good.


          ### Topics of the sentence:
          """

In [11]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [12]:
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


In [13]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True
else:
  print('1 GPU')

1 GPU


In [18]:
project = "topic-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "/content/drive/MyDrive/mistral/" + run_name

tokenizer.pad_token = tokenizer.eos_token


trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=1000,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=50,
        # bf16=True,
        fp16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=50,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=50,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        # report_to="wandb",           # Comment this out if you don't want to use weights & baises
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33msupratimmannaml2[0m ([33msupratim[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
50,1.473,0.843431
100,0.8013,0.775326
150,0.7796,0.721911




KeyboardInterrupt: 

In [27]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    # use_auth_token=True,
    token='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'
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Topic Prediction

In [83]:
n=69
fdbck = tokenized_val_dataset['feedbacks'][n]
topic = tokenized_val_dataset['topics'][n]
print(fdbck)
print(topic)

Battery is lasting about 6 hours as I am surfing the web on Sundays while checking football scores and watching funny Youtube videos .
Battery, surfing the web


In [84]:
eval_prompt = """Given a target passaage or sentence find the topics and the topics are different words from the passage or sentence itself.


          ### Target sentence:
          Battery is lasting about 6 hours as I am surfing the web on Sundays while checking football scores and watching funny Youtube videos .


          ### Topics of the sentence:
          """

eval_prompt

'Given a target passaage or sentence find the topics and the topics are different words from the passage or sentence itself.\n\n\n          ### Target sentence:\n          Battery is lasting about 6 hours as I am surfing the web on Sundays while checking football scores and watching funny Youtube videos .\n\n\n          ### Topics of the sentence:\n          '

In [68]:
from peft import PeftModel
ft_model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/mistral/mistral-topic-finetune/checkpoint-150")

ft_model.eval()


model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
    print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=15, pad_token_id=2)[0], skip_special_tokens=True))

Given a target passaage or sentence find the topics and the topics are different words from the passage or sentence itself.


          ### Target sentence:
          I never had this kind of quality issue with Dell ( not to say Dell is that great ) , not with a brand new laptop .


          ### Topics of the sentence:
          1. Dell

