### TinyLlama Fine-Tuning

#### TinyLlama is a small model with 1.1 billion parameters, trained on 3 trillion tokens; it has the same architecture and tokenizer as Llama 2.

#### Fine-tuning TinyLlama on a custom dataset (`dair-ai/emotion`) for sentiment analysis

In [3]:
# Specific versions needs to be installed
!pip install accelerate==0.26.1 peft==0.7.1 bitsandbytes==0.42.0 transformers==4.35.2 trl==0.7.10 datasets==2.16.1 

Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl.metadata (18 kB)
Collecting peft==0.7.1
  Downloading peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Collecting bitsandbytes==0.42.0
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting transformers==4.35.2
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting trl==0.7.10
  Downloading trl-0.7.10-py3-none-any.whl.metadata (10 kB)
Collecting datasets==2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting tyro>=0.5.11 (from trl==0.7.10)
  Downloading tyro-0.8.2-py3-none-any.whl.metadata (7.9 kB)
Collecting pyarrow-hotfix (from datasets==2.16.1)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.1)
  Downloading dill-0.3.7-p

In [4]:
# imports 
from datasets import load_dataset
import torch
from peft import LoraConfig,PeftModel
from trl import SFTTrainer
import os
from transformers import AutoModelForCausalLM,AutoTokenizer,BitsAndBytesConfig,TrainingArguments,GenerationConfig

#### Dataset handling-

In [5]:
# Fetch the 'split' configuration of the dair-ai/emotion dataset and display
# the resulting DatasetDict (train / validation / test splits).
data = load_dataset(path='dair-ai/emotion', name='split')
data

Downloading data:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/127k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [6]:
# Peek at one raw training example: free text plus an integer label.
data["train"][9]

{'text': 'i feel romantic too', 'label': 2}

#### The numeric labels need to be converted to their emotion names

In [8]:
# Emotion names ordered so that the list position equals the integer label
# stored in the dataset (e.g. label 2 -> 'love').
idx2label = [
    'sadness',   # 0
    'joy',       # 1
    'love',      # 2
    'anger',     # 3
    'fear',      # 4
    'surprise',  # 5
]

#### Preprocessing the dataset
* Each LLM expects its input data in a specific prompt format.
* So here the data is converted into the chat-style format used to fine-tune TinyLlama in this notebook.

In [9]:
# Instruction placed in front of every sentence in the user turn.
PROMPT = "Identify the sentiment in the given sentence."


def process_data_llm_format(sent):
    """Rewrite one example's ``text`` as a complete user/assistant exchange.

    The user turn holds ``PROMPT`` followed by the original sentence; the
    assistant turn holds the emotion name looked up from ``idx2label`` using
    the example's integer ``label``.

    Args:
        sent: mapping with a ``'text'`` entry and an integer ``'label'`` entry.

    Returns:
        The same ``sent`` object, with ``sent['text']`` overwritten in place.
    """
    # NOTE(review): confirm this <|im_start|>/<|im_end|> markup is the chat
    # format intended for the checkpoint being fine-tuned.
    user_turn = f"<|im_start|>user\n{PROMPT} {sent['text']} <|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{idx2label[sent['label']]}<|im_end|>"
    sent['text'] = user_turn + assistant_turn
    return sent

# Apply the chat formatting to the train and validation splits (in that
# order), dropping the now-redundant integer 'label' column from each.
train_data, valid_data = (
    data[split_name].map(process_data_llm_format, remove_columns=['label'])
    for split_name in ('train', 'validation')
)

# Show one formatted example; print() so the embedded newlines render.
print(train_data[9]['text'])

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

<|im_start|>user
Identify the sentiment in the given sentence. i feel romantic too <|im_end|>
<|im_start|>assistant
love<|im_end|>


#### Setting up the tokenizer, model, bitsandbytes, and PEFT config

In [11]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Tokenizer: reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter settings (rank 8, alpha 16, 5% dropout, no bias terms trained).
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# bitsandbytes: load weights in 4-bit NF4 with double quantization and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
)

# Load the base model with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
model.config.use_cache = False
model.config.pretraining_tp = 1



tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#### Setting up trainer using SFTTrainer and Peft config

In [13]:
# Optimisation settings. With a per-device batch of 16 and 4 accumulation
# steps, each optimizer step covers 16 * 4 = 64 examples per device.
# NOTE(review): `report_to` is not set; the recorded run below prompted for a
# W&B API key when training started -- confirm that is intended.
training_arguments = TrainingArguments(
    output_dir="./logs",
    max_steps=750,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    optim="paged_adamw_32bit",
    fp16=True,
    logging_steps=250,
    save_strategy="epoch",
)

# Supervised fine-tuning on the 'text' column, one example per sequence
# (no packing), with the LoRA config applied to the quantized model.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    peft_config=peft_config,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=valid_data,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
)


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [14]:
# Directory the trained adapter is written to; it is read back from here when
# the adapter is merged into the base model later in the notebook.
finetuned_model_id="./tinyllama-finetuned"
# Run fine-tuning with the trainer configured above.
trainer.train()
# Persist the trained model held by the trainer to `finetuned_model_id`.
trainer.model.save_pretrained(finetuned_model_id)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
250,1.2372
500,1.0762
750,1.0676


#### Merge the LoRA adapter with the base model

In [15]:
# Reload the base checkpoint in float16 (not quantized) so the adapter
# weights can be merged into it.
# NOTE(review): `load_in_8bit=False` and `trust_remote_code=True` look
# unnecessary for this checkpoint -- confirm before removing.
pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=False,
    trust_remote_code=True,
)

# Attach the adapter saved at `finetuned_model_id` to the base model.
# NOTE(review): `from_transformers` does not appear to be a documented
# PeftModel.from_pretrained argument -- verify it has any effect.
peft_model = PeftModel.from_pretrained(
    pretrained_model,
    finetuned_model_id,
    device_map="auto",
    from_transformers=True,
)

# Merge the adapter into the base model; `model` now refers to the merged
# model and no longer to the 4-bit model used for training.
model = peft_model.merge_and_unload()

#### Inference with the merged model

In [16]:
# Decoding settings shared by every generate_response() call: sampling with a
# small top-k, a repetition penalty, and at most 32 new tokens per reply.
# NOTE(review): `penalty_alpha` is a contrastive-search setting -- confirm it
# has any effect when `do_sample=True`.
generation_config = GenerationConfig(
    do_sample=True,
    top_k=5,
    temperature=0.5,
    penalty_alpha=0.6,
    repetition_penalty=1.2,
    max_new_tokens=32,
    pad_token_id=tokenizer.eos_token_id,
)
def generate_response(prompt):
    """Generate a completion for ``prompt`` and cut it after the reply's end marker.

    Uses the notebook-level ``tokenizer``, ``model`` and ``generation_config``.

    Args:
        prompt: full prompt text, ending where the assistant reply should begin.

    Returns:
        The decoded text (prompt plus reply) up to and including the first
        ``<|im_end|>`` found after the prompt. If the model produced no such
        marker (for example because it hit ``max_new_tokens`` first), the
        whole decoded text is returned instead of raising ``ValueError``.
    """
    end_marker = '<|im_end|>'

    # Move the inputs to the device the model lives on instead of assuming
    # a hard-coded 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, generation_config=generation_config)
    generated_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Start searching after the prompt so the marker that closes the user
    # turn inside the prompt itself is not matched.
    marker_pos = generated_response.find(end_marker, len(prompt))
    if marker_pos == -1:
        # No end marker was generated: return everything rather than crash.
        return generated_response
    return generated_response[:marker_pos + len(end_marker)]

In [17]:
def prepare_prompt_chatml_format(sample):
    """Add a ``'prompt'`` entry holding the inference prompt for one example.

    The prompt is the user turn followed by an opened assistant turn, so the
    model's continuation is the predicted emotion. The spacing (one space
    after ``PROMPT`` and one before ``<|im_end|>``) deliberately mirrors the
    layout used for the fine-tuning examples, so the model sees the same
    prefix at inference time as it did during training.

    Args:
        sample: mapping with a ``'text'`` entry containing the sentence.

    Returns:
        The same ``sample`` object with ``sample['prompt']`` set.
    """
    sample['prompt']=f"<|im_start|>user\n{PROMPT} {sample['text']} <|im_end|>\n<|im_start|>assistant\n"
    return sample
# Build an inference prompt for every example in the held-out test split.
test_data = data["test"].map(function=prepare_prompt_chatml_format)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [21]:
# Try the merged model on the first test prompt; print() so the newlines in
# the chat markup render as separate lines.
sample_prompt = test_data[0]["prompt"]
sample_response = generate_response(sample_prompt)
print(sample_response)

<|im_start|>user
Identify the sentiment in the given sentence.im feeling rather rotten so im not very ambitious right now<|im_end|>
<|im_start|>assistant
sadness<|im_end|>
