## Setup

We will need  libraries such as `hugging_face`, `transformers`,`accelerate`, `peft`, `datasets` and `TRL` to use the `SFTTrainer`. We will use `bitsandbytes` to [quantize the base model into 4bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes).We will also install einops as it is a requirement to load the models.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install -q huggingface_hub
!pip install -q -U trl transformers accelerate peft
!pip install -q -U datasets bitsandbytes einops wandb
# Uncomment to install new features that support latest models like Llama 2
!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m3.4

## Dataset

In [3]:
# # When prompted, paste the HF access token you created earlier.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset
dataset_name = "Tngarg/tamil_english"
dataset = load_dataset(dataset_name)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.50M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/871k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/867k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
dataset['train'][1]

{'Unnamed: 0': 30702,
 'tweet': 'Movie Miru Mari irukum....pathala therithu🤣 manga mandingilla🤣🤣',
 'sentiment': 'negative',
 'sentiment_en': 0}

In [37]:
# Make Prompts
from datasets import Dataset
dataset = dataset.map(lambda example:{'prompt' : str('Input : '+ example['tweet']+ ' \n\n### sentiment : '+ example['sentiment'])})

Map:   0%|          | 0/29344 [00:00<?, ? examples/s]

Map:   0%|          | 0/7337 [00:00<?, ? examples/s]

Map:   0%|          | 0/7337 [00:00<?, ? examples/s]

In [38]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'tweet', 'sentiment', 'sentiment_en', 'prompt'],
        num_rows: 29344
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'tweet', 'sentiment', 'sentiment_en', 'prompt'],
        num_rows: 7337
    })
    test: Dataset({
        features: ['Unnamed: 0', 'tweet', 'sentiment', 'sentiment_en', 'prompt'],
        num_rows: 7337
    })
})

In [39]:
dataset['train'][1]

{'Unnamed: 0': 30702,
 'tweet': 'Movie Miru Mari irukum....pathala therithu🤣 manga mandingilla🤣🤣',
 'sentiment': 'negative',
 'sentiment_en': 0,
 'prompt': 'Input : Movie Miru Mari irukum....pathala therithu🤣 manga mandingilla🤣🤣 \n\n### sentiment : negative'}

## Loading the Model and tokenizer

In [123]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, AutoModel

model_name = "meta-llama/Llama-2-7b-hf"

In [124]:
# function to load model and tokenizer
def load_model(model_name, bnb_config):
    n_gpus = torch.cuda.device_count()
    max_memory = f'{40960}MB'

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
    #     device_map = {"": 0}, # dispatch efficiently the model on the available ressources
    #     max_memory = {i: max_memory for i in range(n_gpus)},
    )

    # model.config.use_cache = False

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [125]:
def create_bnb_config():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    return bnb_config

In [127]:
bnb_config = create_bnb_config()
model, tokenizer = load_model(model_name, bnb_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



## Training : Configuring PEFT, Lora, Trainer

In [128]:
from peft import LoraConfig

def create_peft_config(modules):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    """
    config = LoraConfig(
        r=16,  # dimension of the updated matrices
        lora_alpha=64,  # parameter for scaling
        target_modules=modules,
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )

    return config

# peft_config = create_peft_config()

In [141]:
from transformers import TrainingArguments

output_dir = "outputs"
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
optim = "paged_adamw_8bit"
save_steps = 1
num_train_epochs = 4
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 20
warmup_ratio = 0.03
warmup_steps=2,
lr_scheduler_type = "linear"


In [142]:
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    warmup_steps=2,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    # use_reentrant=True
)

In [143]:
import bitsandbytes as bnb
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [158]:
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
# 2 - Using the prepare_model_for_kbit_training method from PEFT
model = prepare_model_for_kbit_training(model)
# Get lora module names
modules = find_all_linear_names(model)
print(modules)
# Create PEFT config for these modules and wrap the model to PEFT
peft_config = create_peft_config(modules)


['base_layer']


In [145]:
model = get_peft_model(model, peft_config)

In [146]:
from trl import SFTTrainer

In [147]:
max_seq_length = 2048

dtypes = {}
for _, p in model.named_parameters():
    dtype = p.dtype
    if dtype not in dtypes: dtypes[dtype] = 0
    dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items(): total+= v
for k, v in dtypes.items():
    print(k, v, v/total)

do_train = True

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    peft_config=peft_config,
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)


torch.float32 342364160 0.09562264833036462
torch.uint8 3238002688 0.9043773516696354


Map:   0%|          | 0/29344 [00:00<?, ? examples/s]

In [148]:
# for name, module in trainer.model.named_modules():
#     if "norm" in name:
#         module = module.to(torch.float32)

In [149]:
do_train = True

# Launch training
print("Training...")

if do_train:
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)


Training...




Step,Training Loss
1,1.3955
2,1.2661
3,1.133
4,0.9057
5,1.579
6,1.6458
7,0.9673
8,3.0278
9,2.0933
10,1.9527




***** train metrics *****
  epoch                    =        0.0
  total_flos               =   255075GF
  train_loss               =     2.4292
  train_runtime            = 0:06:19.09
  train_samples_per_second =      0.211
  train_steps_per_second   =      0.053
{'train_runtime': 379.0935, 'train_samples_per_second': 0.211, 'train_steps_per_second': 0.053, 'total_flos': 273884957884416.0, 'train_loss': 2.4291872382164, 'epoch': 0.0}


## Testing

In [152]:
dataset['train']['prompt'][6]

'Input : Maarana Maass Petta thalaivaa thalaivaa thalaivaa... \n\n### sentiment : positive'

In [153]:
device = "cuda:0"
text = dataset['train']['tweet'][6]
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



Maarana Maass Petta thalaivaa thalaivaa thalaivaa... 

### sentiment : positive

### tag : positive

thalaivaa thalaivaa thalaivaa... 

### sentiment : positive

### tag : positive

thalaiva


In [None]:
def output_function(sample):
    inputs = tokenizer(sample,return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=50 )
    # outputs = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=50 )
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

import pandas as pd
import swifter
df = pd.DataFrame(dataset['test'])
df = df.head(300)
df['output'] = df['tweet'].swifter.apply(output_function)
df.to_csv('outut.csv')

## Saving the Model

In [151]:
import os
output_dir = "/content/drive/MyDrive/Slovenia/Final Datasets/Lamma_results/tamil"
# Saving model
print("Saving last checkpoint of the model...")
os.makedirs(output_dir, exist_ok=True)
trainer.model.save_pretrained(output_dir)

Saving last checkpoint of the model...


In [120]:
# from huggingface_hub import login
# login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [139]:
# model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
# model_to_save.save_pretrained("outputs")
# lora_config = LoraConfig.from_pretrained('outputs')
# model.push_to_hub("Tngarg/lamma2_tamil_english",create_pr=1)

In [159]:
# model_dir = "/content/drive/MyDrive/Slovenia/Final Datasets/Lamma_results/tamil"
# from peft import PeftModel, PeftConfig
# from transformers import AutoModel

# config = PeftConfig.from_pretrained("/content/drive/MyDrive/Slovenia/Final Datasets/Lamma_results/tamil")
# model = AutoModel.from_pretrained("/content/drive/MyDrive/Slovenia/Final Datasets/Lamma_results/tamil")
# model = PeftModel.from_pretrained(model, "/content/drive/MyDrive/Slovenia/Final Datasets/Lamma_results/tamil")