In [1]:
import os

os.environ['HF_HOME'] = "./hf/"

os.environ['WANDB_DISABLED'] = 'true'

os.environ['CUDA_VISIBLE_DEVICES'] = '0'



In [2]:
import torch
from transformers import MBartForConditionalGeneration, MBartTokenizer
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import BitsAndBytesConfig
import numpy as np
# import peft and Lora
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training




In [3]:

config = LoraConfig(
    r=20, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)


compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )



In [4]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBartTokenizer.from_pretrained(model_name , use_fast = True)
model = MBartForConditionalGeneration.from_pretrained(model_name)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


In [5]:
# Preparing the Model for QLoRA
model = prepare_model_for_kbit_training(model)

In [6]:
# peft model
model = get_peft_model(model, config)

In [7]:
from datasets import load_dataset

dataset = load_dataset("cfilt/iitb-english-hindi")

In [8]:
from datasets import Dataset
def generate_dataset(dataset , split):
    filtered_dataset = dataset[split]['translation']
    english_dataset = [data['en'] for data in filtered_dataset]
    hindi_dataset = [data['hi'] for data in filtered_dataset]
    dataset_size = min(30000 , len(english_dataset))


    print("Total Dataset length : " , len(english_dataset))
    print("Trimmed length :" , dataset_size)


    english_dataset = english_dataset[:dataset_size]
    hindi_dataset = hindi_dataset[:dataset_size]
    data_dictionary = {
        "english" : english_dataset,
        "hindi" : hindi_dataset
    }
    return Dataset.from_dict(data_dictionary)

In [9]:
train_dataset = generate_dataset(dataset, "train")
train_dataset

Total Dataset length :  1659083
Trimmed length : 30000


Dataset({
    features: ['english', 'hindi'],
    num_rows: 30000
})

In [10]:
test_dataset = generate_dataset(dataset , "test")
test_dataset

Total Dataset length :  2507
Trimmed length : 2507


Dataset({
    features: ['english', 'hindi'],
    num_rows: 2507
})

In [11]:
validation_dataset = generate_dataset(dataset , "validation")
validation_dataset

Total Dataset length :  520
Trimmed length : 520


Dataset({
    features: ['english', 'hindi'],
    num_rows: 520
})

In [12]:
# def tokenize_example(example , lang):
#     return tokenizer(example[lang], truncation=True)

# def tokenize_dataset(example):
#     english_tokens = tokenize_example(example, "english")
#     # english_tokens['english_tokens'] = english_tokens['input_ids']
#     english_tokens['english_attention_mask'] = english_tokens['attention_mask']
#     hindi_tokens = tokenize_example(example , "hindi")
#     english_tokens['labels'] = hindi_tokens['input_ids']
#     english_tokens['hindi_attention_mask'] = hindi_tokens['attention_mask']
#     return english_tokens



def tokenize_dataset_new(example):
    model_inputs = tokenizer(example["hindi"], max_length=512, truncation=True)
    labels = tokenizer(example["english"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [13]:
train_tokenised_dataset = train_dataset.map(tokenize_dataset_new , batched=True , num_proc=5)
train_tokenised_dataset = train_tokenised_dataset.remove_columns(['english' , 'hindi'])
train_tokenised_dataset

Map (num_proc=5):   0%|          | 0/30000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 30000
})

In [14]:
test_tokenised_dataset = test_dataset.map(tokenize_dataset_new , batched=True , num_proc=5)
test_tokenised_dataset = test_tokenised_dataset.remove_columns(['english' , 'hindi' ])
test_tokenised_dataset

Map (num_proc=5):   0%|          | 0/2507 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2507
})

In [15]:
validation_tokenised_dataset = validation_dataset.map(tokenize_dataset_new , batched=True , num_proc=5)
validation_tokenised_dataset = validation_tokenised_dataset.remove_columns(['english' , 'hindi'])
validation_tokenised_dataset

Map (num_proc=5):   0%|          | 0/520 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 520
})

In [16]:
from transformers import DataCollatorForSeq2Seq
data_collector = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [19]:
# from torch.utils.data import DataLoader

train_batch_size = 30
test_batch_size = 15
validation_batch_size = 10
# train_dataloader = DataLoader(train_tokenised_dataset , shuffle=True,
#                                 batch_size = train_batch_size,
#                                 collate_fn = data_collector
#                                 )

# test_dataloader = DataLoader(test_tokenised_dataset , shuffle=True,
#                                 batch_size = test_batch_size,
#                                 collate_fn = data_collector
#                                 )


# validation_dataloader = DataLoader(validation_tokenised_dataset , shuffle=True,
#                                 batch_size = validation_batch_size,
#                                 collate_fn = data_collector
#                                 )

# train_dataloader,test_dataloader, validation_dataloader

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [21]:

training_args = Seq2SeqTrainingArguments(
    # training
    num_train_epochs=5,
    per_device_train_batch_size=train_batch_size,
    # per device training batch size is used to train the model on the given batch size

    # evaluation
    per_device_eval_batch_size=validation_batch_size,
    # per device evaluation batch size is used to evaluate the model on the given batch size
    # gradient_accumulation_steps=8,
    # gradient accumulation steps is used to accumulate the gradients over the given number of steps
    # this helps in reducing the memory usage during training
    # eval_accumulation_steps=10,
    # eval accumulation steps is used to accumulate the evaluation results over the given number of steps
    # this helps in reducing the memory usage during evaluation
    evaluation_strategy="steps",
    # if the evaluation strategy is steps, then the evaluation will be done every eval_steps
    # else if it is epoch, then the evaluation will be done every epoch and eval accumulation steps will be ignored
    eval_steps=500,


    # checkpointing


    # logging
    logging_dir="./logs",
    logging_steps=10,

    # misc
    warmup_steps=500,
    # warmup steps is used to warmup the learning rate over the given number of steps
    # this helps in reducing the impact of the randomness in the initial learning rate
    # this is very useful when the learning rate is very high
    # this is also useful when the model is very large
    output_dir="./output",
    save_steps=500,
    # save steps is used to save the model over the given number of steps
    # this is useful when the model is very large
    save_strategy="steps",
    # save strategy is used to save the model every epoch
    # if the save strategy is steps, then the model will be saved every save_steps
    # else if it is epoch, then the model will be saved every epoch
    # and save_steps will be ignored
    save_total_limit=4,

    # save the best model
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better = False,
    # generate tensorboard logs
    report_to=None,


)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [22]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

In [23]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenised_dataset,
    eval_dataset=validation_tokenised_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks = [early_stopping]
    # compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
trainer.train()

Step,Training Loss,Validation Loss
500,1.3524,1.552265
1000,1.3337,1.540799
1500,1.0956,1.554042
2000,1.0502,1.564358
2500,0.9921,1.574079
3000,1.0181,1.576604
3500,0.8582,1.588758


TrainOutput(global_step=3500, training_loss=1.2506326999664306, metrics={'train_runtime': 746.5867, 'train_samples_per_second': 200.914, 'train_steps_per_second': 6.697, 'total_flos': 8527782800424960.0, 'train_loss': 1.2506326999664306, 'epoch': 3.5})

In [25]:
trainer.save_model("./mBART-fine-tuned-hi-en")

In [26]:
from transformers import pipeline

translator = pipeline("translation", model=model,
                    src_lang = "hi_IN" , tgt_lang= "en_XX",
                    tokenizer=tokenizer)


The model 'PeftModelForCausalLM' is not supported for translation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


In [27]:
translator("आप कैसे हैं")

[{'translation_text': 'How are you'}]