<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Mistral-Fine-Tuning-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning Mistral 7B for adaptive MT

# Installations

In [None]:
# Check which GPU is attached to this runtime (`-L` lists the available GPUs)
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-40GB (UUID: GPU-c4241e29-1730-287f-4348-83d0f5e2e534)


In [None]:
!pip3 install datasets transformers accelerate bitsandbytes peft trl -q

# Loading the data

In [None]:
import os

# Root folder of the data on Google Drive.
# NOTE(review): presumably Drive must already be mounted at /content/drive;
# no mount cell is visible in this notebook - confirm.
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

# Work from the data folder so the dataset files can be opened by bare file name
os.chdir(directory)
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [None]:
# Read the training data: one sentence per line in each of the two files

source_train_file = "all-filtered.es.real.smalltrain"
target_train_file = "all-filtered.en.real.smalltrain"

with open(source_train_file, encoding="utf-8") as source, open(target_train_file, encoding="utf-8") as target:
  # Iterating a text file yields its lines; surrounding whitespace is stripped from each
  source_sentences = list(map(str.strip, source))
  target_sentences = list(map(str.strip, target))

# Show the first sentence of each file as a quick sanity check
print(source_sentences[0])
print(target_sentences[0])

El consumo nocivo de alcohol es responsable por cerca de 3% de todas las muertes que ocurren en el planeta, incluyendo desde cirrosis y cáncer hepático hasta accidentes, caídas, intoxicaciones y homicidios.
The harmful use of alcohol is responsible for about 3% of all deaths that occur on the planet, ranging from liver cancer and cirrhosis to accidents, falls, poisoning and murder.


In [None]:
# Load the fuzzy matches from the unique context dataset.
# Every line is split on " ||| "; fields 0-3 are read as the match score,
# the fuzzy source, the new ("online") source and the fuzzy target, in that order.

context_train_file = "all-filtered.esen.ms-multi-12.online.smalltrain"

with open(context_train_file, encoding="utf-8") as context:
  lines = [row.strip().split(" ||| ") for row in context]

# Pull each column out of the parsed rows
scores = [float(fields[0].strip()) for fields in lines]
fuzzy_source_sentences = [fields[1].strip() for fields in lines]
online_source_sentences = [fields[2].strip() for fields in lines]
fuzzy_target_prefixes = [fields[3].strip() for fields in lines]

# Print the text fields of one row to check the parsing
n = 0
print(fuzzy_source_sentences[n])
print(online_source_sentences[n])
print(fuzzy_target_prefixes[n])

# Create the prompts

In [None]:
# Function to create zero-shot and one-shot prompts

def create_prompt(source_lang,
                  target_lang,
                  fuzzy_sources,
                  fuzzy_targets,
                  new_sources,
                  new_targets,
                  one_shot=True
                  ):
  """Build training prompts that pair each new source sentence with its translation.

  Every sentence pair is rendered as two lines::

      <source_lang>: <source sentence>
      <target_lang>: <target sentence>

  Args:
    source_lang: Label placed before source sentences (e.g. "Spanish").
    target_lang: Label placed before target sentences (e.g. "English").
    fuzzy_sources: Source side of the fuzzy matches; used only when `one_shot` is True.
    fuzzy_targets: Target side of the fuzzy matches; used only when `one_shot` is True.
    new_sources: Source sentences to translate.
    new_targets: Reference translations of `new_sources`.
    one_shot: If True, each prompt starts with the fuzzy-match pair as an example,
      followed by the new pair. If False, the prompt contains the new pair only.

  Returns:
    A list of prompt strings. The inputs are combined with `zip`, so the result
    is as long as the shortest input that is used.
  """

  def format_pair(src, tgt):
    # One "<label>: <sentence>" line per language, separated by a newline
    return source_lang + ": " + src + "\n" + target_lang + ": " + tgt

  if one_shot:
    return [format_pair(fuzzy_src, fuzzy_tgt) + "\n" + format_pair(new_src, new_tgt)
            for fuzzy_src, fuzzy_tgt, new_src, new_tgt
            in zip(fuzzy_sources, fuzzy_targets, new_sources, new_targets)]

  return [format_pair(new_src, new_tgt)
          for new_src, new_tgt in zip(new_sources, new_targets)]

In [None]:
# Language names used as the line labels ("Spanish: ...", "English: ...") in every prompt
source_lang = "Spanish"
target_lang = "English"

In [None]:
# Zero-shot variant: each prompt holds only the new sentence pair
prompts_zero_shot = create_prompt(
    source_lang=source_lang,
    target_lang=target_lang,
    fuzzy_sources=fuzzy_source_sentences,
    fuzzy_targets=fuzzy_target_prefixes,
    new_sources=online_source_sentences,
    new_targets=target_sentences,
    one_shot=False,
)

# One-shot variant: each prompt starts with the fuzzy-match pair as an example
prompts_one_shot = create_prompt(
    source_lang=source_lang,
    target_lang=target_lang,
    fuzzy_sources=fuzzy_source_sentences,
    fuzzy_targets=fuzzy_target_prefixes,
    new_sources=online_source_sentences,
    new_targets=target_sentences,
    one_shot=True,
)

# Train on both variants together: zero-shot prompts first, then one-shot prompts
prompts = prompts_zero_shot + prompts_one_shot

print(len(prompts))

20000


In [None]:
# Inspect one prompt of each kind: the first is zero-shot and the last is one-shot
print(prompts[0], "\n")
print(prompts[-1])

Spanish: El consumo nocivo de alcohol es responsable por cerca de 3% de todas las muertes que ocurren en el planeta, incluyendo desde cirrosis y cáncer hepático hasta accidentes, caídas, intoxicaciones y homicidios.
English: The harmful use of alcohol is responsible for about 3% of all deaths that occur on the planet, ranging from liver cancer and cirrhosis to accidents, falls, poisoning and murder. 

Spanish: Aceite de ricino, hidrogenado
English: Castor oil, hydrogenated
Spanish: Amyvid contiene etanol y sodio
English: Amyvid contains ethanol and sodium


In [None]:
# Shuffle the prompts so zero-shot and one-shot examples are mixed
import random

# Fixed seed: the train/validation split is taken by position after this shuffle,
# so an unseeded shuffle would give a different split on every run.
random.seed(42)
random.shuffle(prompts)

print(prompts[0], "\n")
print(prompts[-1])

Spanish: NE: no estudiado.
English: NS: not studied. 

Spanish: MSTW 30 mg Krugmann 30 mg
English: MSTW 30 mg Krugmann


# Fine-tuning with Hugging Face

# Create the dataset

In [None]:
from datasets import Dataset, DatasetDict, load_dataset

# The first 19,000 shuffled prompts are used for training; the remainder for validation
train_size = 19000
train_split = Dataset.from_dict({"text": prompts[:train_size]})
validation_split = Dataset.from_dict({"text": prompts[train_size:]})

dataset = DatasetDict({"train": train_split, "validation": validation_split})

In [None]:
# Display the splits with their features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 19000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [None]:
from pprint import pprint

# Print one training example to check the prompt format
pprint(dataset['train'][4])

{'text': 'Spanish: El presente estudio, por tanto, tiene como objetivo adaptar '
         'y validar para el uso en el ámbito de la APS, por medio de consenso '
         'de especialistas, un conjunto de indicadores para evaluación del '
         'procesamiento de artículos médicos odontológicos hospitalarios.\n'
         "English: Therefore, this study's objective was to adapt and validate "
         'a set of indicators to assess the sterilization processing of dental '
         'and medical articles through expert consensus, to be used within PHC '
         'services.'}


## Load the model

In [None]:
import os
from google.colab import userdata

# The base path is read from the Colab user data entry named "shared_drive"
# instead of being hard-coded in the notebook
shared_drive = userdata.get("shared_drive")

model_directory = os.path.join(shared_drive, "models")

# Switch to the models folder; relative paths used later (e.g. the training
# output directory) resolve from here
os.chdir(model_directory)
os.getcwd()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-v0.1"

# Downloaded model files are cached in the models folder
cache_dir = model_directory

# 4-bit NF4 quantization with double quantization; bfloat16 is the compute dtype
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model, letting `device_map='auto'` place it on the available device(s)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    device_map='auto',
    cache_dir=cache_dir,
    use_cache=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Reuse the EOS token as the padding token, and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Ask the tokenizer to append EOS to encoded texts, then display both flags to confirm the settings
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

In [None]:
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings for causal language modelling: rank 64, alpha 16, dropout 0.1, bias "none"
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters from peft_config.
# NOTE(review): both lines rebind `model`, so re-running this cell would wrap an already-wrapped model.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

## Train the model

In [None]:
# model.gradient_checkpointing_enable()

In [None]:
# Folder for the training checkpoints (a relative path, so it resolves from the current working directory)
output_directory = "mistral_finetuning_v1__"

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=output_directory,
    num_train_epochs=1,
    #max_steps = 594, # comment out this line if you want to train in epochs
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # 0.03 is a fraction of the total training steps, so it is passed as
    # `warmup_ratio`; `warmup_steps` expects an absolute number of steps.
    # NOTE(review): the 'constant' scheduler set below has no warmup phase;
    # switch to 'constant_with_warmup' if the warmup is meant to take effect.
    warmup_ratio=0.03,
    logging_steps=20,
    # Save a checkpoint and evaluate once at the end of each epoch
    save_strategy="epoch",
    evaluation_strategy="epoch",
    #evaluation_strategy="steps",
    #eval_steps=20, # comment out this line if you want to evaluate at the end of each epoch
    #eval_accumulation_steps=4,
    learning_rate=2e-3,  # 2e-4 # lower LR for smaller batch sizes
    bf16=True,
    lr_scheduler_type='constant',
)

In [None]:
from trl import SFTTrainer

max_seq_length = 512  # increase if needed

# Supervised fine-tuning on the "text" column of the train/validation splits,
# with packing enabled and sequences limited to max_seq_length
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    packing=True,
)

In [None]:
# Launch fine-tuning with the trainer configured earlier
trainer.train()

# Test generation with Hugging Face

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import os


# Adapter checkpoint written during training; the step number in the folder name depends on the run
peft_model_path = os.path.join(output_directory, "checkpoint-121")  # change checkpoint path

# The adapter config provides the name of the base model to load
peftconfig = PeftConfig.from_pretrained(peft_model_path)

model_base = AutoModelForCausalLM.from_pretrained(peftconfig.base_model_name_or_path,
                                             device_map = "auto",
                                             cache_dir = model_directory
                                            )

# Use the same cache folder as the base model so the tokenizer files are not
# fetched into a different (default) cache location.
# NOTE(review): this rebinds `tokenizer` to a freshly loaded instance, so the
# pad/EOS settings applied to the training tokenizer do not carry over -
# presumably intended for generation; confirm.
tokenizer = AutoTokenizer.from_pretrained(peftconfig.base_model_name_or_path,
                                          cache_dir = model_directory
                                         )

# Attach the fine-tuned adapter weights to the base model
new_model = PeftModel.from_pretrained(model_base, peft_model_path)

print("Peft model loaded")

In [None]:
def generate_response(prompt, model, max_new_tokens=20):
  """Greedily generate a continuation of `prompt` and return only the new text.

  Uses the notebook-level `tokenizer` to encode the prompt and decode the result.

  Args:
    prompt: Text to continue, e.g. a prompt ending with the target-language label.
    model: Model exposing `generate()` and `device`.
    max_new_tokens: Upper bound on the number of generated tokens (default 20).

  Returns:
    The decoded text of the newly generated tokens, without the prompt and
    without special tokens such as "<s>".
  """
  # Send the inputs to the device the model lives on instead of assuming 'cuda'
  model_inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(model.device)

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=max_new_tokens,
                                 min_new_tokens=1,
                                 do_sample=False,
                                 pad_token_id=tokenizer.eos_token_id)

  # The output ids start with the prompt ids, so slice them off by length.
  # Removing the prompt with str.replace() on the decoded text left the "<s>"
  # marker in the result and relied on decoding reproducing the prompt exactly.
  prompt_length = model_inputs["input_ids"].shape[1]
  new_token_ids = generated_ids[:, prompt_length:]

  decoded_output = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)

  return decoded_output[0]

In [None]:
# Zero-shot test prompt in the training format; it ends with the target-language label for the model to complete
prompt = """Spanish: Período de validez después de abierto el envase: 10 horas.
English:"""

In [None]:
# Translate the test prompt with the fine-tuned model (base model plus adapter)
generate_response(prompt, new_model)

'<s>  Period after opening the container: 10 hours.\n\nSpanish: Período de'

# Convert the fine-tuned model to CTranslate2

* https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/Convert-Mistral-Finetuned-CTranslate2.ipynb