In [7]:
'''

Yanis Bouchilloux                              24/08/2024

This file is a lightweight version of the file "train_model".
In this one, all cells can be executed in linear order (top to bottom).
Read the comment at the top of each cell to learn more about its role.

'''

'\n\nFrom Yanis Bouchilloux, the 24/08/2024\n\nThis file is a lightened version of the file "train_model"\nIn this one, all cells can be executed (in the linear order).\nYou can read the descriptions of the cells to know more about their roles.\n\n'

In [4]:
#Import all the libraries needed in this Notebook

import json
import os

import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer

# Base model to fine-tune and the name of the local output directory that
# the training cell writes its checkpoints to.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "train-model-phi-3-mini-4k"
# NOTE(review): not referenced by the cells visible in this notebook; kept
# in case another cell or notebook relies on it.
DATASET_NAME = "macadeliccc/opus_samantha"

# Pick the half-precision dtype: bfloat16 when the GPU supports it, float16
# otherwise. CUDA availability is checked first so the bf16 capability query
# is never issued on a machine without a usable CUDA device.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

In [None]:
#Load the model and the tokenizer from phi-3

# Fetch the tokenizer and the causal-LM weights for the same base model id.
# The two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

In [None]:
#Load the local data and turn it into a Dataset object (requires the file qa_dataset.json)

# Read the question/answer records from disk. The encoding is set explicitly
# because JSON text is UTF-8, while open() otherwise falls back to the
# platform locale encoding and can fail or mis-decode non-ASCII characters.
with open('qa_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

def formatting_prompts_func(example):
    """Build the training text for one question/answer record.

    Args:
        example: mapping that provides the keys 'question' and 'answer'.

    Returns:
        A dict with the single key "Content" holding the text
        "Question: <question>" and "Answer: <answer>" on two lines.

    Raises:
        KeyError: if 'question' or 'answer' is missing from the mapping.
    """
    question = example['question']
    answer = example['answer']
    return {"Content": f"Question: {question}\nAnswer: {answer}"}

# Turn every raw record into its {"Content": ...} training text.
formatted_list = list(map(formatting_prompts_func, data))

# Wrap the list of dicts in a Dataset object for the tokenization cell.
dataset = Dataset.from_list(formatted_list)

In [None]:
#Tokenize the dataset and split it into eval_dataset and train_dataset

def tokenize_function(examples):
    """Tokenize the "Content" column of a batch of examples.

    Every text is truncated or padded to exactly 128 tokens. Relies on the
    notebook-level `tokenizer` loaded in an earlier cell.

    Args:
        examples: batch mapping that provides a "Content" entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    texts = examples["Content"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

# Tokenize the whole dataset in small batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=4)

# Shuffle with a fixed seed so the train/eval split is identical on every
# Restart & Run All (an unseeded shuffle gives a different split each run).
SPLIT_SEED = 42
shuffled_tokenized_dataset = tokenized_dataset.shuffle(seed=SPLIT_SEED)

# The first 10% (rounded down) of the shuffled rows go to evaluation and
# the rest to training.
eval_size = len(shuffled_tokenized_dataset) // 10
eval_dataset = shuffled_tokenized_dataset.select(range(eval_size))
train_dataset = shuffled_tokenized_dataset.select(range(eval_size, len(shuffled_tokenized_dataset)))

# Sanity check: the two splits cover the whole dataset.
assert len(eval_dataset) + len(train_dataset) == len(shuffled_tokenized_dataset)

In [None]:
#Define the options for the fine-tuning

# Decide on mixed precision once. Both flags stay False when no CUDA device
# is available; the unguarded `not torch.cuda.is_bf16_supported()` would
# otherwise turn fp16 on (or raise) on a CPU-only machine.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

args = TrainingArguments(
    # NOTE(review): no eval_steps is given; per the transformers docs it
    # then defaults to logging_steps - confirm this is intended.
    eval_strategy="steps",
    # 7 samples per device x 4 accumulation steps = 28 samples per optimizer step.
    per_device_train_batch_size=7,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=1e-4,
    fp16=use_fp16,
    bf16=use_bf16,
    # -1 leaves the training length to num_train_epochs.
    max_steps=-1,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=4,
    # Checkpoints are written under this directory.
    output_dir=NEW_MODEL_NAME,
    # NOTE(review): paged optimizers presumably require bitsandbytes to be
    # installed - confirm in the target environment.
    optim="paged_adamw_32bit",
    lr_scheduler_type="linear",
)

In [None]:
#Train the model

# Wire the model, tokenizer, both dataset splits and the training options
# into the supervised fine-tuning trainer.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run the fine-tuning; the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
#Load the fine-tuned model and tokenizer from the local checkpoint

# The step number in a checkpoint folder name depends on the dataset size,
# batch size and number of epochs, so "checkpoint-48" only exists for one
# particular configuration. Use it when it is present, otherwise fall back
# to the most recent checkpoint found in the output directory.
model_checkpoint = "./"+NEW_MODEL_NAME+"/checkpoint-48"
if not os.path.isdir(model_checkpoint):
    model_checkpoint = (
        get_last_checkpoint(NEW_MODEL_NAME) if os.path.isdir(NEW_MODEL_NAME) else None
    )
    if model_checkpoint is None:
        raise FileNotFoundError(
            "No checkpoint found in '" + NEW_MODEL_NAME + "'. Run the training cell first."
        )

# Reload the tokenizer and the model weights from the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint,
                                             trust_remote_code=True,
                                             torch_dtype="auto")

In [None]:
#Test the model

# Question sent to the fine-tuned model.
prompt = "What is SDGs?"

# Encode the prompt as PyTorch tensors.
inputs = tokenizer(prompt, return_tensors="pt")

# Generation settings: length bounds plus a repetition penalty.
generation_options = dict(
    max_length=100,
    min_length=50,
    repetition_penalty=2.0,
)
outputs = model.generate(**inputs, **generation_options)

# Decode the first generated sequence, dropping special tokens, and show it.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)