In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install --upgrade peft trl
#!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git


In [None]:
!pip install -q datasets scipy protobuf #wandb
!pip install -q ipywidgets==7.7.1
!pip install sentencepiece

In [None]:
# Restart Kernel after installing packages

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Shell command: show the state of the GPUs visible on this machine.
!nvidia-smi

In [None]:

import torch
# Print the CUDA version reported by the installed torch build.
print(torch.version.cuda)

In [None]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

In [None]:
# Model and optimizer state dicts share the same settings:
# offloaded to CPU and not restricted to rank 0.
_full_state_kwargs = dict(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(**_full_state_kwargs),
    optim_state_dict_config=FullOptimStateDictConfig(**_full_state_kwargs),
)

# Accelerator configured with the FSDP plugin built above.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [None]:
### FINE-TUNING DU MODELE

In [None]:
# Download the Sirene conversations (a list of message dictionaries, stored as CSV) from S3.
import os, sys, s3fs

# The endpoint host is read from the environment; a missing variable raises KeyError.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

bucket = 'vlapegue/train_sirene'
# Keep (up to) the last three listing entries so they can still be displayed afterwards.
files = fs.ls(bucket)[-3:]
if not files:
    raise FileNotFoundError(f"No objects found under {bucket}")

# The training CSV is the last entry of the listing. The original `files[2]`
# selected the same object but raised IndexError when fewer than three entries existed.
# NOTE(review): this relies on listing order — consider naming the object key explicitly.
fs.download(files[-1], 'train_sirene.csv')

In [None]:
# Show the listing entries kept by the download cell.
print(files)

In [None]:
from datasets import load_dataset
# Read the downloaded CSV file as a single "train" split.
dataset = load_dataset(
    "csv",
    data_files="train_sirene.csv",
    split='train',
)

In [None]:
# Peek at the first two records as loaded from the CSV.
for row_idx in range(2):
    print(dataset[row_idx])

In [None]:
# The "messages" column is a string; evaluate it back into a Python list.
# (Original note: this changes the order of the entries, which is presumably unimportant.)
import ast


def _parse_messages(row):
    """Return the row's "messages" string evaluated as a Python literal."""
    return {"messages": ast.literal_eval(row['messages'])}


dataset = dataset.map(_parse_messages)

In [None]:
# Peek at the first two records again, now with "messages" parsed.
for row_idx in range(2):
    print(dataset[row_idx])


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Checkpoint used for both fine-tuning and the inference comparisons.
base_model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
_quantization_kwargs = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**_quantization_kwargs)

In [None]:
# Tokenizer of the base checkpoint: 1024-token limit, left padding, EOS appended.
# NOTE(review): `add_eos_token=True` presumably also appends EOS to prompts that
# are tokenized with a plain `tokenizer(...)` call for generation — confirm this is intended.
_tokenizer_kwargs = dict(
    model_max_length=1024,
    padding_side="left",
    add_eos_token=True,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, **_tokenizer_kwargs)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Au cas où il faudrait appliquer un chat template à SFTTrainer (a priori pas nécessaire)
# tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
# print(tokenizer.decode(tokenized_chat[0]))


In [None]:
# Load the base checkpoint using the quantization settings in `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
)

In [None]:
# Sanity check: generate a short continuation and keep the per-step scores.
# The tokenized prompt is moved to the model's device; generate() warns when
# the input ids and the model live on different devices.
# NOTE(review): the tokenizer was created with add_eos_token=True, so this raw
# prompt presumably ends with an EOS token — confirm that is wanted here.
inputs = tokenizer(
    "Qui est l'actuel président de la République française ?",
    return_tensors="pt",
).to(model.device)
output_generate = model.generate(
    **inputs,
    max_new_tokens=20,
    return_dict_in_generate=True,
    output_scores=True,
)
# normalize_logits=True is requested so the scores are normalized (log-probabilities).
# The result is brought back to CPU so later cells can call `.numpy()` on it.
transition_scores = model.compute_transition_scores(
    output_generate.sequences, output_generate.scores, normalize_logits=True
).cpu()

In [None]:
# Inspect the type and content of the tokenized prompt.
print(type(inputs))
print(inputs)

In [None]:
import numpy as np

# For a model that is not encoder-decoder, skip the prompt-length prefix of
# `sequences` so that only the newly generated tokens remain.
input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
generated_tokens = output_generate.sequences[:, input_length:]

# Columns: | token id | token string | log probability | probability
for tok, score in zip(generated_tokens[0], transition_scores[0]):
    # `.cpu()` keeps the conversion valid when the scores live on the GPU
    # (`Tensor.numpy()` only works for CPU tensors).
    log_prob = score.cpu().numpy()
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {log_prob:.3f} | {np.exp(log_prob):.2%}")

In [None]:
# Show one record and the prompt text obtained from its messages.
# The original called `generate_prompt`, which is never defined in this notebook
# (NameError on a fresh run); the tokenizer's chat template is used instead.
# NOTE(review): assumes each item of "messages" is a {"role", "content"} dict
# accepted by the chat template — confirm against the data.
print(dataset[0])
print(tokenizer.apply_chat_template(dataset[0]["messages"], tokenize=False))

In [None]:
# One seed for both the shuffle and the split, so the train/test partition is
# the same on every run (the split was previously unseeded).
SPLIT_SEED = 1234

dataset = dataset.shuffle(seed=SPLIT_SEED)

# Hold out 5% of the records for testing.
train_test_split = dataset.train_test_split(test_size=0.05, seed=SPLIT_SEED)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

print(train_dataset)
print(test_dataset)

In [None]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model



In [None]:
# Turn on gradient checkpointing on the loaded model.
model.gradient_checkpointing_enable()

In [None]:
# Print the module tree of the loaded model.
print(model)

In [None]:
import bitsandbytes as bnb
def find_all_linear_names(model, linear_cls=None):
  """Collect the leaf names of all linear layers that LoRA can target.

  Args:
    model: object exposing ``named_modules()`` (e.g. a ``torch.nn.Module``).
    linear_cls: layer class (or tuple of classes) to look for. Defaults to
      ``bnb.nn.Linear4bit``, i.e. the layers of a 4-bit quantized model.

  Returns:
    Sorted list of unique module names (last component of the dotted path),
    with ``lm_head`` excluded. Sorting makes the result reproducible.
  """
  if linear_cls is None:
    # Resolved lazily so callers passing their own class do not need bitsandbytes.
    linear_cls = bnb.nn.Linear4bit

  lora_module_names = {
    name.split('.')[-1]
    for name, module in model.named_modules()
    if isinstance(module, linear_cls)
  }
  # The output head is never a LoRA target (needed for 16-bit); remove it once,
  # after the scan, instead of re-checking on every iteration.
  lora_module_names.discard('lm_head')
  return sorted(lora_module_names)

In [None]:
# Names of the linear layers that will receive LoRA adapters.
modules = find_all_linear_names(model)
print(modules)

In [None]:
from peft import LoraConfig, get_peft_model

In [None]:
# LoRA settings; adapters are attached to the module names collected in `modules`.
_lora_kwargs = {
    "r": 8,
    "lora_alpha": 32,
    "target_modules": modules,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**_lora_kwargs)

# Build the PEFT model, then hand it to the accelerator.
model_peft = accelerator.prepare_model(get_peft_model(model, lora_config))

In [None]:
# Report how many parameters are trainable compared with the total.
trainable, total = model_peft.get_nb_trainable_parameters()
percentage = trainable / total * 100
print(f"Trainable: {trainable} | total: {total} | Percentage: {percentage:.4f}%")

In [None]:
from datetime import datetime

# Names identifying this fine-tuning run and the directory derived from them.
project = "SFT-mistralai/Mistral-7B-Instruct-v0.3"
base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
run_name = "-".join((base_model_name, project))
output_dir = f"./{run_name}"

In [None]:
# Show the directory name derived from the run name.
print(output_dir)

In [None]:
import transformers

from trl import SFTTrainer

# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Release cached GPU memory held by torch.
torch.cuda.empty_cache()

In [None]:
# Supervised fine-tuning of LoRA adapters on the training split.
# NOTE(review): `model` has presumably already been modified by the earlier
# get_peft_model(model, lora_config) call; passing `peft_config` again would
# then re-apply the same adapter configuration — confirm which path is intended.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    # No `dataset_text_field`: the "messages" column holds lists of chat turns,
    # not plain strings, so the trainer's conversational-format handling is
    # relied on instead of pointing a text field at it.
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        auto_find_batch_size=True,
        # Evaluation stays disabled; the strategy is left at its default ("no")
        # rather than set through the deprecated `evaluation_strategy` keyword.
        do_eval=False,
        gradient_checkpointing=True,
        gradient_accumulation_steps=8,
        bf16=True,
        warmup_steps=2,
        max_steps=50,
        learning_rate=2e-4,
        logging_steps=10,
        output_dir="outputs",
        #eval_steps=10,
        optim="paged_adamw_8bit",
        save_strategy="epoch",
        # The string "none" disables every reporting integration; Python `None`
        # falls back to the library default instead.
        report_to="none",
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    #data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
# Disable the generation cache on the model config, then run the training loop.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:

# SAUVEGARDE DU MODELE

In [None]:
# Directory the trained adapter is written to. `new_model` was previously only
# assigned further down in the notebook, so this cell raised NameError on a
# fresh run; the same path is used here.
new_model = "./Mistral_Sirene_240718"
trainer.model.save_pretrained(new_model)

In [None]:
# Fetch the previously saved PEFT model from S3 with the `mc` command-line client.
! mc cp s3/vlapegue/train_sirene/Mistral_Sirene_240718/Mistral_Sirene_240718 ./Mistral_Sirene_240718 --recursive

In [None]:
# Reload the base checkpoint without quantization for the inference comparisons.
# `temperature` is a generation setting, not a model-loading argument, so it is
# no longer passed here; decoding options belong in the generate() calls.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    #torch_dtype=torch.float16,
    #device_map={"": 0},
)


In [None]:
# Utilisation du modèle de base
messages = [{"role": "user", "content": "Qui est l'actuel président de la République française ?"}]
model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
output_generate=base_model.generate(model_inputs,max_new_tokens=80, temperature=0,return_dict_in_generate=True, output_scores=True,pad_token_id=tokenizer.eos_token_id)
transition_scores = base_model.compute_transition_scores(output_generate.sequences, output_generate.scores, normalize_logits=True)

In [None]:
import numpy as np

input_length = 1 if base_model.config.is_encoder_decoder else model_inputs.size(dim=1)

generated_tokens = output_generate.sequences[:, input_length:]

tokens=[]
probas=[]

for tok, score in zip(generated_tokens[0], transition_scores[0]):

# | token | token string | log probability | probability
    tokens.append(tokenizer.decode(tok))
    probas.append(np.exp(score.numpy()))
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

In [None]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'tokens':tokens,'probas': probas})
df.to_csv("general_base_tokens.csv", sep=";")

In [None]:
# Utilisation du modèle de base pour une question Sirene

messages = [{"role": "user", "content": "J’aimerais obtenir mon avis de situation au répertoire pour avis et suis disponible si d’autres documents s’avéraient nécessaires."}]
model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
#inputs = tokenizer("J’aimerais obtenir mon avis de situation au répertoire pour avis et suis disponible si d’autres documents s’avéraient nécessaires.", return_tensors="pt")
output_generate=base_model.generate(model_inputs,max_new_tokens=80, temperature=0,return_dict_in_generate=True, output_scores=True,pad_token_id=tokenizer.eos_token_id)
transition_scores = base_model.compute_transition_scores(output_generate.sequences, output_generate.scores, normalize_logits=True)

In [None]:
import numpy as np

input_length = 1 if base_model.config.is_encoder_decoder else model_inputs.size(dim=1)

generated_tokens = output_generate.sequences[:, input_length:]

tokens=[]
probas=[]

for tok, score in zip(generated_tokens[0], transition_scores[0]):

# | token | token string | log probability | probability
    tokens.append(tokenizer.decode(tok))
    probas.append(np.exp(score.numpy()))
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

In [None]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'tokens':tokens,'probas': probas})
df.to_csv("sirene_base_tokens.csv", sep=";")

In [None]:
new_model = "./Mistral_Sirene_240718"
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

In [None]:
# Utilisation du modèle fine-tuné

messages = [{"role": "user", "content": "Qui est l'actuel président de la République française ?"}]
model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
output_generate=merged_model.generate(model_inputs,max_new_tokens=80, temperature=0,return_dict_in_generate=True, output_scores=True,pad_token_id=tokenizer.eos_token_id)
transition_scores = merged_model.compute_transition_scores(output_generate.sequences, output_generate.scores, normalize_logits=True)

In [None]:
import numpy as np

#input_length = 1 if merged_model.config.is_encoder_decoder else inputs.input_ids.shape[1]
input_length = 1 if merged_model.config.is_encoder_decoder else model_inputs.size(dim=1)

generated_tokens = output_generate.sequences[:, input_length:]

tokens=[]
probas=[]

for tok, score in zip(generated_tokens[0], transition_scores[0]):

# | token | token string | log probability | probability
    tokens.append(tokenizer.decode(tok))
    probas.append(np.exp(score.numpy()))
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

In [None]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'tokens':tokens,'probas': probas})
df.to_csv("general_merged_tokens.csv", sep=";")

In [None]:
# Utilisation du modèle fine-tuné pour une question Sirene

messages = [{"role": "user", "content": "J’aimerais obtenir mon avis de situation au répertoire pour avis et suis disponible si d’autres documents s’avéraient nécessaires."}]
model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
output_generate=merged_model.generate(model_inputs,max_new_tokens=80, temperature=0,return_dict_in_generate=True, output_scores=True,pad_token_id=tokenizer.eos_token_id)
transition_scores = merged_model.compute_transition_scores(output_generate.sequences, output_generate.scores, normalize_logits=True)

In [None]:
import numpy as np

#input_length = 1 if merged_model.config.is_encoder_decoder else inputs.input_ids.shape[1]
input_length = 1 if merged_model.config.is_encoder_decoder else model_inputs.size(dim=1)

generated_tokens = output_generate.sequences[:, input_length:]

tokens=[]
probas=[]

for tok, score in zip(generated_tokens[0], transition_scores[0]):

# | token | token string | log probability | probability
    tokens.append(tokenizer.decode(tok))
    probas.append(np.exp(score.numpy()))
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

In [None]:
import numpy as np
import pandas as pd
df = pd.DataFrame({'tokens':tokens,'probas': probas})
df.to_csv("sirene_merged_tokens.csv", sep=";")

In [None]:
# Utilisation du modèle de base avec model_inputs
messages = [{"role": "user", "content": "Qui est l'actuel président de la République française ?"}]
model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
output_generate=base_model.generate(model_inputs,max_new_tokens=80, temperature=0,return_dict_in_generate=True, output_scores=True,pad_token_id=tokenizer.eos_token_id)
transition_scores = base_model.compute_transition_scores(output_generate.sequences, output_generate.scores, normalize_logits=True)

In [None]:
import numpy as np

#input_length = 1 if base_model.config.is_encoder_decoder else inputs.input_ids.shape[1]
input_length = 1 if base_model.config.is_encoder_decoder else model_inputs.size(dim=1)

generated_tokens = output_generate.sequences[:, input_length:]

tokens=[]
probas=[]

for tok, score in zip(generated_tokens[0], transition_scores[0]):

# | token | token string | log probability | probability
    tokens.append(tokenizer.decode(tok))
    probas.append(np.exp(score.numpy()))
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

In [None]:
# Utilisation du modèle de base avec inputs

inputs = tokenizer("Qui est l'actuel président de la République française ?", return_tensors="pt")
output_generate=base_model.generate(**inputs,max_new_tokens=80, temperature=0, return_dict_in_generate=True, output_scores=True)
transition_scores = base_model.compute_transition_scores(output_generate.sequences, output_generate.scores, normalize_logits=True)

In [None]:
import numpy as np

input_length = 1 if base_model.config.is_encoder_decoder else inputs.input_ids.shape[1]
#input_length = 1 if base_model.config.is_encoder_decoder else model_inputs.size(dim=1)

generated_tokens = output_generate.sequences[:, input_length:]

tokens=[]
probas=[]

for tok, score in zip(generated_tokens[0], transition_scores[0]):

# | token | token string | log probability | probability
    tokens.append(tokenizer.decode(tok))
    probas.append(np.exp(score.numpy()))
    print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")