In [None]:
'''

Yanis Bouchilloux                              29/08/2024

This file is the complete version of the file "train_model"
Not all cells have to be executed: there are three alternative ways to load the data.
Read the cell descriptions to decide which ones you want to run.

'''

In [1]:
#Import all the libraries needed in this notebook

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, pipeline
from datasets import load_dataset, Dataset, concatenate_datasets
from trl import SFTTrainer
import torch
import json
!pip install pypdf
from pypdf import PdfReader 
!pip install spacy
!python -m spacy download en_core_web_sm
import spacy


MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "train-model-phi-3-mini-4k"
DATASET_NAME = "macadeliccc/opus_samantha"
LOCAL_FILE_NAME = "The Project Gutenberg eBook of Psychology of the Unconscious"

if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

  from .autonotebook import tqdm as notebook_tqdm


[0mCollecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
# Load the Phi-3 tokenizer and causal-LM weights identified by MODEL_ID.
# trust_remote_code=True allows the repository's custom model code to run.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.29s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Data loading, option 1/3: fetch the dataset from Hugging Face.

full_train_split = load_dataset(DATASET_NAME, split="train")

EOS_TOKEN = tokenizer.eos_token_id

# Keep only the first 500 rows so later processing is faster.
dataset = full_train_split.select(range(500))

In [3]:
# Data loading, option 2/3: read question/answer records from a local JSON
# file (requires qa_dataset.json in the working directory).

# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text.
with open('qa_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

def formatting_prompts_func(example):
    """Turn one question/answer record into a two-turn conversation.

    Args:
        example: mapping with a 'question' and an 'answer' entry.

    Returns:
        A dict with a single "conversations" key holding a user turn
        (the question) followed by an assistant turn (the answer).
    """
    user_turn = {"role": "user", "value": example['question']}
    assistant_turn = {"role": "assistant", "value": example['answer']}
    return {"conversations": [user_turn, assistant_turn]}

# Convert every loaded record into the conversation layout, then wrap the
# resulting list of dicts in a Dataset.
formatted_dataset = list(map(formatting_prompts_func, data))

dataset = Dataset.from_list(formatted_dataset)

In [3]:
# Data loading, option 3/3: build conversations from a local PDF
# (requires "The Project Gutenberg eBook of Psychology of the Unconscious.pdf").
#
# Each page becomes one conversation. The page text is split into sentences
# with spaCy, and the sentences are assigned alternately to the "user" and
# "assistant" roles, always starting with "user". Assistant turns are prefixed
# with "According to <LOCAL_FILE_NAME> : ".

formatted_dataset = []

nlp = spacy.load("en_core_web_sm")

reader = PdfReader(LOCAL_FILE_NAME + '.pdf')
pages = reader.pages[:260]  # only the first 260 pages are used
s = 0  # running total of sentences found across all processed pages
for page in pages:
    convList = {"conversations": []}
    # Newlines are removed (not replaced by a space) before sentence splitting.
    doc = nlp(page.extract_text().replace("\n", ""))
    sentences = list(doc.sents)
    s += len(sentences)
    # True -> next sentence is a user turn; False -> an assistant turn.
    # A page with an odd number of sentences ends on a user turn.
    prompt = True
    for sentence in sentences:
        sentence = str(sentence).strip()
        if prompt:
            convList["conversations"].append({"role" : "user", "value" : sentence})
        else:
            convList["conversations"].append({"role" : "assistant", "value" : "According to " + LOCAL_FILE_NAME + " : " + sentence})
        prompt = not prompt
    formatted_dataset.append(convList)

print(s)
dataset = Dataset.from_list(formatted_dataset)

5919


In [4]:
#Tokenize the dataset and separate it into the eval_dataset and the train_dataset

def tokenize_function(examples):
    """Tokenize a batch of conversations as flat text.

    Args:
        examples: batch mapping whose 'conversations' entry is a list of
            conversations, each a list of messages carrying a 'value' field.

    Returns:
        The output of the module-level `tokenizer` for the flattened texts,
        padded and truncated to 512 tokens.
    """
    # Merge all messages of each conversation into one space-separated string.
    combined_texts = []
    for conversation in examples['conversations']:
        message_values = [message['value'] for message in conversation]
        combined_texts.append(" ".join(message_values))
    # Tokenize the merged strings.
    return tokenizer(
        combined_texts,
        padding='max_length',
        truncation=True,
        max_length=512,
    )

# Tokenize the dataset in batches of 4 examples.
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=4)

# Shuffle with a fixed seed so the train/eval split is the same on every run
# (an unseeded shuffle produces a different split each time).
SPLIT_SEED = 42
shuffled_tokenized_dataset = tokenized_dataset.shuffle(seed=SPLIT_SEED)

# The first tenth of the shuffled rows is held out for evaluation;
# everything after it is used for training.
eval_size = len(shuffled_tokenized_dataset) // 10
eval_dataset = shuffled_tokenized_dataset.select(range(eval_size))
train_dataset = shuffled_tokenized_dataset.select(range(eval_size, len(shuffled_tokenized_dataset)))

Map: 100%|██████████| 260/260 [00:00<00:00, 652.37 examples/s]


In [5]:
# Fine-tuning options.

# Mixed precision: bf16 when CUDA reports support for it, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

args = TrainingArguments(
    output_dir=NEW_MODEL_NAME,
    # Schedule
    num_train_epochs=3,
    max_steps=-1,
    learning_rate=1e-4,
    lr_scheduler_type="linear",
    optim="paged_adamw_32bit",
    # Batching / memory
    per_device_train_batch_size=7,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Precision
    bf16=bf16_supported,
    fp16=not bf16_supported,
    # Evaluation, logging and checkpointing
    eval_strategy="steps",
    logging_steps=4,
    save_strategy="epoch",
)

In [6]:
# Train the model with the supervised fine-tuning trainer.

trainer = SFTTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="conversations",
)
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [4]:
# Reload the tokenizer and model from a checkpoint saved during training.

model_checkpoint = f"./{NEW_MODEL_NAME}/checkpoint-24"

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    torch_dtype="auto",
    trust_remote_code=True,
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:31<00:00,  7.94s/it]


In [5]:
# Test the model: generate a completion for a single prompt.

prompt = "What is Analytic Psychology?"

# Generation settings, collected in one place so they are easy to tune.
generation_kwargs = {
    "max_length": 150,
    "min_length": 50,
    "repetition_penalty": 2.0,
    "num_return_sequences": 1,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "num_beams": 5,
    "early_stopping": True,
    "temperature": 0.7,
}

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, **generation_kwargs)

# Decode the first returned sequence, dropping special tokens.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


What is Analytic Psychology? According to The Project Gutenberg eBook of Psychology of the Unconscious : It is a branch of psychology which investigates the relations of the individual  to himself and to the world from the standpoint of the processes of  transference and introversion. According to The Project Gutenberg eBook of Psychology of the Unconscious : The theory of analytic psychology is based upon the  assumption that the libido is the motive power of the individual,  and that through the mechanism of transference and introversion it is  possible for the individual to become conscious of himself and his relation  to the world. According to The Project Gutenberg eBook of Psych
