# Fine-tuning di modelli su OpenAI

In [3]:
import os
from langchain_openai import ChatOpenAI                                 # pip install langchain-openai
import pandas as pd
from langchain.schema import HumanMessage, SystemMessage, AIMessage     # pip install langchain
import tiktoken
import numpy as np
from collections import defaultdict
import json

In [9]:
# caricamento dati
data = pd.read_excel("data/fine-tuning dataset.xlsx")                   # pip install openpyxl
data.head(15)

Unnamed: 0,User,Prompt
0,forniscimi il prompt per generare un'immagine ...,You are a professional product photographer an...
1,forniscimi il prompt per generare un immagine ...,You are a professional photographer specializi...
2,Forniscimi il prompt per generare un'immagine ...,Please use the attached image as the base and ...
3,Forniscimi il prompt per generare un'immagine ...,create an extremely unremarkable iPhone selfie...
4,forniscimi il prompt per generare un’immagine ...,create a corporate headshot of the person in t...
5,crea una versione sorridente della persona in ...,create a professional business portrait of the...
6,rendimi la persona in allegato più giovane e d...,create a vibrant and youthful corporate portra...
7,trasformami la persona in allegato in un manag...,create a high-end executive portrait of the pe...
8,adatta la foto in allegato per un articolo su ...,create a magazine-quality financial industry p...
9,fammi la persona allegata in versione CEO visi...,create a visionary CEO portrait of the person ...


In [10]:
# stampa separatamente i valori di User e di Prompt del dataset "data" alla riga 10
print("User:", data.iloc[9]['User'])
print("Prompt:", data.iloc[9]['Prompt'])

User: fammi la persona allegata in versione CEO visionario
Prompt: create a visionary CEO portrait of the person in the provided image, strong confident gaze slightly off-camera, contemporary tailored suit, standing in a minimalistic high-rise office with floor-to-ceiling windows and city view, golden hour lighting creating a subtle halo, 16:9 wide crop, Leica SL2, 50mm f/1.4 lens, warm and inspiring color tones


In [13]:
test = []
for idx, row in data.iterrows():
    test.append({
        "messages": [{
            "role": "system",
            "content": "#phtrlsm#"
        }, {
            "role": "user",
            "content": row.values[0]
        }, {
            "role": "assistant",
            "content": row.values[1] 
        }]
    })

In [14]:
print(test)

[{'messages': [{'role': 'system', 'content': '#phtrlsm#'}, {'role': 'user', 'content': "forniscimi il prompt per generare un'immagine professionale per Pepsi"}, {'role': 'assistant', 'content': 'You are a professional product photographer and digital image creator. Your task is to generate high-quality, visually engaging product images of Pepsi for use in marketing, advertising, and social media.\n\nEach image should follow these core principles:\n\n1. Product Focus:\n\nCreate realistic and detailed images featuring a Pepsi can or bottle. Ensure the Pepsi logo and design elements are prominently displayed, clear, and sharp.\n\n2. Background & Setting Variations:\n\nClean white studio background with soft shadows\n\n3. Lighting Setup:\n\nUse realistic studio lighting with soft key lights and subtle backlighting\nAdd highlights and reflections on metallic surfaces to enhance realism\nMaintain correct light falloff and cast soft shadows on the background\n\n4. Special Effects (optional bu

In [15]:
# creazione di un file JSONL partendo dal nostro file Exce
with open("data/phtrlsm.jsonl", 'w') as f:
    for item in test:
        f.write(json.dumps(item) + "\n")

In [16]:
# recupero file JSONL
data_path = "data/phtrlsm.jsonl"

# caricamento dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# stampa di statistiche elementari sul dataset
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 44
First example:
{'role': 'system', 'content': '#phtrlsm#'}
{'role': 'user', 'content': "forniscimi il prompt per generare un'immagine professionale per Pepsi"}
{'role': 'assistant', 'content': 'You are a professional product photographer and digital image creator. Your task is to generate high-quality, visually engaging product images of Pepsi for use in marketing, advertising, and social media.\n\nEach image should follow these core principles:\n\n1. Product Focus:\n\nCreate realistic and detailed images featuring a Pepsi can or bottle. Ensure the Pepsi logo and design elements are prominently displayed, clear, and sharp.\n\n2. Background & Setting Variations:\n\nClean white studio background with soft shadows\n\n3. Lighting Setup:\n\nUse realistic studio lighting with soft key lights and subtle backlighting\nAdd highlights and reflections on metallic surfaces to enhance realism\nMaintain correct light falloff and cast soft shadows on the background\n\n4. Special Effec

In [17]:
# check conformità / errori
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [22]:
# funzioni per il conteggio dei token e la creazione di statistiche sul dataset
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    '''
    Durante il fine-tuning, spesso vengono utilizzati token speciali per indicare l'inizio `<|startoftext|>` 
    e la fine `<|endoftext|>` di una sequenza. Questi token aiutano il modello a comprendere meglio 
    i limiti dei dati di addestramento.
    
    In questo metodo `tokens_per_message` serve per considerare i 3 token utilizzati nel fine-tuning
    per i tag di inizio e fine esempio.
    '''
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message  # Ogni messaggio ha un overhead fisso di tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += tokens_per_message
    return num_tokens


def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens


def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")


In [23]:
MAX_TOKENS_PER_EXAMPLE = 4096

n_missing_system = 0
n_missing_user = 0
n_messages = []
tot_tokens_per_example = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    tot_tokens_per_example.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(tot_tokens_per_example, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")

n_too_long = sum(l > MAX_TOKENS_PER_EXAMPLE for l in tot_tokens_per_example)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 107, 321
mean / median: 134.47727272727272, 123.0
p5 / p95: 109.3, 147.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 63, 284
mean / median: 93.56818181818181, 84.5
p5 / p95: 71.3, 103.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [20]:
# valori di default suggeriti da OpenAI
TARGET_EPOCHS = 15
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in tot_tokens_per_example)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~5917 tokens that will be charged for during training
By default, you'll train for 15 epochs on this dataset
By default, you'll be charged for ~88755 tokens


# Modello Fine-Tuned

In [None]:
# importa dotenv e carica la chiave OpenAI
from dotenv import load_dotenv # pip install python-dotenv
load_dotenv()


In [37]:
photorealism_model = ChatOpenAI(
    model="ft:gpt-3.5-turbo-1106:personal:photorealism:BqgCFSkM",
    temperature=0.,
    openai_api_key=os.getenv("openai_api_key"),
    max_tokens=1500,
    request_timeout=35
)

In [38]:
photorealism_model.invoke([
    SystemMessage(
        content="#phtrlsm#"),
    HumanMessage(
        content="rendimi la persona in allegato un pugile professionista anni '70"),
])

AIMessage(content='create a 1970s professional boxer portrait from the provided photo, with a gritty, grainy film look, wearing a worn-out Everlast boxing robe, sweat and bruises visible on the face, a focused and determined expression, in a dimly lit gym, 35mm film simulation, Canon AE-1 lens simulation, 3:2 crop', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 71, 'prompt_tokens': 32, 'total_tokens': 103, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'ft:gpt-3.5-turbo-1106:personal:photorealism:BqgCFSkM', 'system_fingerprint': None, 'id': 'chatcmpl-BqgsSmz4vB1OTpaKOCbxgLae8U4Nz', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--a7b13cc1-7deb-46d4-ab64-9554fa8536a1-0', usage_metadata={'input_tokens': 32, 'output_tokens': 71, 'total_tokens

In [39]:
photorealism_model.invoke([
    SystemMessage(
        content="#phtrlsm#"),
    HumanMessage(
        content="rendimi la persona in allegato un dj anni '90"),
])

AIMessage(content='create a 90s DJ persona from the provided image, wearing a colorful windbreaker, baggy jeans, and chunky sneakers, with a backwards cap and small round sunglasses, holding a vinyl record and a pair of Technics SL-1200 turntables, in a dimly lit club booth with neon lights and a haze of cigarette smoke, confident and focused expression, 90s-style film grain and color palette', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 84, 'prompt_tokens': 28, 'total_tokens': 112, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'ft:gpt-3.5-turbo-1106:personal:photorealism:BqgCFSkM', 'system_fingerprint': None, 'id': 'chatcmpl-BqhATQsQX7oiBy1HJGJjanauFCDhU', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--9114eb93-9aea-4486-8708-3670555

In [40]:
photorealism_model.invoke([
    SystemMessage(
        content="#phtrlsm#"),
    HumanMessage(
        content="Forniscimi il prompt per generare un'immagine di un selfie della persona in allegato a a Napoli durante la festa scudetto"),
])


AIMessage(content="create a realistic selfie of the person in the attached photo, celebrating Napoli's Serie A victory, wearing a blue Napoli jersey, in a crowded street with flags and confetti, holding a smartphone at arm's length, genuine smile, warm sunlight, Vesuvius in the background, 3:4 portrait orientation", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 62, 'prompt_tokens': 47, 'total_tokens': 109, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'ft:gpt-3.5-turbo-1106:personal:photorealism:BqgCFSkM', 'system_fingerprint': None, 'id': 'chatcmpl-BqhFOGNjZW559RHlLMefy8IXaULKd', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--8e1f5698-6d58-4422-bd9e-22ed833a5833-0', usage_metadata={'input_tokens': 47, 'output_tokens': 62, 'total_tokens