In [1]:
!pip install -q tiktoken --progress-bar off

In [2]:
import pandas as pd
import numpy as np
import tiktoken # for token counting
from collections import defaultdict

In [3]:
# ESEMPIO DI FORMATTAZIONE DEI TESTI PER IL FINE-TUNING DI GPT
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
# {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}

In [4]:
system_content = """You are a text classifier."""

In [5]:
train_df = pd.read_excel("../data/input/sdg_17_labels_classification_dataset_4760_texts_TRAIN_2023.12.11.xlsx")[['text', 'sdg']].rename(columns={'sdg': 'label'})
val_df = pd.read_excel("../data/input/sdg_17_labels_classification_dataset_1020_texts_DEV_2023.12.11.xlsx")[['text', 'sdg']].rename(columns={'sdg': 'label'})

In [6]:
train_df.head(3)

Unnamed: 0,text,label
0,This Vitamin Reduces Mental Health Problems By...,0
1,'League Of Legends' unveils new Arena game mod...,0
2,Community remembers Maddi Kingsbury at public ...,0


In [7]:
val_df.head(3)

Unnamed: 0,text,label
0,"Murray revealed on Sky Sports, “I hope not, bu...",0
1,"With age and inactive lifestyle, your joints a...",0
2,(Reuters) -Home health and hospice caregiver A...,0


In [8]:
user_content_part1 = """In the following lines, a text enclosed in triple quotes will be given to you.

The task is to classify the text according to the Sustainable Development Goals (SDGs) adopted by all United Nations Member States in 2015.

The constraint to adhere to is that for the classification, you must select only one out of all the possible SDGs.

If you believe that the text to be processed cannot be classified under any of the possible SDGs, then classify it as "---SDG 0---".

Answer with the label only using the format: "---LABEL---"

The input text is:
'''
"""

In [9]:
train_dataset = []
for ind, row in train_df.iterrows():
    train_dataset.append(
        {"messages": [{"role": "system", "content": system_content}, 
                      {"role": "user", "content": user_content_part1 + row['text'] + "\n'''"}, 
                      {"role": "assistant", "content": '"---SDG '+str(row['label'])+'---"'}
                     ]
        }
    )

In [10]:
val_dataset = []
for ind, row in val_df.iterrows():
    val_dataset.append(
        {"messages": [{"role": "system", "content": system_content}, 
                      {"role": "user", "content": user_content_part1 + row['text'] + "\n'''"}, 
                      {"role": "assistant", "content": '"SDG '+str(row['label'])+'"'}
                     ]
        }
    )

In [11]:
train_dataset[0]

{'messages': [{'role': 'system', 'content': 'You are a text classifier.'},
  {'role': 'user',
   'content': 'In the following lines, a text enclosed in triple quotes will be given to you.\n\nThe task is to classify the text according to the Sustainable Development Goals (SDGs) adopted by all United Nations Member States in 2015.\n\nThe constraint to adhere to is that for the classification, you must select only one out of all the possible SDGs.\n\nIf you believe that the text to be processed cannot be classified under any of the possible SDGs, then classify it as "---SDG 0---".\n\nAnswer with the label only using the format: "---LABEL---"\n\nThe input text is:\n\'\'\'\nThis Vitamin Reduces Mental Health Problems By 50%  PsyBlog.\n\'\'\''},
  {'role': 'assistant', 'content': '"---SDG 0---"'}]}

In [12]:
val_dataset[0]

{'messages': [{'role': 'system', 'content': 'You are a text classifier.'},
  {'role': 'user',
   'content': 'In the following lines, a text enclosed in triple quotes will be given to you.\n\nThe task is to classify the text according to the Sustainable Development Goals (SDGs) adopted by all United Nations Member States in 2015.\n\nThe constraint to adhere to is that for the classification, you must select only one out of all the possible SDGs.\n\nIf you believe that the text to be processed cannot be classified under any of the possible SDGs, then classify it as "---SDG 0---".\n\nAnswer with the label only using the format: "---LABEL---"\n\nThe input text is:\n\'\'\'\nMurray revealed on Sky Sports, “I hope not, but you never know  It\'s why athletes need to make the most of it while they\'re still able to because if I was to have another big injury or if something happened to the metal hip, that would be me finished  I wouldn\'t try to come back from another operation or major surgery

In [13]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [14]:
dataset = train_dataset

# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [15]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")

#n_too_long = sum(l > 4096 for l in convo_lens)
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 160, 524
mean / median: 265.91512605042016, 259.0
p5 / p95: 220.0, 323.10000000000036

#### Distribution of num_assistant_tokens_per_example:
min / max: 8, 8
mean / median: 8.0, 8.0
p5 / p95: 8.0, 8.0

0 examples may be over the 16385 token limit, they will be truncated during fine-tuning


In [16]:
# Pricing and default n_epochs estimate
#MAX_TOKENS_PER_EXAMPLE = 4096
MAX_TOKENS_PER_EXAMPLE = 16385


TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~1265756 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~3797268 tokens


In [17]:
dataset = val_dataset

# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [18]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")

#n_too_long = sum(l > 4096 for l in convo_lens)
n_too_long = sum(l > 16385 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 16385 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 162, 419
mean / median: 264.3921568627451, 257.0
p5 / p95: 219.9, 321.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 6, 6
mean / median: 6.0, 6.0
p5 / p95: 6.0, 6.0

0 examples may be over the 16385 token limit, they will be truncated during fine-tuning


In [19]:
import json

# Apri il file .jsonl in modalità scrittura
with open('Mixtral-7Bx8_Fine_Tuning_SDG_TRAIN_data.jsonl', "w") as file_jsonl:
    # Scrivi ogni dizionario come una riga nel file .jsonl
    for dizionario in train_dataset:
        json.dump(dizionario, file_jsonl)  # Scrivi il dizionario come JSON
        file_jsonl.write('\n')  # Aggiungi un nuovo carattere di nuova riga alla fine

In [20]:
import json

# Apri il file .jsonl in modalità scrittura
with open('Mixtral-7Bx8_Fine_Tuning_SDG_VAL_data.jsonl', "w") as file_jsonl:
    # Scrivi ogni dizionario come una riga nel file .jsonl
    for dizionario in val_dataset:
        json.dump(dizionario, file_jsonl)  # Scrivi il dizionario come JSON
        file_jsonl.write('\n')  # Aggiungi un nuovo carattere di nuova riga alla fine