In [2]:
import os
from pymongo import MongoClient
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from collections import Counter
import requests
import re
import openai
import json
import tiktoken 
import numpy as np
from collections import defaultdict
import chime




Load API Key and Connect to MongoDB

In [3]:
load_dotenv()
api_key  = os.getenv("API")
client = OpenAI(api_key = api_key)
client = MongoClient('mongodb://localhost:27017/')
db = client['New']
collection = db['New']

Initialize Base Prompt, Disorder List and System Settings for Fine Tuning

In [8]:
base_prompt = """
Classify a person's psychological disorders based on text and label using these labels : [ADHD,Autism,  OCD, PTSD, Eating Disorder, Bipolar , Schizophrenia]
. What would you choose, only 1 label should be returned as output, no explanation needed. Here's an example of how the output must look like 
Text : "@USER @USER Words that reinforce &amp; express a system of social &amp; economic oppression cause actual harm. It's just not immediate and visible, so privileged people like to say it's not the same. Courts recognize that emotional abuse is real; this is on a societal scale."
Labels : ADHD,Autism,  OCD, PTSD, Eating Disorder, Bipolar , Schizophrenia
This is the text : 
"""

system_setting = "You're a classifcation bot for psychological disorders, your task is to accurately predict the psychological disorder of a user based on text. "
disorder_list  = ["ADHD", "AUTISM", "BIPOLAR", "EATING DISORDER", "OCD", "PTSD", "SCHIZOPHRENIA"]

Function for creating the right json output

In [5]:
import json

def create_chat_output(*args):
    messages = []
    roles = ["system", "user", "assistant"]
    for i, content in enumerate(args):
        role = roles[i % 3]
        message = {"role": role, "content": content}
        messages.append(message)
    output = {"messages": messages}
    return output


In [6]:
create_chat_output("You are Siri a personal assistant", "Who are you?", "I am Siri")

{'messages': [{'role': 'system',
   'content': 'You are Siri a personal assistant'},
  {'role': 'user', 'content': 'Who are you?'},
  {'role': 'assistant', 'content': 'I am Siri'}]}

Function for querying the DB

In [12]:
def search_db(keyword, limit):
    pipeline = [
    {
        '$match': {
            'class': keyword  # Filter on the class
        }
    },
    {
        '$group': {
            '_id': '$user_id',  # Group by the user ID
            'tweet': {'$push': '$tweet'}  # Collect the tweets for each user
        }
    },
    {
        '$project': {
            'sampled_tweets': {'$slice': ['$tweet', 45]}  # Select 45 entries without replacement
        }
    },
    {
        '$limit': limit  # Limit the total number of entries to 3
    }
]
    result = db.New.aggregate(pipeline) # Contains all the grouped tweets by user, aggregation is already done in this part
    return result


Function for creating the correct JSON output needed for fine tuning:

PS : This will return the finetune_input.jsonl

PS : Running this may take a while, due to database querying.

In [13]:
import json

def fine_tuning_pipeline(disorder_list: list[str], limit: int, output_file: str):
    combined_json = []
    for disorder in disorder_list:
        resulter = search_db(disorder, limit)
        
        for document in resulter:
            json_obj = create_chat_output(system_setting, base_prompt + str(document["sampled_tweets"]), disorder)
            combined_json.append(json_obj)

    with open(output_file, 'w') as file:
        for obj in combined_json:
            json.dump(obj, file)
            file.write('\n')  # Add a newline to separate each JSON object

# Example usage
fine_tuning_pipeline(disorder_list, 5, 'finetune_input.jsonl')

Loading the dataset:

In [16]:
data_path = "finetune_input.jsonl"
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]
    print("Worked")
    chime.theme('material')
    chime.success()

Following we will do a quick unit test based on : 
https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [19]:
from collections import defaultdict

def check_format_errors(dataset):
    format_errors = defaultdict(int)

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name", "function_call") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("No errors found")

check_format_errors(dataset)


No errors found


In [21]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [22]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 746, 2479
mean / median: 1384.9714285714285, 1326.0
p5 / p95: 972.6, 1930.6000000000004

#### Distribution of num_assistant_tokens_per_example:
min / max: 2, 7
mean / median: 3.2857142857142856, 2.0
p5 / p95: 2.0, 7.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [23]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~48474 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~145422 tokens


This Creates a fine tuning Job in GPT:

In [74]:
from openai import OpenAI
client = OpenAI(api_key = api_key)

client.files.create(
  file=open("finetune_input.jsonl", "rb"),
  purpose="fine-tune"
)


client.fine_tuning.jobs.create(
  training_file="file-QfnGpIVcVvhu9AwALilahjix", 
  model="gpt-3.5-turbo"
)


FileObject(id='file-QfnGpIVcVvhu9AwALilahjix', bytes=207059, created_at=1702063468, filename='output.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
# Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve("ftjob-SQTBi5bvEp6uLTjwxpbS57pc")


Test the model : 

In [None]:
response = client.chat.completions.create(
  model="ft:gpt-3.5-turbo-0613:nlptn::8Tas44EK",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"}
  ]
)
print(response.choices[0].message)
