# Data analysis

In [7]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [8]:
data_path = "../data/final_preprocessed_data/discriminator/finetune_train_data_gpt35.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 11884
First example:
{'role': 'user', 'content': 'The following is a diff which describes the code changes in a commit, Your task is to write a short commit message accordingly.\ndiff --git a/vulnerabilities/api.py b/vulnerabilities/api.py @@ -146,7 +146,7 @@ class PackageViewSet(viewsets.ReadOnlyModelViewSet):\n"Error": "A non-empty \'purls\' list of package URLs is required."\n},\n)\n- for purl in request.data.get("purls"):\n+ for purl in request.data["purls"]:\ntry:\npurl = PackageURL.from_string(purl).to_dict()\nexcept ValueError as ve:\n\nAccording to the diff, the commit message should be:'}
{'role': 'assistant', 'content': 'Remove redundant "get" in bulk api code'}


In [9]:
## write to jsonl file for previous 3000 examples
with open("../data/final_preprocessed_data/discriminator/finetune_train_data_gpt35_3000.jsonl", "w") as f:
    for item in dataset[:3000]:
        f.write(json.dumps(item) + "\n")

In [19]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [20]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [22]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 512 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 512 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 11884
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 2, 2
mean / median: 2.0, 2.0
p5 / p95: 2.0, 2.0

#### Distribution of num_total_tokens_per_example:
min / max: 83, 366
mean / median: 163.01624032312353, 161.0
p5 / p95: 135.0, 191.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 2, 22
mean / median: 6.749326825984517, 7.0
p5 / p95: 3.0, 10.0

0 examples may be over the 512 token limit, they will be truncated during fine-tuning


In [23]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 512

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~1937285 tokens that will be charged for during training
By default, you'll train for 2 epochs on this dataset
By default, you'll be charged for ~3874570 tokens


# Finetune

In [13]:
from openai import OpenAI
client = OpenAI(
    api_key="sk-proj-c5dK0p6JtZQTzVdJJxuXT3BlbkFJ9bYsPPvOUbtNqcKJ5R7y",
)

In [10]:
client.files.create(
  file=open("../data/final_preprocessed_data/discriminator/finetune_train_data_gpt35_3000.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-QdsYR7EoyvNIi6wtqm2qrjJZ', bytes=2091055, created_at=1717794459, filename='finetune_train_data_gpt35_3000.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [14]:
client.files.list()

SyncPage[FileObject](data=[FileObject(id='file-QdsYR7EoyvNIi6wtqm2qrjJZ', bytes=2091055, created_at=1717794459, filename='finetune_train_data_gpt35_3000.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-qkCoJIduAkUnAvGRoPJMBrjN', bytes=8280854, created_at=1717794139, filename='finetune_train_data_gpt35.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)], object='list', has_more=False)

In [21]:
client.fine_tuning.jobs.create(
  training_file="file-QdsYR7EoyvNIi6wtqm2qrjJZ", 
  model="gpt-3.5-turbo"
)

FineTuningJob(id='ftjob-9Sepnav1KQbZ0mGQWKWz8ZWI', created_at=1717795169, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-vNGdTdFPwXPkFJG0nWlHKFPf', result_files=[], status='validating_files', trained_tokens=None, training_file='file-QdsYR7EoyvNIi6wtqm2qrjJZ', validation_file=None, user_provided_suffix=None, seed=1338143889, estimated_finish=None, integrations=[])

In [22]:
client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-9Sepnav1KQbZ0mGQWKWz8ZWI', created_at=1717795169, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-vNGdTdFPwXPkFJG0nWlHKFPf', result_files=[], status='running', trained_tokens=None, training_file='file-QdsYR7EoyvNIi6wtqm2qrjJZ', validation_file=None, user_provided_suffix=None, seed=1338143889, estimated_finish=None, integrations=[]), FineTuningJob(id='ftjob-s9NJoTFCh36p9v9CA450r9Si', created_at=1717794826, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-vNGdTdFPwXPkFJG0nWlHKFPf', result_files=[], status='cancelled', trained_t

In [23]:
client.fine_tuning.jobs.retrieve("ftjob-9Sepnav1KQbZ0mGQWKWz8ZWI")

FineTuningJob(id='ftjob-9Sepnav1KQbZ0mGQWKWz8ZWI', created_at=1717795169, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-vNGdTdFPwXPkFJG0nWlHKFPf', result_files=[], status='running', trained_tokens=None, training_file='file-QdsYR7EoyvNIi6wtqm2qrjJZ', validation_file=None, user_provided_suffix=None, seed=1338143889, estimated_finish=None, integrations=[])

In [19]:
client.fine_tuning.jobs.cancel("ftjob-s9NJoTFCh36p9v9CA450r9Si")

FineTuningJob(id='ftjob-s9NJoTFCh36p9v9CA450r9Si', created_at=1717794826, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-vNGdTdFPwXPkFJG0nWlHKFPf', result_files=[], status='cancelled', trained_tokens=None, training_file='file-QdsYR7EoyvNIi6wtqm2qrjJZ', validation_file=None, user_provided_suffix=None, seed=271616465, estimated_finish=None, integrations=[])

In [24]:
client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-9Sepnav1KQbZ0mGQWKWz8ZWI', created_at=1717795169, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::9Xc170ni', finished_at=1717797987, hyperparameters=Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-vNGdTdFPwXPkFJG0nWlHKFPf', result_files=['file-Xo01iQv5JWJfDOeHeR6f0oMt'], status='succeeded', trained_tokens=1450593, training_file='file-QdsYR7EoyvNIi6wtqm2qrjJZ', validation_file=None, user_provided_suffix=None, seed=1338143889, estimated_finish=None, integrations=[]), FineTuningJob(id='ftjob-s9NJoTFCh36p9v9CA450r9Si', created_at=1717794826, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_i

In [25]:
from tqdm import tqdm
def gpt_35_api(messages: list):

    # completion = client.chat.completions.create(model="gpt-3.5-turbo-0125", messages=messages, temperature=0.5)
    completion = client.chat.completions.create(model="ft:gpt-3.5-turbo-0125:personal::9Xc170ni", messages=messages, temperature=0)
    return completion.choices[0].message.content
from langchain import hub
prompt = hub.pull("tyfann/llm4commit-zeroshot")
with open('../data/final_preprocessed_data/discriminator/dis_eval_data.json', 'r', encoding='UTF-8') as f:
    org_data = json.load(f)

org_data = org_data[:1000]

gpt_msg = []
for index, data in tqdm(enumerate(org_data), total=len(org_data), desc="Processing documents"):
    # merged_diff = '\n'.join(diff['diff'] for diff in data['diff'])
    messages = prompt.invoke(
        {"DIFF": data['diff']}
    ).to_messages()
    example_prompt = [{'role': 'user','content': messages[0].content},]
    try:
        gpt_msg.append(gpt_35_api(example_prompt))
    except:
        print(index)
        gpt_msg.append("")

Processing documents: 100%|██████████| 1000/1000 [08:20<00:00,  2.00it/s]


In [26]:

for item, msg in zip(org_data, gpt_msg):
    item['chatgpt_finetune'] = msg
import os
# output_file = '../data/chronicle/rag_baseline/zeroshot/rag_baseline_python_chatgpt.json'
output_file = '../data/final_preprocessed_data/discriminator/dis_eval_data_chatgpt_finetune.json'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='UTF-8') as f:
    json.dump(org_data, f, ensure_ascii=False, indent=4)

## Old version finetune

In [11]:
import openai
import json

openai.api_key = "sk-0rLvuRkMiD4Mw25QYygh6rUlZVjpQWNGNF4yez7z3PZ7yCOm"
openai.base_url = "https://api.chatanywhere.tech/v1"

model_engine = "gpt-3.5-turbo-ca"
n_epochs = 3
batch_size = 4
learning_rate = 1e-5
max_tokens = 1024

training_file = "../data/final_preprocessed_data/discriminator/finetune_train_data.jsonl"
validation_file = "../data/final_preprocessed_data/discriminator/finetune_eval_data.jsonl"

In [12]:
import os

# Create the fine-tuning job
fine_tuning_job = openai.FineTune.create(
    model_engine=model_engine,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    max_tokens=max_tokens,
    training_file=os.path.abspath(training_file),
    validation_file=os.path.abspath(validation_file),
)

job_id = fine_tuning_job["id"]
print(f"Fine-tuning job created with ID: {job_id}")

APIRemovedInV1: 

You tried to access openai.FineTune, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


In [None]:
import time

while True:
    fine_tuning_status = openai.FineTune.get_status(job_id)
    status = fine_tuning_status["status"]
    print(f"Fine-tuning job status: {status}")

    if status in ["completed", "failed"]:
        break

    time.sleep(60)

In [None]:
fine_tuned_model_id = fine_tuning_status["fine_tuned_model_id"]

# Use the fine-tuned model for text generation
def generate_text(prompt, model_id, max_tokens=1024):
    response = openai.Completion.create(
        engine=model_id,
        prompt=prompt,
        max_tokens=max_tokens,
        n=1,
        stop=None,
        temperature=0.5,
    )
    return response.choices[0].text.strip()

In [None]:
test_file = "../data/final_preprocessed_data/discriminator/finetune_test_data.jsonl"
test_data = []
with open(test_file, 'r', encoding='UTF-8') as file:
    for line in file:
        test_data.append(json.loads(line.strip()))

for test_item in test_data:
    test_item['chatgpt_finetune'] = generate_text(test_item['prompt'], fine_tuned_model_id)

output_file_path = '../data/final_preprocessed_data/discriminator/finetune_test_data_chatgpt.jsonl'
# Open the output JSONL file for writing
with open(output_file_path, 'w') as output_file:
    for item in test_data:
        output_file.write(json.dumps(item) + '\n')