In [1]:
!pip install -q openai tqdm gradio

[0m

In [2]:
from tqdm import tqdm

In [5]:
import os
import json
import pandas as pd

# Identifier of the database in use. Every dataset path below is derived from
# it, so switching databases only requires changing this one value.
DB_ID = 'mimic_iv'

# Directory paths for database, results and scoring program
BASE_DATA_DIR = os.path.join('data', DB_ID)
RESULT_DIR = 'sample_result_submission/'
SCORE_PROGRAM_DIR = 'scoring_program/'

# File paths for the dataset and labels
TABLES_PATH = os.path.join(BASE_DATA_DIR, 'tables.json')               # JSON containing database schema
TRAIN_DATA_PATH = os.path.join(BASE_DATA_DIR, 'train', 'data.json')    # JSON file with natural language questions for training data
TRAIN_LABEL_PATH = os.path.join(BASE_DATA_DIR, 'train', 'label.json')  # JSON file with corresponding SQL queries for training data
TRAIN_ANS_PATH = os.path.join(BASE_DATA_DIR, 'train', 'answer.json')   # JSON file with corresponding answers for training data
VALID_DATA_PATH = os.path.join(BASE_DATA_DIR, 'valid', 'data.json')    # JSON file for validation data
DB_PATH = os.path.join(BASE_DATA_DIR, f'{DB_ID}.sqlite')               # Database path

In [6]:
from utils.data_io import read_json as read_data
# Read, in this order, the training questions, the training labels and the
# validation questions with `read_data` (the alias imported for
# `utils.data_io.read_json`).
train_data, train_label, valid_data = [
    read_data(json_path)
    for json_path in (TRAIN_DATA_PATH, TRAIN_LABEL_PATH, VALID_DATA_PATH)
]

In [7]:
from tqdm import tqdm

# System prompt placed at the start of every fine-tuning example.
system_msg = "You are 'SQLgpt', an advanced AI designed to convert user questions into accurate SQL queries. Your goal is to generate SQL queries that accurately represent the user's intent while strictly adhering to standard SQL format guidelines. Remember, generated SQL queries must not contain quotes, neither single (' ') nor double (\"). The integrity and accuracy of your responses are critical. In situations of uncertainty, insufficient context, or when a query might exceed the data available, opting for 'null' instead of generating a potentially incorrect SQL is imperative. Your judgment is crucial in preventing errors and ensuring accuracy. Always choose 'null' in doubtful situations to avoid generating inaccurate SQL. Additionally, be aware of the database schema and ensure your SQL queries do not go beyond the schema's scope or generate incorrect SQL based on the schema's limitations."

# Train using only answerable data: build one chat-format example
# (system prompt, user question, assistant SQL) per labelled question.
messages = []
for item in tqdm(train_data['data']):
    question = item['question']
    sql = train_label.get(item['id'])
    # Skip questions labelled with the string 'null' (unanswerable) and questions
    # with no label at all: `.get` returns None for those, which would otherwise
    # be emitted as an assistant message whose content is null.
    if sql is None or sql == 'null':
        continue
    messages.append({
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": question},
            {"role": "assistant", "content": sql}
        ]
    })

# Number of examples kept after filtering.
print(len(messages))


100%|███████████████████████████████████| 5124/5124 [00:00<00:00, 532128.69it/s]

4674





In [9]:
# Write the collected examples to disk as JSON Lines: one JSON object per line,
# UTF-8 encoded, with non-ASCII characters kept as-is rather than escaped.
import json
with open("messages.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(example, ensure_ascii=False) + '\n'
        for example in messages
    )

In [8]:
!pip install -U openai

Collecting openai
  Downloading openai-1.14.3-py3-none-any.whl.metadata (20 kB)
Downloading openai-1.14.3-py3-none-any.whl (262 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.14.2
    Uninstalling openai-1.14.2:
      Successfully uninstalled openai-1.14.2
Successfully installed openai-1.14.3


In [22]:
import os
import openai
from openai import OpenAI

# Take the API key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook; prompt for it (without echo) when it is not set.
from getpass import getpass

api = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(api_key=api)

# Upload the JSONL training file for fine-tuning. The `with` block closes the
# file handle once the upload call has returned.
with open("messages.jsonl", "rb") as training_file:
    fileobject = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )


In [24]:
# Display the ID of the uploaded file object returned by `client.files.create`.
fileobject.id

'file-TYIKAba68W7Mek001qh09RHZ'

In [13]:
# Start a fine-tuning job on the uploaded training file.
# The returned job object is kept in `finetune_job` so its ID and status can be
# read programmatically (e.g. `finetune_job.id`) instead of being copied from
# the printed output.
finetune_job = client.fine_tuning.jobs.create(
    training_file=fileobject.id,
    model="gpt-3.5-turbo-0125"
)

# Last expression of the cell: shows the job object, as before.
finetune_job

FineTuningJob(id='ftjob-AEHSAtdseYR77IypPNyPhYNh', created_at=1711258699, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-Kkl8Op0iN1TFvRxmHoLA5USU', result_files=[], status='validating_files', trained_tokens=None, training_file='file-bIoOr63UoWCrfzQ4u2nxiizi', validation_file=None, user_provided_suffix=None)

In [2]:
# ID of the fine-tuning job to inspect.
FINETUNE_JOB_ID = "ftjob-AEHSAtdseYR77IypPNyPhYNh"

# Summarise the first page (up to 10) of fine-tuning jobs. Only the last
# expression of a cell is displayed, so the listing is printed explicitly;
# otherwise the request would be made and its result thrown away.
for recent_job in client.fine_tuning.jobs.list(limit=10).data:
    print(recent_job.id, recent_job.status)

# Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve(FINETUNE_JOB_ID)

FineTuningJob(id='ftjob-AEHSAtdseYR77IypPNyPhYNh', created_at=1711258699, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::96BuBjlL', finished_at=1711262758, hyperparameters=Hyperparameters(n_epochs=3, batch_size=9, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-Kkl8Op0iN1TFvRxmHoLA5USU', result_files=['file-p7MFAu7ZOdqnYKsUIx2OQCJk'], status='succeeded', trained_tokens=2908665, training_file='file-bIoOr63UoWCrfzQ4u2nxiizi', validation_file=None, user_provided_suffix=None)