In [None]:
! pip install openai json requests os tiktoken time
from tqdm import tqdm
import pandas as pd
import json

In [None]:
### IMPORT DATA
# Input CSVs are read from the current working directory.
LABELS_CSV = "train_drug_label_text_remove_unnecessary_info.csv"
MANUAL_ADES_CSV = 'train_drug_label_text_manual_ades.csv'
SUBSECTION_RESULTS_CSV = "subsection_results.csv"

labels = pd.read_csv(LABELS_CSV)                          # provides drug_name / section_text
manual_ades = pd.read_csv(MANUAL_ADES_CSV)                # provides drug_name / reaction_string
subsection_results = pd.read_csv(SUBSECTION_RESULTS_CSV)  # provides drug_name / recall

In [None]:
# 10 drugs with lowest recall
# sort_values returns a new frame, so re-running this cell gives the same result.
subsection_results = subsection_results.sort_values('recall')
bottom = subsection_results[:10].drug_name.to_list()

# For every low-recall drug, collect its unique, lower-cased manual ADE terms.
bottom_ades = []
for drug in tqdm(bottom):
    drug_df = manual_ades[manual_ades['drug_name'] == drug]
    # dropna: a missing/non-string reaction becomes NaN after .str.lower() and
    # would break the later ' '.join of the terms.
    # sorted: set iteration order for strings varies between kernel sessions,
    # which made the generated training file non-reproducible.
    manual = sorted(set(drug_df['reaction_string'].str.lower().dropna()))
    bottom_ades.append(manual)
bottom

In [None]:
# Show the evaluation rows of the 10 lowest-recall drugs, ordered by name.
# `results` is not defined anywhere in this notebook, so the original line
# raised NameError on a fresh kernel.
# NOTE(review): presumably `subsection_results` was intended - confirm.
subsection_results[subsection_results['drug_name'].isin(bottom)].sort_values('drug_name')

In [None]:
# Create dataframe of 10 drugs with lowest recall
# One row per drug: its name and the list of manual ADE terms at the same position.
finetune = pd.DataFrame(
    [[drug, bottom_ades[position]] for position, drug in enumerate(bottom)],
    columns=["drug_name", "ades"],
)
finetune.head(1)

In [None]:
# Function to concatenate section_text for specific drugs

def concatenate_texts(df, drug_list):
    """Concatenate the section texts of the selected drugs into one string per drug.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'drug_name' and 'section_text'.
    drug_list : list-like
        Drug names to keep; rows for any other drug are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'drug_name' and 'section_text', one row per selected drug that
        occurs in ``df``. The texts of a drug are joined with a single space in
        their original row order.
    """
    # Filter the DataFrame for the drugs in the list
    filtered_df = df[df['drug_name'].isin(drug_list)]

    # Group by drug_name and concatenate section_text.
    # Missing sections are skipped and remaining values are coerced to str,
    # because str.join raises TypeError on NaN / non-string items.
    concatenated_df = (
        filtered_df.groupby('drug_name')['section_text']
        .apply(lambda texts: ' '.join(texts.dropna().astype(str)))
        .reset_index()
    )

    return concatenated_df

# Build one concatenated label text per low-recall drug and preview the first row
result_df = concatenate_texts(df=labels, drug_list=bottom)
result_df.head(1)

In [None]:
def _first_value_as_text(frame, drug, column, separator):
    """Return the first `column` value of `drug` in `frame`, or "" when the drug is absent."""
    # Filter once and reuse the result for both the emptiness check and the lookup.
    matches = frame.loc[frame['drug_name'] == drug, column]
    value = matches.iloc[0] if not matches.empty else ""
    # List values (e.g. a list of ADE terms) are flattened into a single string.
    if isinstance(value, list):
        value = separator.join(value)
    return value


def generate_json(labels, result_df, finetune,
                  system_prompt="You are an expert in pharmacology.", separator=' '):
    """Build chat-format fine-tuning examples, one per drug.

    Parameters
    ----------
    labels : iterable
        Drug names to generate examples for (matched against 'drug_name').
    result_df : pandas.DataFrame
        Columns 'drug_name' and 'section_text'; supplies the user message.
    finetune : pandas.DataFrame
        Columns 'drug_name' and 'ades'; supplies the assistant message.
    system_prompt : str, optional
        Content of the system message; defaults to the prompt used so far.
    separator : str, optional
        String used to join list-valued contents. The default single space
        keeps the previous output format; note that it cannot be told apart
        from the spaces inside multi-word terms, so pass e.g. ', ' when term
        boundaries matter.

    Returns
    -------
    list of dict
        Each item is ``{"messages": [system, user, assistant]}``. A drug that
        is missing from a frame gets an empty string for that message.
    """
    lst = []
    for drug in labels:
        user_content = _first_value_as_text(result_df, drug, 'section_text', separator)
        assistant_content = _first_value_as_text(finetune, drug, 'ades', separator)

        lst.append({"messages": [{"role": "system", "content": system_prompt},
                                 {"role": "user", "content": user_content},
                                 {"role": "assistant", "content": assistant_content}]})
    return lst

# Generate the list of dictionaries
json_list = generate_json(bottom, result_df, finetune)

# Output to a JSONL file: one serialized example per line
with open('finetuned_bottom_data.jsonl', 'w') as file:
    file.writelines(json.dumps(entry) + '\n' for entry in json_list)


In [None]:
# Load the training set
with open('finetuned_bottom_data.jsonl', 'r', encoding='utf-8') as f:
    finetuned_data = [json.loads(line) for line in f]

# Training dataset stats
print("Number of examples in finetuned set:", len(finetuned_data))
print("First example in finetuned set:")
# Index 0 so the output matches the "First example" label; the previous
# index 5 showed the sixth example and raised IndexError for smaller files.
if finetuned_data:
    for message in finetuned_data[0]["messages"]:
        print(message)

In [None]:
# ! pip install git+https://github.com/openai/whisper.git

In [None]:
# Check number of tokens 

import json
import tiktoken
import numpy as np
from collections import defaultdict

# Tokenizer used by the token-counting helpers in this cell.
encoding = tiktoken.get_encoding("cl100k_base") # default encoding used by gpt-4, turbo, and text-embedding-ada-002 models

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Count the tokens of a whole conversation.

    Every message contributes `tokens_per_message` plus the encoded length of
    each of its values (encoded with the module-level `encoding`); a message
    that has a "name" key contributes `tokens_per_name` on top. A constant 3 is
    added once per conversation.

    Parameters
    ----------
    messages : iterable of dict
        Chat messages whose values are strings.
    tokens_per_message : int, optional
        Fixed overhead counted for each message.
    tokens_per_name : int, optional
        Extra overhead counted for a message carrying a "name" key.

    Returns
    -------
    int
        Total token count.
    """
    total = 3  # constant per-conversation allowance
    for msg in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in msg.values())
        if "name" in msg:
            total += tokens_per_name
    return total

def num_assistant_tokens_from_messages(messages):
    """Return the summed encoded length of the content of all assistant messages.

    Messages with any other role are ignored. Every message must have a
    "role" key; the content is encoded with the module-level `encoding`.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics (min/max, mean/median, 5th/95th percentile) of `values`.

    Parameters
    ----------
    values : sequence of numbers
        The observations to summarise.
    name : str
        Label printed in the header line.
    """
    print(f"\n#### Distribution of {name}:")
    # min()/max() raise ValueError on an empty sequence, so report and stop.
    if len(values) == 0:
        print("no values")
        return
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The line is labelled p5 / p95, so use the 0.05 and 0.95 quantiles
    # (the previous 0.1 / 0.9 printed the 10th / 90th percentile instead).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

files = ['finetuned_bottom_data.jsonl']

# For every JSONL file, report the token distribution of whole conversations
# and of the assistant replies alone.
for file in files:
    print(f"Processing file: {file}")
    with open(file, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    # An example without a "messages" key counts as an empty conversation.
    conversations = [example.get("messages", {}) for example in dataset]
    total_tokens = [num_tokens_from_messages(conv) for conv in conversations]
    assistant_tokens = [num_assistant_tokens_from_messages(conv) for conv in conversations]

    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)

In [None]:
# Upload fine-tuning files
import os
from openai import AzureOpenAI

# Credentials are read from the environment so that no secret is stored in
# the notebook. AZURE_OPENAI_API_KEY must be set (KeyError otherwise);
# AZURE_OPENAI_ENDPOINT is optional and falls back to the resource this
# notebook was written against.
# NOTE(review): a key-like literal used to sit in a comment in this cell; if
# it was a real key it should be rotated.
API_KEY = os.environ["AZURE_OPENAI_API_KEY"]
AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "https://onsides-gpt-finetuning.openai.azure.com/")

client = AzureOpenAI(
  azure_endpoint=AZURE_ENDPOINT,
  api_key=API_KEY,
  api_version="2023-12-01-preview"  # This API version or later is required to access fine-tuning for turbo/babbage-002/davinci-002
)

training_file_name = 'finetuned_bottom_data.jsonl'

# Upload the training dataset file to Azure OpenAI with the SDK.
# The context manager closes the file handle once the upload call returns.
with open(training_file_name, "rb") as training_file:
    training_response = client.files.create(
        file=training_file, purpose="fine-tune"
    )
training_file_id = training_response.id

print("Training file ID:", training_file_id)

In [None]:
# Initiate fine-tune

response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model="gpt-35-turbo-0613", # Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters. 
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
# Print the job status here; the previous version printed the job id a second time.
print("Status:", response.status)
print(response.model_dump_json(indent=2))

In [None]:
# Track training status

from IPython.display import clear_output
import time

# Statuses after which the job no longer changes. "cancelled" is included
# because a cancelled job never becomes "succeeded" or "failed", so the loop
# below would otherwise poll forever.
TERMINAL_STATUSES = ["succeeded", "failed", "cancelled"]
POLL_INTERVAL_SECONDS = 10

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status = response.status

# If the job isn't done yet, poll it every POLL_INTERVAL_SECONDS seconds.
while status not in TERMINAL_STATUSES:
    time.sleep(POLL_INTERVAL_SECONDS)

    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response.model_dump_json(indent=2))
    elapsed = time.time() - start_time  # measured once so minutes and seconds agree
    print("Elapsed time: {} minutes {} seconds".format(int(elapsed // 60), int(elapsed % 60)))
    status = response.status
    print(f'Status: {status}')
    # wait=True defers clearing until the next output arrives, which avoids flicker.
    clear_output(wait=True)

print(f'Fine-tuning job {job_id} finished with status: {status}')

# List all fine-tuning jobs for this resource.
print('Checking other fine-tune jobs for this resource.')
response = client.fine_tuning.jobs.list()
print(f'Found {len(response.data)} fine-tune jobs.')

In [None]:
# Retrieve fine_tuned_model name
# Fetch the job again and keep the name of the model it produced.
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model = response.fine_tuned_model

print(response.model_dump_json(indent=2))

In [None]:
import os
from openai import AzureOpenAI

# AZURE_ENDPOINT and API_KEY were referenced here without being defined in
# this notebook. Read them from the environment so the cell works on a fresh
# kernel and no secret is stored in the notebook.
API_KEY = os.environ["AZURE_OPENAI_API_KEY"]
AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "https://onsides-gpt-finetuning.openai.azure.com/")

# Separate client for inference; it uses a different API version than the
# fine-tuning client.
client = AzureOpenAI(
  azure_endpoint=AZURE_ENDPOINT,
  api_key=API_KEY,
  api_version="2023-05-15"
)

response = client.chat.completions.create(
    model="gpt-35-finetuned-bottomlabels", # model = "Custom deployment name you chose for your fine-tuning model"
    messages=[
        {"role": "system", "content": "You are an expert in pharmacology."},
        {"role": "user", "content": "What are the adverse drug events for the drug CHOLINE"}
    ]
)

print(response.choices[0].message.content)