<a href="https://colab.research.google.com/github/youkiti/ARE/blob/main/2023_11_2_Azure_meta_prompt_development_serial_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so that files under
# the Drive folder are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare datasets

In [None]:
!pip install -q openai cohere tiktoken

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import openai
import json
import pandas as pd
import numpy as np
import os
import time
from tqdm import tqdm
import pandas as pd
from datetime import datetime
from sklearn.metrics import confusion_matrix



# Please apply your own settings in the following cell

*   If you don't know the path, click the folder button in the left sidebar.
image.png


*   Then open the `drive` folder and browse to the folder you want to use.

In [None]:
# Azure OpenAI credentials.
# Prefer setting the AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT environment
# variables so the key is never saved inside the notebook. The placeholder
# strings below are used only when those variables are not set.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "YOUR OWN AZURE OPENAI API KEY")
RESOURCE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "YOUR OWN ENDPOINT")

# Configure the legacy (openai < 1.0) module-level client for Azure.
openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = '2023-07-01-preview'

# Change these to match the deployment names in your own Azure OpenAI resource.
deployment_ids = ["gpt-35-turbo-0613","gpt-35-turbo-16k-0613","gpt4-0613","gpt-4-32k-0613","text-embedding-ada-002"]

# Folder in which the "experiments" output folder is created.
# Use an absolute path: the notebook changes directory into this folder and
# later builds output file paths from the same value again.
folderPath = "Path to save experiments"

# meta-prompt modification cut-offs: the prompt is rewritten while sensitivity or
# specificity is below its cut-off, or the error proportion is above its cut-off.
sensitivity_cutoff = 0.9
specificity_cutoff = 0.5
error_proportion_cutoff = 0.05

In [None]:
# Work from the user-configured output folder.
# NOTE: folderPath should be an absolute path, because run_experiment builds its
# output file paths from folderPath again after this chdir has taken effect.
os.chdir(folderPath)

# make a folder to save experiments (no error if it already exists)
os.makedirs("experiments", exist_ok=True)

# read Excel files with "reference standards" for development of meta-prompts
# (downloaded from the ARE GitHub repository; later cells read its "tiab" and
# "judgement" columns)
df_train100 = pd.read_excel('https://github.com/youkiti/ARE/raw/main/datasets/dataset1.xlsx',engine='openpyxl')

In [None]:
# Preview the first rows to confirm the dataset loaded.
df_train100.head()

In [None]:
# First (initial) meta-prompt.
# Adjacent string literals are concatenated as-is, so every fragment except the
# last ends with a space; otherwise sentences run together ("humans.You can ...").
prompt = (
    "You are a systematic reviewer. Please judge diagnostic test accuracy (DTA) studies from inputted abstracts. "
    "The definition of a DTA study is an original study that evaluated a test against a clinical reference standard for humans. "
    "You can accept multivariable diagnostic prediction model studies, but exclude prognostic prediction model studies, that measured predictors and outcomes at different time points. "
    "Exclude modeling studies, studies that assessed diagnostic training for medical professionals, and case series (e.g., studies without controls, such as following polymerase chain reaction results of specific patients). "
    "The abstract is as follows:"
)

In [None]:
# Processing of abstracts
def process_abstracts(abstract, prompt, model_name, temperature):
    """Classify a single abstract with an Azure OpenAI chat deployment.

    The model is required to answer through the ``dta_filter`` function so the
    reply always arrives as JSON arguments rather than free text.

    Args:
        abstract: Title/abstract text to classify; converted with ``str()``.
        prompt: Instruction text placed in front of the abstract.
        model_name: Azure deployment id, passed as ``deployment_id``.
        temperature: Sampling temperature for the completion.

    Returns:
        dict: The parsed arguments of the ``dta_filter`` function call. The
        schema asks the model for ``judgement`` and ``probability`` strings,
        but their presence is not validated here.

    Raises:
        ValueError: If the reply does not contain a function call.
        json.JSONDecodeError: If the function-call arguments are not valid JSON.
        Exception: Errors raised by the OpenAI client propagate unchanged.
    """
    function_name = "dta_filter"

    # Construct the prompt. A blank line separates the instruction from the
    # abstract so the two texts do not run together ("...follows:Title ...").
    question = prompt + "\n\n" + str(abstract)

    # Define functions for processing
    functions = [
        {
            "name": function_name,
            "description": "Classify diagnostic test accuracy abstracts",
            "parameters": {
                "type": "object",
                "properties": {
                    "judgement": {
                        "type": "string",
                        "description": "Determines if the abstract is a diagnostic test accuracy study abstract. Returns 'True' or 'False'."
                    },
                    "probability": {
                        "type": "string",
                        "description": "The probability (from 0 to 1) that the abstract is a diagnostic test accuracy study abstract."
                    }
                }
            }
        }
    ]

    # Call Azure API. Naming the function (instead of "auto") forces the model
    # to call it; with "auto" it may reply in plain text and no arguments exist.
    response = openai.ChatCompletion.create(
        deployment_id=model_name,
        messages=[{"role": "user", "content": question}],
        functions=functions,
        function_call={"name": function_name},
        temperature=temperature
    )

    # Fail with a descriptive error rather than an opaque KeyError when the
    # reply carries no function call.
    message = response["choices"][0]["message"]
    function_call = message.get("function_call")
    if not function_call:
        raise ValueError("Model reply did not contain a function call")

    # Parse the result in JSON format
    return json.loads(function_call["arguments"])

# Process DataFrame
def process_abstracts_to_dataframe(df_train, prompt, model_name, temperature):
    """Classify every abstract in ``df_train["tiab"]`` and collect the replies.

    Each abstract is sent through ``process_abstracts`` and retried up to three
    times when the call raises or the returned ``judgement`` is not the string
    "True" or "False". After the last failed attempt the most recent (possibly
    empty) response is stored and counted as an error.

    Args:
        df_train: DataFrame with a "tiab" column holding the abstracts.
        prompt: Instruction text forwarded to ``process_abstracts``.
        model_name: Azure deployment id forwarded to ``process_abstracts``.
        temperature: Sampling temperature forwarded to ``process_abstracts``.

    Returns:
        pandas.DataFrame: One column named
        ``<YYYY-MM-DD>_<HH-MM>_Azure__<model_name>_temp<temperature>`` whose
        cells are the response dicts (each with an added "index" key). The
        frame is indexed by the index labels of ``df_train`` so it can be
        concatenated column-wise onto ``df_train``.
    """
    MAX_RETRIES = 3  # Max retries, set to 3 for practical reasons
    reviews = []
    errors = 0

    abstracts = df_train["tiab"]
    # Iterate over the real index labels rather than positions: the result is
    # later joined onto df_train by index, so positions would misalign on any
    # frame without a default 0..n-1 index. `total` lets tqdm draw a real bar.
    for index, tiab in tqdm(abstracts.items(), total=len(abstracts)):
        response_json = {}  # Initialize response_json
        success = False
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                response_json = process_abstracts(abstract=tiab, prompt=prompt, model_name=model_name, temperature=temperature)
                # Confirm that the judgement value is True or False
                if response_json['judgement'] not in ["True", "False"]:
                    raise ValueError("Judgement value is not True or False")
                success = True
                break
            except Exception as e:
                # Deliberately broad: any API, parsing or validation failure is retried.
                print(f"Error occurred: {e}. Retrying {attempt}/{MAX_RETRIES}")
                time.sleep(1)  # Wait for 1 second

        if not success:
            print(f"Failed to process abstract after {MAX_RETRIES} retries.")
            errors += 1
            # A reply that parsed to something other than a JSON object cannot
            # carry the index key; record an empty response instead of crashing.
            if not isinstance(response_json, dict):
                response_json = {}

        # Add index information to response_json
        response_json['index'] = index
        reviews.append(response_json)

    print(f"Total errors occurred: {errors}")

    # Define column name based on current date and time.
    # NOTE: the name is only unique to the minute; two calls within the same
    # minute for the same model and temperature produce the same column name.
    datetime_str = datetime.now().strftime('%Y-%m-%d_%H-%M')
    column_name = datetime_str + "_Azure_" + "_" + model_name + "_temp" + str(temperature)

    # Pair each review with its index label, then build the indexed DataFrame
    data = [(review['index'], review) for review in reviews]
    df_reviews = pd.DataFrame(data, columns=['index', column_name]).set_index('index')

    return df_reviews

# Function to determine judgement value
def judgement_value(cell_value):
    """Turn a stored model response into a numeric label.

    Args:
        cell_value: A cell from a response column; expected to be a dict with
            a "judgement" entry.

    Returns:
        1 when the judgement is the string "True", 0 when it is the string
        "False", and NaN for anything else (non-dict cells, a missing key, or
        any other judgement value).
    """
    # Anything that is not a response dict counts as "no usable answer".
    if not isinstance(cell_value, dict):
        return np.nan

    verdict = cell_value.get('judgement')
    if verdict == 'True':
        return 1
    return 0 if verdict == 'False' else np.nan

# Function to run experiments
def run_experiment(df_train, first_prompt, model_name, folder_path=folderPath, max_iterations=10, temperature=0):
    """Iteratively evaluate and rewrite a meta-prompt until the cut-offs are met.

    Each iteration classifies every abstract with the current prompt, scores
    the result against ``df_train["judgement"]``, saves two Excel files, and,
    if any module-level cut-off (``sensitivity_cutoff``,
    ``specificity_cutoff``, ``error_proportion_cutoff``) is missed, asks the
    model to rewrite the prompt for the next iteration.

    Args:
        df_train: DataFrame with a "tiab" column (abstracts) and a "judgement"
            column (reference standard; assumed to be coded 0/1 - TODO confirm).
        first_prompt: Prompt used in the first iteration.
        model_name: Azure deployment id used both for classification and for
            rewriting the prompt.
        folder_path: Folder under which an "experiments" sub-folder receives
            the Excel outputs. Defaults to the configured ``folderPath``.
        max_iterations: Maximum number of evaluate/rewrite rounds.
        temperature: Sampling temperature used for classification.

    Returns:
        None. Progress is printed; metrics and per-abstract results are written
        to ``<folder_path>/experiments`` after every iteration.
    """
    print("model: " + model_name)
    datetime_str = datetime.now().strftime('%Y-%m-%d_%H-%M')
    num = df_train.shape[0]

    # Honour the folder_path argument (instead of the global folderPath) and
    # make sure the output folder exists before the first save.
    output_dir = os.path.join(folder_path, "experiments")
    os.makedirs(output_dir, exist_ok=True)
    file_stem = datetime_str + "_" + str(num) + "abstracts_serial_experiments" + model_name
    seido_path = os.path.join(output_dir, file_stem + "_azure_seido.xlsx")
    results_path = os.path.join(output_dir, file_stem + "_azure.xlsx")

    metric_rows = []  # one dict of metrics per completed iteration

    for iteration in range(max_iterations):
        print(f"iteration: {iteration+1}")

        kekka = process_abstracts_to_dataframe(df_train=df_train, prompt=first_prompt, model_name=model_name, temperature=temperature)

        # Result column names are only unique to the minute. If an iteration
        # finished within the same minute as an earlier one, disambiguate the
        # new column so the frame never holds duplicate column names.
        clashes = {col: f"{col}_iter{iteration + 1}" for col in kekka.columns if col in df_train.columns}
        if clashes:
            kekka = kekka.rename(columns=clashes)
        latest_column = kekka.columns[0]

        df_train = pd.concat([df_train, kekka], axis=1)

        # Response columns to convert into numeric "shori_" label columns.
        # columns[5:] skips the leading dataset columns (presumably the dataset
        # has five - TODO confirm); the newest column is always included so the
        # function also works on frames with fewer leading columns.
        response_columns = [
            col for col in df_train.columns[5:]
            if isinstance(col, str) and 'temp' in col and 'shori' not in col
        ]
        if latest_column not in response_columns:
            response_columns.append(latest_column)

        new_columns = {"shori_" + col: df_train[col].apply(judgement_value) for col in response_columns}
        df_train = df_train.assign(**new_columns)

        # Labels produced by this iteration
        rightmost_shori_column = "shori_" + latest_column
        labels = df_train[rightmost_shori_column]

        nan_count = labels.isna().sum()
        nanproportion = nan_count / df_train.shape[0]

        # Unanswered abstracts are counted as positive (included) predictions.
        predicted = labels.fillna(1).astype(int)

        # labels=[0, 1] keeps the matrix 2x2 even when one class never occurs.
        cm = confusion_matrix(df_train["judgement"], predicted, labels=[0, 1])
        print(cm)
        tn, fp, fn, tp = cm.ravel()

        # calculation of sensitivity and specificity (NaN when undefined)
        sensitivity = tp / (tp + fn) if (tp + fn) else np.nan
        specificity = tn / (tn + fp) if (tn + fp) else np.nan

        metric_rows.append({
            'model': datetime_str + "_" + str(df_train.shape[0]) + "_" + model_name + "temp_" + str(temperature),
            'prompt': first_prompt,
            'Sensitivity': sensitivity,
            'Specificity': specificity,
            'Proportion of NaN values': nanproportion,
            'TP': tp,
            'FN': fn,
            'FP': fp,
            'TN': tn
        })
        seido = pd.DataFrame(metric_rows)

        # save metrics and per-abstract results after each iteration
        seido.to_excel(seido_path, engine='openpyxl', index=False)
        df_train.to_excel(results_path, engine='openpyxl', index=False)

        # meta-prompt modification you can change your cut-offs
        if sensitivity < sensitivity_cutoff or specificity < specificity_cutoff or nanproportion > error_proportion_cutoff:
            print(f"Iteration {iteration + 1}: Sensitivity was {sensitivity}. Specificity was {specificity}. Error was {nanproportion}. Retrying...")

            # change meta-prompt; the targets quoted to the model follow the
            # configured cut-offs so the two can never disagree
            response = openai.ChatCompletion.create(
                deployment_id=model_name,
                messages=[
                    {"role": "system", "content": f"Please become my prompt engineer. Your goal is to help me create the best prompts for systematic review of diagnostic test accuracy. The prompts will be used by you, ChatGPT. Please rewrite inputted metaprompt to achieve sensitivity > {sensitivity_cutoff} and specificity > {specificity_cutoff} and error proportion < {error_proportion_cutoff}."},
                    {"role": "user", "content": f"The sensitivity was {sensitivity}. The specificity was {specificity}. The proportion of errors was {nanproportion}. The prompt was {first_prompt}."}
                ],
                temperature=0.6
            )
            rewritten_prompt = response.choices[0].message.content
            if rewritten_prompt:
                first_prompt = rewritten_prompt
            else:
                # An empty reply would break every classification call in the
                # next iteration, so keep the current prompt instead.
                print("Prompt rewrite returned no content. Keeping the previous prompt.")
            print(first_prompt)
            continue

        else:
            print(f"Iteration {iteration + 1}: Sensitivity was {sensitivity}. Specificity was {specificity}. Error was {nanproportion}. All cut-offs met. Stopping...")
            break
    else:
        print(f"Reached {max_iterations} iterations without meeting the cut-offs. Stopping...")

# Execute

In [None]:
# Run the serial meta-prompt experiment on the loaded dataset, starting from the
# first prompt and using the first deployment listed in deployment_ids.
run_experiment(
    df_train=df_train100,
    first_prompt=prompt,
    model_name=deployment_ids[0],
    max_iterations=10,
    temperature=0,
)

Please check your results in the `experiments` folder inside your own Google Drive folder.