In [12]:
import io
import os

import datasets
import evaluate
import pandas as pd
from openai import AzureOpenAI
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import ParameterGrid

# Azure OpenAI client; the API key is read from the OPENAI_API_KEY environment
# variable, so it must be set before this cell runs (KeyError otherwise).
client = AzureOpenAI(
    api_version="2023-12-01-preview",  # highest version number as of 1/12/23
    api_key=os.environ["OPENAI_API_KEY"],
    azure_endpoint='https://oai-cbipm-01.openai.azure.com/',
)

# Deployment name used as the `model` argument of chat-completion requests.
deployment = "Deployment"

In [13]:
# metrics function
def compute_metrics(predictions, references, labels=None, pos_label=1, average="weighted", sample_weight=None, zero_division='warn'):
    """Print a classification report and return F1, precision and recall.

    Every keyword argument is forwarded to the scikit-learn metric functions,
    so the three returned scores and the printed report are all computed with
    the same settings.

    Args:
        predictions: Predicted labels.
        references: Ground-truth labels, in the same order as ``predictions``.
        labels: Optional set of labels to include, forwarded as ``labels``.
        pos_label: Forwarded as ``pos_label`` to the three score functions.
        average: Averaging mode forwarded as ``average``; ``None`` yields
            per-class scores.
        sample_weight: Optional per-sample weights.
        zero_division: Forwarded as ``zero_division`` to every metric,
            including F1 and the printed report.

    Returns:
        dict: ``{"f1": ..., "precision": ..., "recall": ...}``. Each value is
        a Python ``float`` when the score is a single number, otherwise the
        object returned by scikit-learn (e.g. a per-class array).
    """
    score_kwargs = dict(
        labels=labels,
        pos_label=pos_label,
        average=average,
        sample_weight=sample_weight,
        zero_division=zero_division,
    )
    f1 = f1_score(references, predictions, **score_kwargs)
    p = precision_score(references, predictions, **score_kwargs)
    r = recall_score(references, predictions, **score_kwargs)

    # The report takes no pos_label/average, but it must share the weighting
    # and zero-division policy or it can disagree with the numbers above.
    c = classification_report(
        references, predictions, labels=labels, sample_weight=sample_weight, zero_division=zero_division
    )
    print(c)
    return {"f1": _scalar_or_array(f1),
            "precision": _scalar_or_array(p),
            "recall": _scalar_or_array(r)}


def _scalar_or_array(score):
    """Return ``score`` as a Python float if it holds one value, else unchanged."""
    # getattr keeps this working when the metric is already a plain Python
    # float (no ``.size`` attribute), which newer scikit-learn releases may return.
    return float(score) if getattr(score, "size", 1) == 1 else score

In [14]:
# Few-shot sentiment classification: the training rows serve as in-context
# examples and the test rows are the sentences the model is asked to label.
train = pd.read_csv("../data/train_MD_labels.csv")
test = pd.read_csv("../data/test_PT_MD_labels.csv")

# Series.to_json() yields a {"<row index>": <value>} object, so the sentences
# and their labels share the same keys in the prompt.
json_train_sentences = train["language"].to_json()
json_train_labels = train["MD_label"].to_json()

# converting test sentences to json format
json_test_sentences = test["language"].to_json()

# Conversation layout: instructions + example sentences (user), the example
# labels as if the model had answered (assistant), then the real request (user).
messages = [{"role": "system", "content": "You are a medical doctor."},
            {"role": "user", "content": "As a medical doctor, you write many clinical notes about patients.\n"
                                        "Your task is to analyze the sentiment of a series of sentences you wrote about patients.\n"
                                        "For each sentence, what is your attitude towards the patient you wrote about?\n"
                                        "Please assign a sentiment score of negative, neutral, or positive for each sentence.\n"
                                        "Below are three example sentences in JSON format:\n"
                                        f"{json_train_sentences}"
                                        "Please provide your answer in JSON format."},
            {"role": "assistant", "content": f"{json_train_labels}"},
            {"role": "user", "content": f"Please complete the same task with each of these sentences:\n{json_test_sentences}"}]

# temperature=0 and a fixed seed are requested to keep runs as repeatable as possible.
response = client.chat.completions.create(model=deployment, messages=messages, temperature=0, seed=42)

output = response.choices[0].message.content

# Passing a literal JSON string to pd.read_json is deprecated (it raises a
# FutureWarning), so hand it a file-like buffer instead. With orient="index"
# the labels land in a single column named 0, indexed by the JSON keys.
predictions = pd.read_json(io.StringIO(output), orient="index")

# The metrics compare predictions and references by position, so fail early
# with a readable message if the model skipped or added sentences.
if len(predictions) != len(test):
    raise ValueError(
        f"Model returned {len(predictions)} predictions for {len(test)} test sentences."
    )

results = compute_metrics(predictions[0], test["MD_label"])
print(results)

# Keep only the misclassified rows, with predicted and true labels side by side.
error_analysis = pd.concat([test[["idx", "language", "PT_MD_labels", "MD_label"]], predictions], axis=1)
error_analysis = error_analysis.rename(columns={0:"pred", "MD_label":"true"})
mask = error_analysis["pred"] == error_analysis["true"]
error_analysis = error_analysis[~ mask]
error_analysis.to_csv('error_analysis.csv', index=False)

              precision    recall  f1-score   support

    negative       0.60      0.82      0.69        11
     neutral       0.71      0.45      0.56        11
    positive       1.00      1.00      1.00         5

    accuracy                           0.70        27
   macro avg       0.77      0.76      0.75        27
weighted avg       0.72      0.70      0.69        27

{'f1': 0.693573915796138, 'precision': 0.7206349206349206, 'recall': 0.7037037037037037}


  predictions = pd.read_json(output, orient="index")


In [11]:
# The best metrics so far come from the prompt with 2 neutral sentences (idx 0 and 6).
#               precision    recall  f1-score   support
#
#     negative       0.60      0.82      0.69        11
#      neutral       0.71      0.45      0.56        11
#     positive       1.00      1.00      1.00         5
#
#     accuracy                           0.70        27
#    macro avg       0.77      0.76      0.75        27
# weighted avg       0.72      0.70      0.69        27
#
# {'f1': 0.693573915796138, 'precision': 0.7206349206349206, 'recall': 0.7037037037037037}