In [1]:
%env OPENAI_API_KEY=#setkeyhere
import os
from io import StringIO

import datasets
import evaluate
import pandas as pd
from openai import AzureOpenAI
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import ParameterGrid

# Azure OpenAI client shared by every cell below; the API key is taken from
# the OPENAI_API_KEY environment variable.
client = AzureOpenAI(
    azure_endpoint='https://oai-cbipm-01.openai.azure.com/',
    api_key=os.environ["OPENAI_API_KEY"],
    api_version="2023-12-01-preview",  # 2023-12-01-preview <- highest version number as of 1/12/23
)

# Value passed as `model=` to every chat completion request.
deployment = "Deployment"




In [2]:
# metrics function
def compute_metrics(predictions, references, labels=None, pos_label=1, average="weighted", sample_weight=None, zero_division='warn'):
    """Print a per-class classification report and return F1, precision and recall.

    Args:
        predictions: Predicted labels, one per sample.
        references: Ground-truth labels, in the same order as ``predictions``.
        labels: Optional label subset/order, forwarded to scikit-learn.
        pos_label: Forwarded unchanged to the scikit-learn score functions.
        average: Averaging mode forwarded to the score functions
            (defaults to ``"weighted"``).
        sample_weight: Optional per-sample weights forwarded to the score
            functions (the printed report does not use them).
        zero_division: Forwarded to every scikit-learn call, including
            ``f1_score`` and ``classification_report``, so that all reported
            numbers treat undefined scores the same way.

    Returns:
        dict with keys ``"f1"``, ``"precision"`` and ``"recall"``. Each value
        is a ``float`` when the score is a single number, otherwise the
        object returned by scikit-learn is passed through unchanged.
    """
    # Keyword arguments shared by the three score functions.
    score_kwargs = dict(labels=labels, pos_label=pos_label, average=average,
                        sample_weight=sample_weight, zero_division=zero_division)

    def _scalar_or_array(score):
        # Values without a ``.size`` attribute (e.g. a plain Python float) are
        # treated as scalars instead of raising AttributeError.
        return float(score) if getattr(score, "size", 1) == 1 else score

    f1 = f1_score(references, predictions, **score_kwargs)
    p = precision_score(references, predictions, **score_kwargs)
    r = recall_score(references, predictions, **score_kwargs)
    c = classification_report(
        references, predictions, labels=labels, zero_division=zero_division
    )
    print(c)
    return {"f1": _scalar_or_array(f1),
            "precision": _scalar_or_array(p),
            "recall": _scalar_or_array(r)}

In [3]:
# training sentence selection function
def train_sentence_selection(df, n_neutral, n_positive, n_negative, label, seed):
    """Draw a fixed number of example rows per sentiment class.

    Args:
        df: DataFrame holding a ``"<label>_label"`` column whose values
            include "neutral", "positive" and "negative".
        n_neutral: Number of "neutral" rows to draw.
        n_positive: Number of "positive" rows to draw.
        n_negative: Number of "negative" rows to draw.
        label: Prefix of the label column (e.g. "PT" selects "PT_label").
        seed: ``random_state`` used for every draw.

    Returns:
        DataFrame with the neutral rows first, then the positive rows, then
        the negative rows, re-indexed from 0.
    """
    label_column = f"{label}_label"
    # Order matters: it fixes the row order of the returned frame.
    requested = (
        ("neutral", n_neutral),
        ("positive", n_positive),
        ("negative", n_negative),
    )
    sampled_groups = []
    for sentiment, how_many in requested:
        candidates = df[df[label_column] == sentiment]
        # Sampling without replacement, same seed for every class.
        sampled_groups.append(
            candidates.sample(how_many, replace=False, random_state=seed)
        )
    return pd.concat(sampled_groups, ignore_index=True)

In [8]:
# Few-shot run on the validation sentences: the saved example sentences and
# their PT labels are shown to the model first, then it labels the validation set.
# NOTE(review): "best_CLI_sentences_90.csv" has the same name as the file the
# grid-search cell further down writes (f'best_CLI_sentences_{dataset}.csv'
# with dataset = "90") - presumably that cell must be run first; confirm.
train = pd.read_csv("best_CLI_sentences_90.csv")
test = pd.read_csv("../data/validation_sentences.csv")
json_train_sentences = train["language"].to_json()
json_train_labels = train["PT_label"].to_json()
json_test_sentences = test["language"].to_json()

# The assistant turn carries the example labels, so the final user turn asks
# for the "same task" on the validation sentences.
messages = [{"role": "system", "content": "You are a patient."},
            {"role": "user", "content": "As a patient at a medical center, medical doctors write lots of clinical notes about you.\n"
                                        "Your task is to analyze the sentiment of a series of sentences your doctor wrote about you.\n"
                                        "For each sentence, how do you feel reading this description of you?\n"
                                        "Please assign a sentiment score of negative, neutral, or positive for each sentence.\n"
                                        "Below are some example sentences in JSON format:\n"
                                        f"{json_train_sentences}"
                                        "Please provide your answer in JSON format."},
            {"role": "assistant", "content": f"{json_train_labels}"},
            {"role": "user", "content": f"Complete the same task with each of these sentences:\n{json_test_sentences}"}]

response = client.chat.completions.create(model=deployment, messages=messages, temperature=0, seed=42)

output = response.choices[0].message.content
print(output)
# Wrap the reply in StringIO: passing a literal JSON string straight to
# pd.read_json is deprecated and emits a FutureWarning.
predictions = pd.read_json(StringIO(output), orient="index")
# Evaluation / error analysis kept disabled for this run:
# results = compute_metrics(predictions[0], test["PT_label"])
# print(results)
# error_analysis = pd.concat([test[["language", "MD_PT_label", "PT_label"]], predictions], axis=1)
# error_analysis = error_analysis.rename(columns={0:"pred", "PT_label":"true"})
# mask = error_analysis["pred"] == error_analysis["true"]
# error_analysis = error_analysis[~ mask]
# error_analysis.to_csv('error_analysis_validation_NA.csv', index=False)
print(predictions[0].to_string(index=False))

{"0": "negative", "1": "neutral", "2": "negative", "3": "negative", "4": "negative", "5": "negative", "6": "neutral", "7": "positive", "8": "positive", "9": "negative", "10": "negative", "11": "negative", "12": "negative", "13": "negative", "14": "negative"}
negative
 neutral
negative
negative
negative
negative
 neutral
positive
positive
negative
negative
negative
negative
negative
negative


  predictions = pd.read_json(output, orient="index")


In [4]:
# Zero-shot run: no labelled examples are given to the model.
# NOTE: despite its name, `train` holds the validation sentences here; they
# are both the sentences sent to the model and the reference labels.
train = pd.read_csv("../data/validation_sentences.csv")
json_train_sentences = train["language"].to_json()

messages = [{"role": "system", "content": "You are a patient."},
            {"role": "user", "content": "As a patient at a medical center, medical doctors write lots of clinical notes about you.\n"
                                        "Your task is to analyze the sentiment of a series of sentences your doctor wrote about you.\n"
                                        "For each sentence, how do you feel reading this description of you?\n"
                                        "Please assign a sentiment score of negative, neutral, or positive for each sentence.\n"
                                        "Below are some  sentences in JSON format:\n"
                                        f"{json_train_sentences}"
                                        "Please provide your answer in JSON format."}]

response = client.chat.completions.create(model=deployment, messages=messages, temperature=0, seed=42)

output = response.choices[0].message.content
print(output)
# Wrap the reply in StringIO: passing a literal JSON string straight to
# pd.read_json is deprecated and emits a FutureWarning.
predictions = pd.read_json(StringIO(output), orient="index")
results = compute_metrics(predictions[0], train["PT_label"])
print(results)

# Keep only the sentences whose predicted label differs from the PT label.
error_analysis = pd.concat([train[["language", "MD_PT_label", "PT_label"]], predictions], axis=1)
error_analysis = error_analysis.rename(columns={0:"pred", "PT_label":"true"})
mask = error_analysis["pred"] == error_analysis["true"]
error_analysis = error_analysis[~ mask]
error_analysis.to_csv('error_analysis_zero_shot_test.csv', index=False)

{"0": "neutral", "1": "neutral", "2": "negative", "3": "negative", "4": "negative", "5": "negative", "6": "neutral", "7": "positive", "8": "positive", "9": "negative", "10": "negative", "11": "negative", "12": "negative", "13": "negative", "14": "negative"}
              precision    recall  f1-score   support

    negative       0.80      0.89      0.84         9
     neutral       0.67      0.50      0.57         4
    positive       1.00      1.00      1.00         2

    accuracy                           0.80        15
   macro avg       0.82      0.80      0.80        15
weighted avg       0.79      0.80      0.79        15

{'f1': 0.7909774436090227, 'precision': 0.7911111111111111, 'recall': 0.8}


  predictions = pd.read_json(output, orient="index")


In [8]:
# Hyperparameters (for best configuration selection)
params = {
    'seed': [42],
    'n_neutral_sentences': [0, 1],
    'n_positive_sentences': [0, 1],
    'n_negative_sentences': [0, 1, 2]
}

dataset = "90"
metrics_file = f'PT_context_metrics_{dataset}.csv'

# The splits are the same for every combination, so they are loaded (and the
# test sentences serialised) once instead of on every iteration.
train = pd.read_csv(f"../data/train_{dataset}.csv")
test = pd.read_csv(f"../data/test_{dataset}.csv")
# converting test sentences to json format
json_test_sentences = test["language"].to_json()

best_model = []
best_f1 = 0.0
best_comb, best_results = None, None

# Rows are appended to an existing metrics sheet; a new sheet gets the header
# first. The context manager closes the file even if a request or the JSON
# parsing fails part-way through the search.
write_header = not os.path.isfile(metrics_file)
with open(metrics_file, 'a') as f:
    if write_header:
        f.write('seed,n_neutral_sentences,n_positive_sentences,n_negative_sentences,f1,precision,recall\n')

    for comb in ParameterGrid(params):
        # randomly selecting context sentences in json format
        train_sentences = train_sentence_selection(train,
                                                   comb['n_neutral_sentences'],
                                                   comb['n_positive_sentences'],
                                                   comb['n_negative_sentences'],
                                                   "PT",
                                                   comb['seed'])
        # Overwritten on every iteration: holds the most recent selection only.
        train_sentences.to_csv(f'CLI_sentences_{dataset}.csv', index=False)
        json_train_sentences = train_sentences["language"].to_json()
        json_train_labels = train_sentences["PT_label"].to_json()

        # creating context prompt
        messages = [{"role": "system", "content": "You are a patient."},
                    {"role": "user", "content": "As a patient at a medical center, medical doctors write lots of clinical notes about you.\n"
                                                "Your task is to analyze the sentiment of a series of sentences your doctor wrote about you.\n"
                                                "For each sentence, how do you feel reading this description of you?\n"
                                                "Please assign a sentiment score of negative, neutral, or positive for each sentence.\n"
                                                "Below are some example sentences in JSON format:\n"
                                                f"{json_train_sentences}"
                                                "Please provide your answer in JSON format."},
                    {"role": "assistant", "content": f"{json_train_labels}"},
                    {"role": "user", "content": f"Complete the same task with each of these sentences:\n{json_test_sentences}"}]

        # running chat completion
        response = client.chat.completions.create(model=deployment, messages=messages, temperature=0, seed=comb['seed'])
        # saving response to json format
        output = response.choices[0].message.content
        # StringIO avoids the FutureWarning pandas raises for literal JSON strings.
        predictions = pd.read_json(StringIO(output), orient="index")
        # computing metrics
        results = compute_metrics(predictions[0], test["PT_label"])
        # saving results to metrics sheet
        v = [comb['seed'], comb['n_neutral_sentences'], comb['n_positive_sentences'], comb['n_negative_sentences'],
             results['f1'], results['precision'], results['recall']]
        f.write(','.join([str(el) for el in v]) + '\n')

        # Strictly-greater comparison: on ties the earliest combination is kept.
        if results['f1'] > best_f1:
            best_f1 = results['f1']
            best_comb = comb
            best_results = results
            best_CLI_sentences = train_sentences
            best_CLI_sentences.to_csv(f'best_CLI_sentences_{dataset}.csv', index=False)
            # Keep only the test sentences whose prediction differs from the PT label.
            error_analysis = pd.concat([test[["idx", "language", "MD_PT_label", "PT_label"]], predictions], axis=1)
            error_analysis = error_analysis.rename(columns={0:"pred", "PT_label":"true"})
            mask = error_analysis["pred"] == error_analysis["true"]
            error_analysis = error_analysis[~ mask]
            error_analysis.to_csv(f'error_analysis_{dataset}.csv', index=False)
        print('-' * 100)
        print('\n\n')

        # Running summary of the best configuration seen so far.
        if best_comb is not None:
            print(f'Best combination of context sentences: {best_comb}')
            print('\n')
            print(f'Best results: {best_results}')

  predictions = pd.read_json(output, orient="index")
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.86      1.00      0.92         6
     neutral       0.00      0.00      0.00         2
    positive       0.67      1.00      0.80         2

    accuracy                           0.80        10
   macro avg       0.51      0.67      0.57        10
weighted avg       0.65      0.80      0.71        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 0, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.7138461538461538, 'precision': 0.6476190476190475, 'recall': 0.8}


  predictions = pd.read_json(output, orient="index")
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         6
     neutral       0.00      0.00      0.00         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.80        10
   macro avg       0.58      0.67      0.62        10
weighted avg       0.65      0.80      0.71        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 0, 'n_neutral_sentences': 0, 'n_positive_sentences': 1, 'seed': 42}


Best results: {'f1': 0.7142857142857142, 'precision': 0.65, 'recall': 0.8}


  predictions = pd.read_json(output, orient="index")
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         6
     neutral       0.00      0.00      0.00         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.80        10
   macro avg       0.58      0.67      0.62        10
weighted avg       0.65      0.80      0.71        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 0, 'n_neutral_sentences': 0, 'n_positive_sentences': 1, 'seed': 42}


Best results: {'f1': 0.7142857142857142, 'precision': 0.65, 'recall': 0.8}


  predictions = pd.read_json(output, orient="index")
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         6
     neutral       0.00      0.00      0.00         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.80        10
   macro avg       0.58      0.67      0.62        10
weighted avg       0.65      0.80      0.71        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 0, 'n_neutral_sentences': 0, 'n_positive_sentences': 1, 'seed': 42}


Best results: {'f1': 0.7142857142857142, 'precision': 0.65, 'recall': 0.8}


  predictions = pd.read_json(output, orient="index")


              precision    recall  f1-score   support

    negative       0.86      1.00      0.92         6
     neutral       1.00      0.50      0.67         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.90        10
   macro avg       0.95      0.83      0.86        10
weighted avg       0.91      0.90      0.89        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 1, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8871794871794872, 'precision': 0.9142857142857143, 'recall': 0.9}


  predictions = pd.read_json(output, orient="index")
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         6
     neutral       0.00      0.00      0.00         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.80        10
   macro avg       0.58      0.67      0.62        10
weighted avg       0.65      0.80      0.71        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 1, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8871794871794872, 'precision': 0.9142857142857143, 'recall': 0.9}


  predictions = pd.read_json(output, orient="index")
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         6
     neutral       0.00      0.00      0.00         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.80        10
   macro avg       0.58      0.67      0.62        10
weighted avg       0.65      0.80      0.71        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 1, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8871794871794872, 'precision': 0.9142857142857143, 'recall': 0.9}


  predictions = pd.read_json(output, orient="index")
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    negative       0.75      1.00      0.86         6
     neutral       0.00      0.00      0.00         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.80        10
   macro avg       0.58      0.67      0.62        10
weighted avg       0.65      0.80      0.71        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 1, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8871794871794872, 'precision': 0.9142857142857143, 'recall': 0.9}


  predictions = pd.read_json(output, orient="index")


              precision    recall  f1-score   support

    negative       0.86      1.00      0.92         6
     neutral       1.00      0.50      0.67         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.90        10
   macro avg       0.95      0.83      0.86        10
weighted avg       0.91      0.90      0.89        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 1, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8871794871794872, 'precision': 0.9142857142857143, 'recall': 0.9}


  predictions = pd.read_json(output, orient="index")


              precision    recall  f1-score   support

    negative       0.86      1.00      0.92         6
     neutral       1.00      0.50      0.67         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.90        10
   macro avg       0.95      0.83      0.86        10
weighted avg       0.91      0.90      0.89        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 1, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8871794871794872, 'precision': 0.9142857142857143, 'recall': 0.9}


  predictions = pd.read_json(output, orient="index")


              precision    recall  f1-score   support

    negative       0.86      1.00      0.92         6
     neutral       1.00      0.50      0.67         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.90        10
   macro avg       0.95      0.83      0.86        10
weighted avg       0.91      0.90      0.89        10

----------------------------------------------------------------------------------------------------



Best combination of context sentences: {'n_negative_sentences': 1, 'n_neutral_sentences': 0, 'n_positive_sentences': 0, 'seed': 42}


Best results: {'f1': 0.8871794871794872, 'precision': 0.9142857142857143, 'recall': 0.9}
              precision    recall  f1-score   support

    negative       0.86      1.00      0.92         6
     neutral       1.00      0.50      0.67         2
    positive       1.00      1.00      1.00         2

    accuracy                           0.90        10
   macro avg       0

  predictions = pd.read_json(output, orient="index")
