In [1]:
import openai  
from openai import OpenAI
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import openpyxl
import re

In [3]:
tested_criterion = 7 # the (i-1)th question in the list
test_output='./new_test_output_All_criterion'+str(tested_criterion+1)+'.csv' # distinguish the output file name for each criterion

criteria_list = ["1. Does the news release adequately discuss the costs of the intervention?", 
"2. Does the news release adequately quantify the benefits of the treatment/test/product/procedure?", 
"3. Does the news release adequately explain/quantify the harms of the intervention?", 
"4. Does the news release seem to grasp the quality of the evidence?",
"5. Does the news release commit disease-mongering?", 
"6. Does the news release use independent sources and identify conflicts of interest?", 
"7. Does the news release compare the new approach with existing alternatives?", 
"8. Does the news release establish the availability of the treatment/test/product/procedure?", 
"9. Does the news release establish the true novelty of the approach?"]

In [None]:
# new prompt with only one question/criterion
# prompt = """Answer the following question for each article, which will be provided later:"""+criteria_list[tested_criterion]+""" Code the answer as 'satisfactory' if the answer is 'yes,' and 'not satisfactory' if the answer is 'no.' After evaluation, explain your reasoning, ensuring that it directly addresses the question and is supported by original statements from the article. Use this format for your response: answer: [satisfactory/not satisfactory], reasoning: [your reasoning here]. In cases of ambiguity in the article, provide your best-guess reasoning. 
#  """

In [4]:
prompt = """
Evaluate the article based on the following criterion: Does the news establish the availability of the treatment, test, product, or procedure? To be coded as 'satisfactory,' the article should clearly describe the current availability status and the necessary steps remaining before it can become available, if it is not already. This includes:

1. Current Status: The article should state whether the intervention is currently available to the public, and if so, in what capacity and at which locations.

2. FDA Approval: If the intervention is not yet available, the article must specify whether it is FDA-approved for the use under discussion or detail the remaining steps before such approval can be obtained. Avoid giving undue weight to speculative predictions about future approvals.

3. International Availability: If the intervention is available in other countries but not in the U.S., the article should explore why it is not yet approved or available in the U.S.

4. Accessibility and Distribution: For interventions that are approved or don't require approval (like new surgical methods or exercise equipment), the article should address questions of accessibility, such as whether patients must travel to specialized centers or if the product will be widely distributed and readily available at local facilities.

Code the answer as 'satisfactory' if the article addresses any of the above standards. Otherwise, code the answer as 'not satisfactory.' After evaluation, provide reasoning that directly the points made, supported by original statements from the article. Use this format for your response:
answer: [satisfactory/not satisfactory], reasoning: [your reasoning here, addressing how standards are met]. In cases of ambiguity in the article, provide your best-guess reasoning based on the available information. """


In [5]:
import ast
def parse_json(json_str):
    res = ast.literal_eval(json_str.strip())
    # print(res)
    numeric_res = {}
    for c in res:
        c_numeric = re.search('[0-9]+', c).group()
        if c_numeric not in numeric_res:
            numeric_res[c_numeric] = res[c]
    return numeric_res

In [6]:
# Setting up OpenAI object
client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key="***************",
)

## Read in data file

In [7]:
data_file = '/Users/xiaoyu/Desktop/untitled folder/dataset_edited_1_17.xlsx'
df = pd.read_excel(data_file)
df.head()

Unnamed: 0,status,source_Date,source_Category,source_OrginalTitle,source_ReviewTitle,source_Summary,source_Link,ReviewTtile,Category,Tag,...,1. Does the news release adequately discuss the costs of the intervention?,2. Does the news release adequately quantify the benefits of the treatment/test/product/procedure?,3. Does the news release adequately explain/quantify the harms of the intervention?,4. Does the news release seem to grasp the quality of the evidence?,5. Does the news release commit disease-mongering?,6. Does the news release use independent sources and identify conflicts of interest?,7. Does the news release compare the new approach with existing alternatives?,8. Does the news release establish the availability of the treatment/test/product/procedure?,9. Does the news release establish the true novelty of the approach?,10. Does the news release appear to rely solely or largely on a news release?
0,PROCESSED,2017-08-07,News Release Review,Resistance training may slow down the progress...,Claims but no context in summary of resistance...,No numbers are given to back up its benefit cl...,https://www.healthnewsreview.org/news-release-...,Claims but no context in summary of resistance...,University news release,"Aarhus University, multiple sclerosis",...,Not Satisfactory,Not Satisfactory,Not Satisfactory,Satisfactory,Satisfactory,Satisfactory,Satisfactory,Not Satisfactory,Not Satisfactory,Satisfactory
1,PROCESSED,2017-10-26,News Release Review,Genetic test could help fight secondary breast...,"Benefits, evidence, cost and harms all missing...",Most of the missing content was in the publish...,https://www.healthnewsreview.org/news-release-...,"Benefits, evidence, cost and harms all missing...",Breast cancer|University news release,"bisphosphonates, genetic testing, University o...",...,Not Satisfactory,Not Satisfactory,Not Satisfactory,Not Satisfactory,Satisfactory,Not Satisfactory,Not Satisfactory,Satisfactory,Satisfactory,Satisfactory
2,PROCESSED,2018-06-05,News Release Review,Adding Atezolizumab Immunotherapy to Chemother...,ASCO offers a positive spin on cancer drug sur...,While researchers saw slightly slower cancer g...,https://www.healthnewsreview.org/news-release-...,ASCO offers a positive spin on cancer drug sur...,Association/Society news release|Cancer,"ASCO, immunotherapy",...,Not Satisfactory,Not Satisfactory,Not Satisfactory,Not Satisfactory,Satisfactory,Satisfactory,Satisfactory,Satisfactory,Satisfactory,Not Satisfactory
3,PROCESSED,2018-08-10,News Release Review,New emerging research suggests Montmorency tar...,Big Cherry claims about health benefits leave ...,This release reads like an advertisement with ...,https://www.healthnewsreview.org/news-release-...,Big Cherry claims about health benefits leave ...,industry/commercial news releases,"cherries, Cherry Marketing Institute, gut health",...,Not Satisfactory,Not Satisfactory,Not Satisfactory,Not Satisfactory,Not Applicable,Satisfactory,Not Satisfactory,Satisfactory,Satisfactory,Not Satisfactory
4,PROCESSED,2018-09-13,News Release Review,Nucleix Announces Positive Clinical Results fo...,Recap of lung cancer screening test relies on ...,"The release includes no mention of cost, harms...",https://www.healthnewsreview.org/news-release-...,Recap of lung cancer screening test relies on ...,industry/commercial news releases,"lung cancer screening, Nucleix",...,Not Satisfactory,Not Satisfactory,Not Satisfactory,Not Satisfactory,Satisfactory,Not Applicable,Satisfactory,Satisfactory,Not Satisfactory,Not Satisfactory


In [8]:
df.columns

Index(['status', 'source_Date', 'source_Category', 'source_OrginalTitle',
       'source_ReviewTitle', 'source_Summary', 'source_Link', 'ReviewTtile',
       'Category', 'Tag', 'OriginalTitle', 'OrginalArticleLink', 'Evaluation',
       'TotalScore', 'fulltext',
       '1. Does the news release adequately discuss the costs of the intervention?',
       '2. Does the news release adequately quantify the benefits of the treatment/test/product/procedure?',
       '3. Does the news release adequately explain/quantify the harms of the intervention?',
       '4. Does the news release seem to grasp the quality of the evidence?',
       '5. Does the news release commit disease-mongering?',
       '6. Does the news release use independent sources and identify conflicts of interest?',
       '7. Does the news release compare the new approach with existing alternatives?',
       '8. Does the news release establish the availability of the treatment/test/product/procedure?',
       '9. Does the news

In [9]:
test = df.iloc[0:2134]

In [None]:
# print(test)

In [10]:
'''
I have modified the method of exporting responses to Excel. 
This change ensures that responses are preserved and can be appended to later even if ChatGPT stops responding.
'''

import os

def generate_response(df, client, prompt, model, output_file):
    for idx, row in df.iterrows():
        source = row['fulltext']
        print("row", idx, ":", pd.Timestamp.now())  #track the response progress
        
        try:
            response = client.chat.completions.create(
                model=model,
                response_format={"type": "json_object"},
                messages=[
                    {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
                    {"role": "user", "content": prompt + '\n The news article is: \n' + source}
                ]
            )
            df.loc[idx, 'response'] = response.choices[0].message.content
            df.loc[idx,'index']=idx
        except Exception as e:
            print(f"An error occurred on row {idx}: {e}")
            df.loc[idx, 'response'] = "Error: " + str(e)
        
        # Check if the file exists and is not empty to decide whether to write headers
        if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
            # File exists and is not empty, do not write headers
            write_header = False
        else:
            # File does not exist or is empty, write headers
            write_header = True
        
        # Append the current row to the CSV file, writing headers if needed
        df.loc[[idx]].to_csv(output_file, mode='a', header=write_header, index=False)

    return df

In [11]:
test_res = generate_response(test, client, prompt, "gpt-3.5-turbo-1106", test_output)

row 0 : 2024-03-06 13:49:01.508234


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[idx, 'response'] = response.choices[0].message.content
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[idx,'index']=idx


row 1 : 2024-03-06 13:49:04.547174
row 2 : 2024-03-06 13:49:07.068302
row 3 : 2024-03-06 13:49:12.521424
row 4 : 2024-03-06 13:49:14.866401
row 5 : 2024-03-06 13:49:16.946452
row 6 : 2024-03-06 13:49:19.444840
row 7 : 2024-03-06 13:49:22.670990
row 8 : 2024-03-06 13:49:25.624389
row 9 : 2024-03-06 13:49:27.812179
row 10 : 2024-03-06 13:49:30.331040
row 11 : 2024-03-06 13:49:33.169279
row 12 : 2024-03-06 13:49:36.742867
row 13 : 2024-03-06 13:49:39.559263
row 14 : 2024-03-06 13:49:41.902240
row 15 : 2024-03-06 13:49:44.165164
row 16 : 2024-03-06 13:49:46.592911
row 17 : 2024-03-06 13:49:48.850956
row 18 : 2024-03-06 13:49:52.573733
row 19 : 2024-03-06 13:49:54.978057
row 20 : 2024-03-06 13:49:57.617193
row 21 : 2024-03-06 13:50:00.051011
row 22 : 2024-03-06 13:50:01.630783
row 23 : 2024-03-06 13:50:04.844273
row 24 : 2024-03-06 13:50:07.189904
row 25 : 2024-03-06 13:50:09.702339
row 26 : 2024-03-06 13:50:12.117849
row 27 : 2024-03-06 13:50:14.534766
row 28 : 2024-03-06 13:50:17.279246
r

In [None]:
#test_res

In [None]:
#test_res['response'][0]

In [None]:
# def filter_incomplete_response(df, num_criteria, 
#                                incomplete_data_output = './incomplete_data.csv',
#                                complete_data_output = './complete_data.csv'):
#     incomplete = []
#     for idx, row in df.iterrows():
#         response = parse_json(row['response'])
#         if len(response) < num_criteria:
#             incomplete.append(row)
#             # print(df['response'][idx])
#             df.drop(idx,inplace=True) # need to specify that the change is applied to the original dataframe in place
            
#     incomplete_df = pd.DataFrame(incomplete)
#     incomplete_df.to_csv(incomplete_data_output)
#     df.to_csv(complete_data_output)

#     return df

In [None]:
# df_filtered = filter_incomplete_response(test_res, len(criteria_list))
# len(df_filtered)

In [None]:
# df_filtered.columns

In [None]:
# import json
# def evaluate_response(df):
#     c = criteria_list[tested_criterion]  # evaluate the given criterion in the list
#     eval_dict = {}
#     # c_num = c.split('.')[0]
#     # print(c_num)
#     eval_dict[c] = {'gold_standard':[], 'predicted':[]}
#     for idx, row in df.iterrows():
#         print("index",idx)
#         gold_standard = row[c].lower()
#         response = json.loads(row['response'])
#         predicted = response["answer"].lower()
#         # print("gold:",gold_standard, "predicted:",predicted)
#         eval_dict[c]['gold_standard'].append(gold_standard)
#         eval_dict[c]['predicted'].append(predicted)
#     print(str(c))
#     print('gold_standard',eval_dict[c]['gold_standard'])
#     print('predicted',eval_dict[c]['predicted'])
#     print(c, accuracy_score(eval_dict[c]['gold_standard'], eval_dict[c]['predicted']))
#     cm = confusion_matrix(eval_dict[c]['gold_standard'], eval_dict[c]['predicted'])
#     disp = ConfusionMatrixDisplay(confusion_matrix = cm)
#     disp.plot()
#     plt.show()

In [23]:
import json
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import label_binarize

def evaluate_response(df):
    c = criteria_list[tested_criterion]  # Evaluate the given criterion in the list
    eval_dict = {}
    eval_dict[c] = {'gold_standard': [], 'predicted': []}
    for idx, row in df.iterrows():
       
        #Identify the affected rows that cause the decoding issues of JSON content
        try:
            response = json.loads(row['response'])
            if 'answer' in response:
                predicted = response["answer"].lower()
            else:
                print(f"'answer' key is missing in the JSON response for row {idx}")
                continue
            
            gold_standard = row[c].lower()
            # Check if gold_standard is "not applicable" and change it to "not satisfactory"
            if gold_standard == 'not applicable':
                gold_standard = 'not satisfactory'
            
            eval_dict[c]['gold_standard'].append(gold_standard)
            eval_dict[c]['predicted'].append(predicted)

        except json.JSONDecodeError as e:
            # If there's a JSON decoding error, print the error and skip this row
            print(f"Error decoding JSON for row {idx}: {e}")
            print(f"Faulty JSON content: {row['response']}")
            continue

    # Convert gold standard and predicted to a binary format for AUC calculation
    classes = sorted(list(set(eval_dict[c]['gold_standard']) | set(eval_dict[c]['predicted'])))
    y_true = label_binarize(eval_dict[c]['gold_standard'], classes=classes)
    y_pred = label_binarize(eval_dict[c]['predicted'], classes=classes)

    # Calculating metrics
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred,average='weighted')
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    false_positive_rate = fp / (fp + tn)
    true_negative_rate = tn /( tn +fp)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Output the results
    print(criteria_list[tested_criterion])
    print(f'Accuracy: {accuracy}')
    print(f'AUC Score: {auc}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print(f'True Negative Rate: {true_negative_rate}')

In [24]:
evaluate_response(pd.read_csv("/Users/xiaoyu/Desktop/Ongoing Research/ChatGPT-empirical paper/Pilot Study_0126/Full copy/new_test_output_All_criterion8.csv"))

'answer' key is missing in the JSON response for row 974
Error decoding JSON for row 1467: Expecting value: line 1 column 1 (char 0)
Faulty JSON content: Error: Error code: 400 - {'error': {'message': "Sorry! We've encountered an issue with repetitive patterns in your prompt. Please try again with a different prompt.", 'type': 'invalid_request_error', 'param': 'prompt', 'code': 'invalid_prompt'}}
8. Does the news release establish the availability of the treatment/test/product/procedure?
Accuracy: 0.5394366197183098
AUC Score: 0.5628199030665398
Precision: 0.5952144246695643
Recall: 0.5394366197183098
F1 Score: 0.5444176659421709
True Negative Rate: 0.6540404040404041
