<a href="https://colab.research.google.com/github/youkiti/ARE/blob/main/2023_11_1Azure_DTA_abstract_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About


*   You can use this notebook to classify abstracts as either diagnostic test accuracy (DTA) studies or not.
*   You need:
    1.   An Azure OpenAI resource with the API set up.
    2.   RIS files of the abstracts, stored in Google Drive.



# Setup

In [None]:
!pip install -q openai cohere tiktoken rispy

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25h


You should use Google Drive to store your files.

In [None]:
# Mount Google Drive into the Colab filesystem; the folder holding the RIS files
# is expected to live somewhere below this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
from openai import AzureOpenAI
import json
import pandas as pd
import os
import glob
import math
import numpy as np
import glob
import rispy
from pathlib import Path
import time
from tqdm import tqdm
from datetime import datetime
from sklearn.metrics import confusion_matrix

# Please apply your own settings in the following cell

*   If you don't know the path, click the folder button on the left.
image.png


*   Then click the drive folder and browse to the folder that contains your RIS files.

In [None]:
# Azure OpenAI credentials.
# The key and endpoint are read from the AZURE_OPENAI_API_KEY / AZURE_OPENAI_ENDPOINT
# environment variables when they are set, so the secret does not have to be saved
# inside the notebook. If they are not set, replace the placeholder strings below
# with your own values (and do not share the notebook afterwards).
API_VERSION = "2023-07-01-preview"
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "YOUR OWN API KEY")
RESOURCE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "YOUR OWN ENDPOINT")

# Client used by every API call in this notebook.
client = AzureOpenAI(
    api_version=API_VERSION,
    api_key=API_KEY,
    # https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#create-a-resource
    azure_endpoint=RESOURCE_ENDPOINT,
)

# Please enter your OWN deployment names (the first one is used for the run)
deployment_ids = ["gpt-35-turbo-0613"]

# Please set the folder that contains your RIS files
folderPath = "YOUR OWN FOLDER"

# Processing Bibliographic Information and Creating a Dataset

In [None]:
# Work inside the RIS folder: the relative file names used in this notebook
# (the "*.ris" pattern here and the output files written at the end) resolve against it.
os.chdir(folderPath)
# search ris files
# glob returns matches in arbitrary order; sorting makes the order of the records
# (and therefore their row positions) reproducible between runs.
risfiles = sorted(glob.glob("*.ris"))
print("Number of RIS files:")
print(len(risfiles))

Number of RIS files:
2


In [None]:
#ris to dataframe
#https://github.com/MrTango/rispy

def ris_to_df(ris_path, encoding="utf-8-sig"):
    """Parse one RIS file into a DataFrame with one row per parsed entry.

    Args:
        ris_path: Path of the RIS file to read.
        encoding: Text encoding used to read the file. The default, "utf-8-sig",
            reads UTF-8 and drops a leading byte-order mark if one is present,
            instead of depending on the platform's default encoding.

    Returns:
        pandas.DataFrame produced by ``pd.json_normalize`` from the entries
        returned by ``rispy.loads``.

    Side effects:
        Prints the number of parsed entries and the resulting column names.
    """
    ovid_link_prefix = "Link to the Ovid Full Text or citation:"

    # Read the file, dropping the Ovid link lines before the text is handed to rispy
    # (presumably because they are not regular RIS tag lines; confirm against an Ovid export).
    with Path(ris_path).open(encoding=encoding) as f:
        data = "".join(line for line in f if not line.startswith(ovid_link_prefix))

    # Parse the RIS data
    entries = rispy.loads(data)
    print("Number of abstracts in the RIS file:")
    print(len(entries))

    # Convert to a dataframe
    df = pd.json_normalize(entries)
    print("Column names:")
    print(df.columns)
    return df

#read
# Parse every RIS file first and concatenate once (instead of growing the frame
# inside the loop). ignore_index=True gives every record a unique positional index:
# without it each file restarts at 0, and the index-based join that later attaches
# the labels to df_ris would match labels to the wrong rows.
ris_frames = [ris_to_df(ris_file) for ris_file in risfiles]
# pd.concat raises on an empty list, so keep the original "empty frame" result
# when no RIS file was found.
df_ris = pd.concat(ris_frames, ignore_index=True) if ris_frames else pd.DataFrame()

print("final dataframe:")
print(df_ris.shape)

In [None]:
def select_title(row):
    """Pick the title of a bibliographic record.

    The candidate fields are tried in a fixed order ('title', 'primary_title',
    'secondary_title', 'tertiary_title'); the first one that is present in `row`
    and not missing is returned. NaN is returned when none qualifies.
    """
    candidate_fields = ('title', 'primary_title', 'secondary_title', 'tertiary_title')
    for field in candidate_fields:
        if field in row and pd.notna(row[field]):
            return row[field]
    return np.nan


def select_abstract(row):
    """Return the first non-missing abstract field of a record, or NaN if there is none.

    rispy's default tag mapping stores the AB tag under 'abstract' and the N2 tag
    under 'notes_abstract'; exports from different databases use one or the other.
    """
    for field in ('abstract', 'notes_abstract'):
        if field in row and pd.notna(row[field]):
            return row[field]
    return np.nan


df = pd.DataFrame()
# If this cell doesn't work, check the column names of df_ris
df['selected_title'] = df_ris.apply(select_title, axis=1)
# Row-wise lookup instead of df_ris['abstract']: this does not raise KeyError when
# the 'abstract' column is absent and also picks up abstracts stored in 'notes_abstract'.
df['abstract'] = df_ris.apply(select_abstract, axis=1)

# Text sent to the model: title and abstract separated by one space; missing parts
# become empty strings.
df["tiab"] = df["selected_title"].fillna('').astype(str) + " " + df["abstract"].fillna('').astype(str)
df.head()

# Azure

In [None]:
#best prompt for DTA SR
# The prompt is assembled from its individual sentences, joined with no separator:
# every space between sentences (or the lack of one, as after items 1 and 2) comes
# from the literals themselves, so the resulting text is left exactly as it was.
prompt = "".join([
    # task statement
    "Please determine if an abstract is a Diagnostic Test Accuracy (DTA) study based on the following criteria: ",
    # inclusion criteria
    "1. A DTA study evaluates a test against a clinical reference standard specifically for humans, with very high sensitivity and reasonable specificity.",
    "2. Include multivariable diagnostic prediction model studies.",
    # exclusion criteria
    "3. Exclude the following: ",
    "   - Prognostic prediction model studies where predictors and outcomes are measured at different time points. ",
    "   - Modeling studies. ",
    "   - Studies assessing diagnostic training for medical professionals. ",
    # answer format
    "Reply with 'True' if the abstract is a DTA study or if there is insufficient information to judge (e.g., when only a title is available). Reply with 'False' if you are certain that the abstract is not a DTA study.",
])

In [None]:
#returns a processed JSON when given an abstract
def process_of_abstracts(abstract, prompt, model_name, temperature):
    """Classify one record with the Azure OpenAI chat API and return the parsed answer.

    Args:
        abstract: Text to classify; it is converted with ``str`` and appended
            directly to `prompt`.
        prompt: Instruction text placed in front of the abstract.
        model_name: Passed as ``model`` to the chat completion request
            (the Azure deployment name).
        temperature: Sampling temperature passed to the request.

    Returns:
        dict decoded from the JSON arguments of the function call in the first
        choice. The function schema declares 'judgement' and 'probability', but
        neither is marked as required, so callers must validate the keys.

    Raises:
        ValueError: if the reply contains no function call (possible because the
            request uses ``function_call="auto"``), or if the arguments are not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        Exception: any error raised by the API client is propagated unchanged.
    """
    #prompting
    question = prompt + str(abstract)

    #function calling
    functions = [
        {
            "name": "dta_filter",
            "description": "classify diagnostic test accuracy abstracts",
            "parameters": {
                "type": "object",
                "properties": {
                    "judgement": {
                        "type": "string",
                        "description": "Determining if the abstract is an abstract of diagnostic test accuracy study. The return must be 'True' or 'False'."
                    },
                    "probability": {
                        "type": "string",
                        "description": "0 to 1 possibility that the abstract is an abstract of diagnostic test accuracy study."
                    }
                },
                #"required": ["judgement","probability"]
            }
        }
    ]

    # Azure API (uses the module-level `client`)
    response = client.chat.completions.create(
        model=model_name,
        messages=[{
            "role": "user",
            "content": question
        }],
        functions=functions,
        function_call="auto",
        temperature=temperature
    )

    # With function_call="auto" the model may answer in plain text; report that
    # explicitly instead of failing later with an opaque subscript error.
    function_call = response.choices[0].message.function_call
    if function_call is None:
        raise ValueError("The model reply did not contain a dta_filter function call")

    # The arguments arrive as a JSON string; decode them into a dict.
    return json.loads(function_call.arguments)

#%% process a DataFrame
def process_abstracts_to_dataframe(df_train, prompt, model_name, temperature, max_retries=3):
    """Classify every entry of ``df_train["tiab"]`` and collect the raw answers.

    Each text is sent through ``process_of_abstracts``. An attempt counts as failed
    when the call raises or when the returned 'judgement' is not exactly "True" or
    "False"; failed attempts are retried up to `max_retries` attempts in total.

    Args:
        df_train: DataFrame with a "tiab" column holding the texts to classify.
        prompt: Instruction text forwarded to ``process_of_abstracts``.
        model_name: Deployment name forwarded to ``process_of_abstracts``; also
            embedded in the name of the result column.
        temperature: Sampling temperature forwarded to ``process_of_abstracts``;
            also embedded in the name of the result column.
        max_retries: Maximum number of attempts per text (default 3).

    Returns:
        pandas.DataFrame indexed by the row POSITION in ``df_train`` (0..N-1, index
        name 'index', not the labels of ``df_train.index``), with a single column
        named after the run date/time, model and temperature. Each cell is the
        dict returned for that text, with an added 'index' key. For texts that
        failed every attempt the dict is whatever the last returned answer was
        (possibly without a 'judgement' key), so consumers must not assume the
        key is present.
    """
    reviews = []
    errors = 0
    tiabs = df_train["tiab"]

    # total= lets tqdm show a real progress bar; enumerate() alone has no length.
    for index, tiab in tqdm(enumerate(tiabs), total=len(tiabs)):
        res_json = {}  # Initialize res_json
        success = False

        for attempt in range(1, max_retries + 1):
            try:
                res_json = process_of_abstracts(abstract=tiab, prompt=prompt, model_name=model_name, temperature=temperature)

                # Check that the judgement value is "True" or "False"
                if res_json['judgement'] not in ["True", "False"]:
                    raise ValueError("judgement value is not True or False")

                success = True
                break

            except Exception as e:
                print(f"Error occurred: {e}. Retrying {attempt}/{max_retries}")
                # Wait before the next attempt (1 s, 2 s, 4 s, ...); there is no
                # point in sleeping after the final attempt.
                if attempt < max_retries:
                    time.sleep(2 ** (attempt - 1))

        if not success:
            print(f"Failed to process abstract after {max_retries} retries.")
            errors += 1
            # A reply that decoded to something other than a dict cannot carry
            # the index key below; fall back to an empty record.
            if not isinstance(res_json, dict):
                res_json = {}

        # Add the positional index to the record (for failures as well, so every
        # input row has exactly one output row)
        res_json['index'] = index
        reviews.append(res_json)

    print(f"Total errors occurred: {errors}")

    # Define the column name
    now = datetime.now()
    date_str = now.strftime('%Y-%m-%d')  # e.g., "2023-08-08"
    time_str = now.strftime('%H-%M')  # e.g., "14-35"
    dtime = date_str + "_" + time_str
    column_name = dtime + "_Azure_" + "_" + model_name + "_" + "temp" + str(temperature)

    # Pair every record with its positional index
    data = [(review['index'], review) for review in reviews]

    # Build the DataFrame and use the positional index as its index
    df_reviews = pd.DataFrame(data, columns=['index', column_name]).set_index('index')

    return df_reviews





# Run
*   Change the model according to your needs.
*   Temperature should ideally be 0.




In [None]:
# Classify every title/abstract text in `df` with the first configured deployment.
labeled_df = process_abstracts_to_dataframe(
    df_train=df,
    prompt=prompt,
    model_name=deployment_ids[0],
    temperature=0,
)

# Save

In [None]:
# Turn each stored response dict into a boolean screening decision.
# A record is excluded only when the model explicitly answered 'False'. Records whose
# classification failed (no 'judgement' key, or an unexpected value) are kept as
# candidates for manual screening instead of raising KeyError or being dropped
# silently; for successfully classified records this is identical to == 'True'.
labeled_df['judgement'] = labeled_df.iloc[:, 0].apply(
    lambda result: result.get('judgement') != 'False'
)

# labeled_df is indexed by row position (0..N-1), so df_ris is given a fresh
# positional index before joining; otherwise duplicate index labels coming from
# several RIS files would attach labels to the wrong records. Any 'judgement' column
# left by an earlier run of this cell is dropped first so the cell can be re-run.
df_ris = (
    df_ris
    .reset_index(drop=True)
    .drop(columns=['judgement'], errors='ignore')
    .join(labeled_df['judgement'])
)

# save
df_ris.to_excel('bibliofromRIS_labeled.xlsx', index=False)

#select the candidate abstracts
df_ris_to_export = df_ris[df_ris['judgement'] == True]
df_ris_to_export = df_ris_to_export.drop(columns=['judgement'])

print("exported abstracs: "+ str(df_ris_to_export.shape[0]))

# Missing fields are replaced by empty strings before the records are written out
df_ris_to_export = df_ris_to_export.fillna('')
# to list
export_ris = df_ris_to_export.to_dict('records')
# export (explicit encoding so non-ASCII characters do not depend on the platform default)
with open('selected_abstracts.ris', 'w', encoding='utf-8') as bibliography_file:
    rispy.dump(export_ris, bibliography_file)