In [None]:
import openai
import pickle
from Bio import Entrez
import pandas as pd
import os
from openai import OpenAI, RateLimitError, APIError, OpenAIError
import json
import numpy as np
import xml
from xml.dom.minidom import Document
import copy
import tiktoken
import time
import re
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import random

In [None]:
# Widen the pandas display limits so large tables are shown with little truncation.
for option_name, option_value in (
    ("display.max_rows", 100),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Build the OpenAI client shared by every request in this notebook.
# The API key is read from the OPENAI_API_KEY environment variable.
try:
    client = OpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
        organization='',
        project='',
    )
except OpenAIError as e:
    # Report the actual cause and stop here: without `client` every later
    # request would only fail with a less helpful NameError.
    print(f'Something goes wrong on the connection with chatGPT: {e}')
    raise

In [None]:
def requisition_chat(gpt_model, prompt, max_retries=5):
    """
    Send `prompt` to the chat-completions API and return the model's reply.

    The prompt is sent as a single system message and the reply is requested as a
    JSON object.

    Parameters
    ----------
    gpt_model : str
        Name of the model to query.
    prompt : str
        Full text sent to the model.
    max_retries : int, optional
        How many times a retryable failure (rate limit or generic OpenAI error)
        is retried before giving up.

    Returns
    -------
    The completion object returned by the client, or False when the API reports
    an error or the retries are exhausted.
    """
    retries_done = 0
    while True:
        try:
            return client.chat.completions.create(
                model=gpt_model,
                response_format={"type": "json_object"},
                messages=[{"role": "system", "content": prompt}],
            )
        except RateLimitError:
            print("Rate limit exceeded. Retrying after a delay...")
            delay = 60  # Wait for 60 seconds before retrying
        except APIError as e:
            print(f"API error: {e}")
            return False
        except OpenAIError as e:
            print(f"An error occurred: {e}")
            delay = 5  # Wait for 5 seconds before retrying

        # Retry in a bounded loop instead of recursing, so a persistent failure
        # cannot hang the notebook or exhaust the call stack.
        retries_done += 1
        if retries_done > max_retries:
            print(f"Giving up after {max_retries} retries.")
            return False
        time.sleep(delay)

### General settings

In [None]:
# Research question interpolated into the keyword, criteria and screening prompts.
research_question = "What is the diagnostic accuracy of the no-biopsy approach (10- fold increase in serum IgA antitissue transglutaminase (tTG) antibody levels) for the diagnosis of coeliac disease in adults?"

In [None]:
# Candidate model names; `gpt_model` is picked from this list.
# NOTE(review): confirm that every entry supports the chat-completions endpoint and the
# JSON response format requested in `requisition_chat`.
models = ['gpt-3.5-turbo', 'gpt-4', 'gpt-4o-mini', 'gpt-4o', 'text-davinci-003', 'gpt-4-turbo']

In [None]:
# Model used for the requests in this notebook (first entry of `models`).
gpt_model = models[0]

### Getting a list of keywords

In [None]:
# Number of keywords requested from the model in the keyword prompt.
n_keywords = 20

In [None]:
# Example of the JSON shape the model is asked to return for the keyword request.
json_schema_search_s = {"keywords": ["", "", ""]}

In [None]:
# Prompt asking the model for PubMed keywords, ending with the expected JSON schema.
prompt = (
    f'You are a researcher conducting a systematic review to answer the following research question: {research_question} '
    f'You need to provide {n_keywords} specific keywords that can be used in an in-depth and extensive PubMed search. '
    'Please provide a comprehensive search string using JSON format with no additional description or context. '
    'The data schema should be like this: '
    + json.dumps(json_schema_search_s)
)

In [None]:
# Send the keyword prompt to the model; `requisition_chat` returns False on an API error.
output_k = requisition_chat(gpt_model, prompt)

In [None]:
# Decode the model's JSON reply and keep only the list stored under "keywords".
keywords_payload = json.loads(output_k.choices[0].message.content)
search_k = keywords_payload['keywords']
search_k

### Getting inclusion and exclusion criteria from ChatGPT

In [None]:
# Example of the JSON shape requested for the criteria: two lists of {"type", "criteria"} entries.
json_schema_criteria = {"inclusion_criteria": [{"type":"population", "criteria":"..."}], "exclusion_criteria":[{"type":"population", "criteria":"..."}]}

In [None]:
# Prompt asking the model for inclusion and exclusion criteria.
# The example schema is serialised with json.dumps, as in the keyword prompt, so the model
# sees valid JSON (double quotes) rather than the Python repr of the dictionary.
prompt = (
    f'You are a researcher conducting a systematic review to answer the following research question: "{research_question}" '
    'Now, you need to provide inclusion and exclusion criterias based on the research question. These criterias '
    'will be used to screening papers and define the ones that shold be included on the review. Use JSON format with no additional description or context. '
    'The data schema should be like this: ' + json.dumps(json_schema_criteria)
)

output_criteria = requisition_chat(gpt_model, prompt)

In [None]:
# Only parse the reply when the model finished normally (finish_reason == 'stop');
# otherwise report why the reply is incomplete.
first_choice = output_criteria.choices[0]
if first_choice.finish_reason != 'stop':
    print("The response of the model is not complete. Reason:", first_choice.finish_reason)
else:
    response = json.loads(first_choice.message.content)
    inclusion_criteria_dicts = response['inclusion_criteria']
    exclusion_criteria_dicts = response['exclusion_criteria']

In [None]:
def get_criterias(criteria_dict):
    """Return the text stored under the 'criteria' key of one criterion entry."""
    return criteria_dict['criteria']

In [None]:
# Reduce the criteria dictionaries to plain lists of criterion texts.
exclusion_criteria_list = [get_criterias(entry) for entry in exclusion_criteria_dicts]
inclusion_criteria_list = [get_criterias(entry) for entry in inclusion_criteria_dicts]

### Constructing the dataset on PubMed

In [None]:
# Set your email here; it is attached to the Entrez requests made below.
# NOTE(review): left empty in this notebook — fill it in before running the PubMed search.
Entrez.email = ""

In [None]:
def search_pubmed(query, max_results=10):
    """
    Search PubMed for `query` and return the matching record identifiers.

    Parameters
    ----------
    query : str
        Search term passed to ESearch.
    max_results : int, optional
        Maximum number of identifiers requested (`retmax`).

    Returns
    -------
    The "IdList" entry of the parsed ESearch result.
    """
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails so the connection is not leaked.
        handle.close()
    return record["IdList"]

In [None]:
def fetch_details(id_list):
    """
    Download the PubMed records for the given identifiers.

    Parameters
    ----------
    id_list : iterable of str
        PubMed identifiers; they are sent as one comma-separated string.

    Returns
    -------
    The parsed EFetch result as produced by `Entrez.read`.
    """
    ids = ",".join(id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Close the handle even when parsing fails so the connection is not leaked.
        handle.close()
    return records

In [None]:
def parse_records(records):
    """
    Flatten parsed PubMed records into a list of plain dictionaries.

    Parameters
    ----------
    records : mapping
        Parsed EFetch result; only its 'PubmedArticle' entry is read.

    Returns
    -------
    list of dict
        One dictionary per article with the keys PMID, Keywords, Country, Language,
        Title, Abstract, Journal, PubYear and ISSN. Missing keyword lists, abstracts,
        publication years and ISSNs are replaced by a "No ... available" placeholder.
    """
    articles = []
    for record in records['PubmedArticle']:
        citation = record['MedlineCitation']
        article_info = citation['Article']
        journal = article_info['Journal']
        keyword_lists = citation['KeywordList']

        if 'Abstract' in article_info:
            # Structured abstracts come as several sections; keep all of them rather
            # than only the first, so screening sees the complete abstract.
            abstract = ' '.join(str(section) for section in article_info['Abstract']['AbstractText'])
            if not abstract:
                abstract = 'No abstract available'
        else:
            abstract = 'No abstract available'

        articles.append({
            'PMID': citation['PMID'],
            'Keywords': keyword_lists[0] if len(keyword_lists) > 0 else 'No keyword list available',
            'Country': citation['MedlineJournalInfo']['Country'],
            'Language': article_info['Language'][0],
            'Title': article_info['ArticleTitle'],
            'Abstract': abstract,
            'Journal': journal['Title'],
            'PubYear': journal['JournalIssue']['PubDate'].get('Year', 'No date available'),
            'ISSN': journal['ISSN'] if 'ISSN' in journal else 'No ISSN available',
        })
    return articles

In [None]:
def pubmed(search_):
    """
    Run one PubMed search and return its articles as a DataFrame.

    Up to 20000 identifiers are requested for `search_`. When the search finds
    nothing, a message is printed and None is returned.
    """
    # NOTE(review): confirm that the E-utilities service honours a retmax of 20000.
    id_list = search_pubmed(search_, max_results=20000)
    if id_list:
        return pd.DataFrame(parse_records(fetch_details(id_list)))

    print("No articles found.")
    return

In [None]:
def main(search_, type_search='search_string'):
    """
    Collect PubMed articles for a search string or for a list of keywords.

    Parameters
    ----------
    search_ : str or iterable of str
        A single search string when `type_search` is 'search_string', otherwise
        an iterable of keywords that are searched one by one.
    type_search : {'search_string', 'keywords'}, optional
        How `search_` is interpreted.

    Returns
    -------
    pandas.DataFrame or None
        For 'search_string', whatever `pubmed` returns (None when nothing is found).
        For 'keywords', the stacked results of all keywords; keywords without hits
        are skipped and an empty DataFrame is returned when none has hits.

    Raises
    ------
    ValueError
        If `type_search` is not one of the supported values.
    """
    if type_search == 'search_string':
        return pubmed(search_)

    if type_search == 'keywords':
        frames = []
        for keyword in search_:
            found = pubmed(keyword)
            # `pubmed` returns None when a keyword has no hits; skip it instead of
            # failing on it.
            if found is not None:
                frames.append(found)
        # Concatenate once at the end rather than growing a frame inside the loop.
        return pd.concat(frames, axis=0) if frames else pd.DataFrame()

    raise ValueError(f"Unknown type_search: {type_search!r}")

In [None]:
# Search PubMed in seven chunks of keywords (three per chunk, the remainder in the last one).
df1, df2, df3, df4, df5, df6, df7 = (
    main(search_k[start:stop], 'keywords')
    for start, stop in ((0, 3), (3, 6), (6, 9), (9, 12), (12, 15), (15, 18), (18, None))
)

In [None]:
# Stack the seven partial result tables row-wise into a single DataFrame.
df = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis = 0)

In [None]:
# Save the collected papers to disk (without the index) so the filtering section can reload them.
df.to_csv('df_papers.csv', index = False)

### Filtering

In [None]:
# Reload the papers saved by the PubMed search section.
df = pd.read_csv('df_papers.csv')

In [None]:
# Drop duplicate rows: keep the first occurrence of every PMID.

df = df.drop_duplicates(subset=['PMID'], keep='first')

In [None]:
# Keep only papers whose Language field is 'eng'.

df = df[df.Language == 'eng']

In [None]:
# Renumber the rows 0..n-1 after filtering; later cells index the title/abstract arrays by position.
df.reset_index(drop=True, inplace=True)

In [None]:
def clean_text(text):
    """
    Strip HTML tags and simple Markdown markup from `text`.

    Parameters
    ----------
    text : str
        Text to clean. Missing values (None/NaN) yield an empty string and any other
        non-string value is converted with `str` first.

    Returns
    -------
    str
        The text without HTML tags, Markdown links, emphasis markers ('*', '_')
        and inline code spans.
    """
    if not isinstance(text, str):
        # NaN is the only value that differs from itself.
        if text is None or text != text:
            return ''
        text = str(text)
    # Remove HTML tags. A tag must start with a letter (optionally after '/'), so
    # comparisons such as "<10 ... >100" are no longer mistaken for a tag and deleted.
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    # Remove Markdown links
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # Remove Markdown bold and italic markers (every '*' and '_')
    text = re.sub(r'\*|_', '', text)
    # Remove inline code
    text = re.sub(r'`.*?`', '', text)
    return text

# Clean the two text columns that are later sent to the model.
for column in ('Title', 'Abstract'):
    df[column] = df[column].apply(clean_text)

In [None]:
# Normalise titles: lower-case them and strip trailing periods.
df['Title'] = df['Title'].str.lower().str.rstrip('.')

## First Screening

In [None]:
# Number of papers (taken from the top of `df`) sent to the first screening.
quant_papers = 1000
# Intended number of papers per request.
# NOTE(review): the call to `send_batched_requests` below does not pass this value and
# relies on that function's own default of 25 — confirm they are meant to stay in sync.
batch_size = 25

In [None]:
# Example of the reply shape requested from the model: one Title/Decision/Reason entry per paper.
json_schema_1st = {'gpt_decision': [{'Title':'', 'Decision':'include', 'Reason': ''}, {'Title':'', 'Decision':'exclude', 'Reason': ''}]}

In [None]:
# Titles and abstracts as NumPy arrays, aligned by row position.
array_title = np.array(df.Title)
array_abstract = np.array(df.Abstract)

In [None]:
# Build one "Title: ...\n Abstract: ..." message per paper.
# Screen at most `quant_papers` papers, but never more than are available; indexing past
# the end of the arrays used to raise an IndexError when fewer papers were left.
n_to_screen = min(quant_papers, len(array_title))

messages = np.array([
    "Title: " + title + "\n Abstract: " + abstract
    for title, abstract in zip(array_title[:n_to_screen], array_abstract[:n_to_screen])
])

In [None]:
# Placeholder; replaced below by the list returned from `send_batched_requests`.
responses = []

In [None]:
# Instruction text shared by every screening request; the papers of each batch are
# appended to it in `send_batched_requests`.
# The example schema is serialised with json.dumps, as in the keyword prompt, so the model
# sees valid JSON (double quotes) rather than the Python repr of the dictionary.
prompt = (
    f"You are a researcher conducting a systematic review to answer the following research question: {research_question} "
    "You need to screen the papers' titles and abstracts and decide whether each paper should be included or excluded in the research using exclusion criteria. "
    "If any exclusion criteria are met, exclude the article. "
    f"Exclusion criteria: {'; '.join(exclusion_criteria_list)}. "
    "You also need to describe the reason for including or excluding the paper. "
    "Use JSON format with no additional description or context. "
    f"The data schema should be like this: {json.dumps(json_schema_1st)}."
)

In [None]:
def check_responses(batch, response_text):
    """
    Check that the model returned exactly one decision per paper in `batch`.

    Parameters
    ----------
    batch : sequence
        Messages that were sent to the model.
    response_text : str
        Raw JSON reply; its decisions are read from the 'gpt_decision' key.

    Returns
    -------
    bool
        True when the number of decisions equals the number of messages, False
        otherwise (a missing 'gpt_decision' key counts as zero decisions).
    """
    decisions = json.loads(response_text).get('gpt_decision') or []
    missing = len(batch) - len(decisions)
    if missing > 0:
        print('Missing ', missing, 'papers')
        return False
    if missing < 0:
        print('Received ', -missing, 'more decisions than papers')
        return False
    return True


def missing_responses(batch, response_text):
    """
    Find the papers of `batch` for which the model returned no decision.

    Parameters
    ----------
    batch : sequence of str
        Messages of the form "Title: <title>\\n Abstract: <abstract>".
    response_text : str
        Raw JSON reply; its decisions are read from the 'gpt_decision' key.

    Returns
    -------
    list of int
        Positions in `batch` whose title does not appear (exact match) among the
        'Title' values of the returned decisions.
    """
    decisions = json.loads(response_text).get('gpt_decision') or []
    # A set gives constant-time lookups; decisions without a 'Title' are ignored.
    answered_titles = {
        decision.get('Title') for decision in decisions if isinstance(decision, dict)
    }

    indxes = []
    for position, message in enumerate(batch):
        header = message.split('\n ')[0]
        # maxsplit=1 keeps titles intact that themselves contain "Title: ".
        title = header.split('Title: ', 1)[1]
        if title not in answered_titles:
            indxes.append(position)
            print('Missing the decision of: ', title)
    return indxes
    
    
    
def send_batched_requests(messages, prompt, batched_responses=None, model=gpt_model, batch_size=25, max_requeries=3):
    """
    Screen `messages` in batches and collect the model's raw JSON replies.

    Each batch of up to `batch_size` messages is joined with newlines, appended to
    `prompt` and sent through `requisition_chat`. A reply is stored only after
    `check_responses` was able to parse it. When the reply does not hold one decision
    per paper, the papers reported by `missing_responses` are sent again.

    Parameters
    ----------
    messages : sequence of str
        One message per paper.
    prompt : str
        Instruction text placed in front of every batch.
    batched_responses : list of str, optional
        List the replies are appended to; a new list is created when omitted.
    model : str, optional
        Model used for the requests and for token counting.
    batch_size : int, optional
        Maximum number of messages per request.
    max_requeries : int, optional
        How many nested rounds of re-querying missing papers are allowed.

    Returns
    -------
    list of str
        `batched_responses` with the new replies appended. Batches that fail are
        reported on stdout and skipped.
    """
    # A fresh list per call; a mutable default would be shared between calls.
    if batched_responses is None:
        batched_responses = []

    # The encoding depends only on the model, so look it up once.
    encoding = tiktoken.encoding_for_model(model)

    # Split messages into batches
    for start in range(0, len(messages), batch_size):
        batch = messages[start:start + batch_size]
        prompt_s = prompt + '\n'.join(batch)

        print(f"The input text contains {len(encoding.encode(prompt_s))} tokens.")

        try:
            # Send request to the API
            response = requisition_chat(model, prompt_s)

            if not response:
                # `requisition_chat` returns False when the API call failed.
                print(f"No response for the batch starting at message {start}; skipping it.")
            else:
                responses_text = response.choices[0].message.content
                print(f"The output text contains {len(encoding.encode(responses_text))} tokens.")

                # Validate before storing, so a reply that is not valid JSON never
                # reaches `batched_responses`.
                complete = check_responses(batch, responses_text)
                batched_responses.append(responses_text)

                if complete is False:
                    idx_partial_responses = missing_responses(batch, responses_text)
                    if idx_partial_responses and max_requeries > 0:
                        # Re-query the model for missing responses
                        time.sleep(1)  # Sleep to avoid rate limit
                        print('Re-query the model for missing responses')
                        missing_batch = [batch[j] for j in idx_partial_responses]
                        batched_responses = send_batched_requests(
                            missing_batch, prompt, batched_responses, model, batch_size, max_requeries - 1
                        )
                    elif idx_partial_responses:
                        print(f"Giving up on {len(idx_partial_responses)} papers without a decision.")
        except Exception as e:
            # Best effort: report the failure and continue with the next batch.
            print(f"Batch starting at message {start} failed and was skipped: {e!r}")

        # Rate limiting to avoid hitting API limits
        time.sleep(2)  # Adjust based on your rate limit

    return batched_responses

In [None]:
# Run the first screening over all messages; other arguments keep the function's defaults.
responses = send_batched_requests(messages, prompt, batched_responses= [])

In [None]:
# Decode every raw JSON reply into a Python dictionary.
l_responses = [json.loads(raw_reply) for raw_reply in responses]

In [None]:
# Gather the decision lists of all replies into one table with a fresh 0..n-1 index.
# The frames are concatenated once instead of growing a DataFrame inside a loop.
decision_frames = [pd.DataFrame(batch_reply.get('gpt_decision')) for batch_reply in l_responses]
df_decision = pd.concat(decision_frames) if decision_frames else pd.DataFrame()
df_decision.reset_index(inplace = True, drop=True)

In [None]:
# Keep the three expected columns and switch their names to lower case.
df_decision = (
    df_decision[['Title', 'Decision', 'Reason']]
    .rename(columns={'Title': 'title', 'Decision': 'decision', 'Reason': 'reason'})
)

In [None]:
# Show the rows whose title already appeared earlier in the table.
is_repeated_title = df_decision.title.duplicated()
is_repeated_title[is_repeated_title]

In [None]:
# Remove rows that are identical in every column, then renumber the rows.
# NOTE(review): this compares all columns, so a repeated title with a different decision
# or reason is kept — confirm that is intended.
df_decision.drop_duplicates(keep='first', inplace=True)
df_decision.reset_index(drop=True, inplace=True)

In [None]:
# Save the first-screening decisions; the second screening reloads this file.
df_decision.to_csv('1st_decision.csv', index = False)

In [None]:
# Display the decision table.
df_decision

In [None]:
# Spot check: look up the decision for one paper by a fragment of its title.
# regex=False matches the fragment literally, and na=False treats missing titles as
# non-matching so the mask stays boolean.
mask = df_decision['title'].str.contains('combining antibody tests and taking into acc', regex=False, na=False)
result = df_decision[mask]
result

In [None]:
# Show the titles of the rows selected by `papers_`.
# NOTE(review): `papers_` is not defined in any cell visible in this notebook; on a fresh
# kernel this cell fails unless it is defined elsewhere — TODO define or remove.
df.loc[papers_].Title

## Second screening

In [None]:
# Reload the first-screening decisions saved above.
df_decision = pd.read_csv('1st_decision.csv')

In [None]:
 # Limit displayed cell contents to 50 characters and allow 1000 characters per line.
 # NOTE(review): the previous comment said the width was set to None to show full titles,
 # but the value passed is 50 — confirm which one is intended.
pd.set_option('display.max_colwidth', 50) 
pd.set_option('display.width', 1000) 

In [None]:
# Show the titles of decision rows that have a missing value in any column.
null_mask = df_decision.isnull().any(axis=1)
df_decision[null_mask].title

In [None]:
# Keep only the papers the model decided to include.
# NOTE(review): the comparison is exact, so variants such as 'Include' would be dropped —
# confirm the model always returns lower-case 'include'.
included_papers = df_decision[df_decision.decision == 'include'].reset_index(drop=True)

In [None]:
# Use the same column name as `df` and strip trailing periods from the titles.
included_papers = included_papers.rename(columns={'title': 'Title'})
included_papers['Title'] = included_papers['Title'].str.rstrip('.')

In [None]:
# Lower-case the titles so they can match the lower-cased titles in `df`.
included_papers['Title'] = included_papers['Title'].str.lower()

In [None]:
# Attach PMID, abstract, country and year from `df` by exact title match; included papers
# without a matching title keep missing values (left join).
# NOTE(review): if a title occurs more than once in `df`, the merge yields one row per match.
included_papers = included_papers.merge(df[['PMID', 'Title', 'Abstract', 'Country', 'PubYear']], how='left', on='Title')

In [None]:
# Discard rows that still have a missing value in any column after the merge.
null_mask = included_papers.isna().any(axis=1)
included_papers = included_papers.loc[~null_mask]

In [None]:
# Convert PMID to integers.
included_papers.PMID = included_papers.PMID.astype(int)

In [None]:
# Display the papers retained for the second screening.
included_papers