In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import openai

# Define your OpenAI API key
api_key = 'OpenAI_key'
openai.api_key = api_key

# Define the list of paper references
paper_references = [
    "Noetel, Michael, et al. Effect of exercise for depression: systematic review and network meta-analysis of randomised controlled trials. bmj 384 (2024).",
    "Brown, Jeremy P., et al. Quantifying possible bias in clinical and epidemiological studies with quantitative bias analysis: common approaches and limitations. bmj 385 (2024).",
    "Kauttonen J., Khan U.A., Aunimo L., Topic mining for thesis and job ads in ICT sector: can higher education institutes respond to job market demands, Frontiers in Education , Vol. 9, 2024",
    "Lupo, A., et al. Biomarqueurs prédictifs de l’immunothérapie anti-PD1/PD-L1 dans le cancer broncho-pulmonaire non à petites cellules. Revue de Pneumologie Clinique 74.5 (2018): 339-350."

]


# Function to scrape publication year, title, and abstract from Google Scholar
def scrape_paper_details(paper_reference):
    search_url = "https://scholar.google.com/scholar"
    params = {
        'q': paper_reference,
        'hl': 'en'
    }
    response = requests.get(search_url, params=params)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    try:
        # Extract the first result
        result = soup.find('div', class_='gs_ri')
        title = result.find('h3', class_='gs_rt').text
        abstract = result.find('div', class_='gs_rs').text
        publication_info = result.find('div', class_='gs_a').text
        publication_year = [int(s) for s in publication_info.split() if s.isdigit()][0]
    except Exception as e:
        print(f"Error extracting details for reference: {paper_reference} - {str(e)}")
        title = "n/a"
        abstract = "n/a"
        publication_year = "n/a"

    details_dict = {
        "reference": paper_reference,
        "title": title,
        "abstract": abstract,
        "publication_year": publication_year
    }
    return details_dict

# Function to query GPT-4 and retrieve information based on title and abstract
def analyze_paper_details(title, abstract):
    user_message = f"Analyze the following title and abstract of a paper, determine article type (whether it is a review or original article), \
    clinical study (whether it is a clinical/medical study. Answer in yes or no), and the written language (For example, English, French, etc.). \
    A review article summarizes and synthesizes existing research on a particular topic, providing a comprehensive overview of the current \
    understanding and highlighting gaps in knowledge. An original research article presents new findings from an original study conducted by \
    the authors, including the methodology, data analysis, and interpretation of results. A clinical study is related to any medical related\
    original research article or review article.\
    Possible values for these variables are as follows. article type:review article or original research article, clinical study: yes or no, \
    written language: English, German, French or any other language of the title and the abstract. Do not output anything else except these possible\
    values. If there is no valid title and/or abstract, output 'n/a' for all variables.\n \
    Title: {title}\nAbstract: {abstract}"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [
            {"role": "system", "content": "You are an assistant that helps retrieve information about academic papers."},
            {"role": "user", "content": user_message}
        ],
        "max_tokens": 2000,
        "temperature": 0.1
    }

    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    response.raise_for_status()  # Raise an exception for HTTP errors
    response_json = response.json()

    result = response_json['choices'][0]['message']['content']
    details = result.split('\n')
    # print(details)  # Print the details for debugging
    # print("\n\n")
    details_dict = {
        "article_type": "n/a",
        "clinical_study": "n/a",
        "written_language": "n/a"
    }

    for detail in details:
        if "article type" in detail.lower():
            details_dict["article_type"] = detail.split(':')[-1].strip()
        elif "clinical study" in detail.lower():
            details_dict["clinical_study"] = detail.split(':')[-1].strip().lower() in ["yes", "true"]
        elif "written language" in detail.lower():
            details_dict["written_language"] = detail.split(':')[-1].strip()

    return details_dict

# Function to print the output in tabular form and make selection decision
def print_results(paper_details_list):
    columns = ["Reference", "Title", "Abstract", "Publication Year", "Article Type", "Clinical Study", "Written Language", "Selected", "Reason"]
    data = []

    for details in paper_details_list:
        selected = True
        reason = []
        
        try:
            publication_year = int(details["publication_year"])
        except ValueError:
            publication_year = "n/a"

        if publication_year != "n/a" and publication_year <= 2017:
            selected = False
            reason.append("Published before 2018")
        
        if details["clinical_study"] != "n/a" and not details["clinical_study"]:
            selected = False
            reason.append("Not a clinical study")
        
        if details["written_language"] != "n/a" and details["written_language"].lower() != "english":
            selected = False
            reason.append("Not in English")
        
        if details["article_type"] != "n/a" and details["article_type"].lower() != "review article":
            selected = False
            reason.append("Not a review article")
        if details["article_type"] == "n/a" and details["written_language"] == "n/a" and details["clinical_study"] == "n/a":
            selected = False
            reason.append("No information retrieved")
        
        data.append([
            details["reference"],
            details["title"],
            details["abstract"],
            details["publication_year"],
            details["article_type"],
            "yes" if details["clinical_study"] == True else "no" if details["clinical_study"] == False else "n/a",
            details["written_language"],
            "Yes" if selected else "No",
            "-" if selected else ", ".join(reason)
        ])

    df = pd.DataFrame(data, columns=columns)
    return df

# Main processing
paper_details_list = []

for reference in paper_references:
    scraped_details = scrape_paper_details(reference)
    analyzed_details = analyze_paper_details(scraped_details['title'], scraped_details['abstract'])
    paper_details = {**scraped_details, **analyzed_details}
    paper_details_list.append(paper_details)

# Get the results as a dataframe
df = print_results(paper_details_list)

# Display the dataframe
pd.set_option('display.max_colwidth', 500)
df


Unnamed: 0,Reference,Title,Abstract,Publication Year,Article Type,Clinical Study,Written Language,Selected,Reason
0,"Noetel, Michael, et al. Effect of exercise for depression: systematic review and network meta-analysis of randomised controlled trials. bmj 384 (2024).",Effect of exercise for depression: systematic review and network meta-analysis of randomised controlled trials,"Objective To identify the optimal dose and modality of exercise for treating major depressive disorder, compared with psychotherapy, antidepressants, and control conditions. Design Systematic review and network meta-analysis. Methods Screening, data extraction, coding, and risk of bias assessment were performed independently and in duplicate. Bayesian arm based, multilevel network meta-analyses were performed for the primary analyses. Quality of the evidence for each arm was graded using the...",2024,review article,yes,English,Yes,-
1,"Brown, Jeremy P., et al. Quantifying possible bias in clinical and epidemiological studies with quantitative bias analysis: common approaches and limitations. bmj 385 (2024).",Quantifying possible bias in clinical and epidemiological studies with quantitative bias analysis: common approaches and limitations,"Bias in epidemiological studies can adversely affect the validity of study findings. Sensitivity analyses, known as quantitative bias analyses, are available to quantify potential residual bias arising from measurement error, confounding, and selection into the study. Effective application of these methods benefits from the input of multiple parties including clinicians, epidemiologists, and statisticians. This article provides an overview of a few common methods to facilitate both the use o...",2024,review article,yes,English,Yes,-
2,"Kauttonen J., Khan U.A., Aunimo L., Topic mining for thesis and job ads in ICT sector: can higher education institutes respond to job market demands, Frontiers in Education , Vol. 9, 2024",[HTML][HTML] Topic mining for theses and job ads in ICT sector: can higher education institutes respond to job market demands?,"Introduction This study aims to tackle the challenge of ensuring higher education students are equipped with high-demand skills for today's job market. The focus is on aligning the knowledge acquired during their studies, as represented by final-year thesis projects, with the skills and topics specified in actual job advertisements. Methods We developed a computational framework that uses automated subject indexing to extract representative skills and topics from two major datasets: thesis a...",2024,original research article,no,English,No,"Not a clinical study, Not a review article"
3,"Lupo, A., et al. Biomarqueurs prédictifs de l’immunothérapie anti-PD1/PD-L1 dans le cancer broncho-pulmonaire non à petites cellules. Revue de Pneumologie Clinique 74.5 (2018): 339-350.",Biomarkers predictive of PD1/PD-L1 immunotherapy in non-small cell lung cancer,"Immune checkpoint inhibitors (ICI), targeting the PD1/PD-L1 axis has shown their efficacy in lung cancer but only in a restricted population of patients, thus it is mandatory to identify biomarkers predicting the clinical benefit. In this article we will describe and analyzed biomarkers already published, from protein, to RNA and at last DNA markers, discussing each markers feasibility and interest. In the future, combined analysis of several markers will probably be proposed, particularly w...",2018,review article,yes,English,Yes,-
