## OpenAI Research Helper

In [3]:
#!pip install pypdf

In [4]:
import os
import openai
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the first .env file that find_dotenv() locates.
# The result is bound to `_` because it is not needed afterwards.
_ = load_dotenv(find_dotenv())

# Model identifier passed as `model=` on every chat-completion request.
# NOTE(review): this is a "gemini/..." name used with the OpenAI client —
# presumably OPENAI_BASE_URL points at an OpenAI-compatible gateway; confirm.
MODEL_NAME = "gemini/gemini-2.0-flash-exp"

# Shared client; both the key and the endpoint come from the environment.
openai_client = openai.OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL"),
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [5]:
import re
from pypdf import PdfReader


def extract_text_from_pdf(filename):
    """
    Read a PDF and return the text of all of its pages as one string.

    Args:
        filename (str): The path to the PDF file.

    Returns:
        str: The text of every page joined with no separator. If opening the
        file or extracting text raises, the error is printed and an empty
        string is returned instead.
    """
    try:
        # Both opening the file and the per-page extraction stay inside the
        # try so that any failure falls through to the best-effort "" result.
        page_texts = [page.extract_text() for page in PdfReader(filename).pages]
        text = ''.join(page_texts)
    except Exception as err:
        print(f"An error occurred while reading the PDF: {err}")
        text = ""
    return text



def get_completion(prompt, system='', history=None):
    """
    Send one single-turn chat request to the model and record the exchange.

    Args:
        prompt (str): Full user message. If it contains a
            ``<question>...</question>`` section, only that inner text is
            recorded in the history; otherwise the whole prompt is recorded.
        system (str): Optional system message. It is left out of the request
            when empty.
        history (list[str] | None): Running log of ``"question: ..."`` and
            ``"answer: ..."`` entries. A list passed in is extended in place.
            When omitted, a fresh list is used for this call.

    Returns:
        tuple[str, str]: The model's reply text, and all history entries
        joined with ``"</n>"``.
    """
    # Create the list per call: a `history=[]` default is a single shared
    # object, so entries from unrelated calls would pile up in it.
    if history is None:
        history = []

    # Pull out just the question for the log; fall back to the whole prompt
    # when there is no <question> section (or the prompt is not text).
    try:
        match = re.search(r"<question>([\s\S]*?)<\/question>", prompt)
    except TypeError:
        match = None
    user_question = match.group(1) if match else prompt

    # Build the request. `history` is NOT sent to the model here; earlier
    # turns only reach it if the caller has embedded them in `prompt`.
    messages = [{"role": "system", "content": system}] if system else []
    messages.append({"role": "user", "content": prompt})

    response = openai_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.0,  # keep the output as deterministic as possible
    )

    response_text = response.choices[0].message.content

    # Record this exchange and render the whole log as one string.
    # NOTE(review): "</n>" looks like it may have been meant as "\n"; it is
    # kept unchanged because callers may depend on the current separator.
    history.extend([f"question: {user_question}", f"answer: {response_text}"])
    history_msg = "</n>".join(history)

    return response_text, history_msg


# NOTE(review): neither AES nor ARC4 is referenced in the visible code —
# confirm whether this import is still needed. It was indented directly after
# the function body, which made the cell fail with an IndentationError.
from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [6]:
# Topic label for this run.
# NOTE(review): not referenced anywhere in the visible code.
RESEARCH_TOPIC = "stroke prediction"

# Prompt template, filled with str.format using three fields:
# RESEARCH_PAPER (the extracted PDF text), HISTORY and QUESTION.
# The <question>...</question> wrapper is what get_completion's regex
# searches for when it records the question in its history log.
# NOTE(review): "academmic" in the first sentence is misspelled in the text
# that is sent to the model.
PROMPT = """
You will be acting as an academmic researcher. Your goal is to study and find insights from the research papers.

You should maintain a professional tone when writing your response.

Here is the research paper:
<document>
{RESEARCH_PAPER}
</document>

Please go through the paper carefully and fill out all of these components to the best of your ability based on the information provided. Let me know if any component is not applicable or cannot be found.

Here is the conversation history (between the user and you) prior to the question. It could be empty if there is no history.
<history>{HISTORY}</history>

Here is the user's question: 
<question>{QUESTION}</question>

Think about your answer first before you respond. Put your response in <response></response> tags.
"""
# NOTE(review): not referenced in the visible code — the processing loop
# passes a literal [] for the HISTORY field instead of this name.
HISTORY = []

# Extraction request that is substituted into PROMPT's {QUESTION} field.
# Each bullet names an output key and describes what to pull from the paper.
# This is a plain string on purpose: it has no placeholders, and an f-string
# would treat any literal brace added later as an expression.
QUESTION = """
I need you to carefully read through the attached research paper and extract the key details related to the following components:

Here is the list of components to extract from the research paper:

- "author_name": "Name(s) of the author(s)."
-"title": "Title of the research paper."
- "journal": "Name of the journal where the paper is published." 
- "data_sources": "Data sources used in the research (if any)."
- "sample": "Characteristics of the sample (if any)." ,
- "independent_variables": "Independent variable(s)." ,
- "dependent_variables": "Dependent variable(s)." ,
- "factors_affecting_topic": "Factors affecting the research topic, with relevant statistics if applicable." ,
- "data_balancing_techniques": "Data balancing techniques used (if mentioned)." ,
- "check1_data_balancing_techniques": "Check if the data balancing is applied to y_test? ",
- "check2_data_balancing_techniques": "Check if the data balancing is applied before or after train test split ",
- "missing_value_imputation": "Missing value imputation techniques (if any)." ,
- "outlier_handling": "Techniques used to handle outliers (if any)." ,
- "prediction_models": "Prediction models used in the study." ,
- "innovative_methods": "Innovative methods like machine learning replacing traditional techniques." ,
- "feature_importance_insights": "Insights from feature importance in modeling risk factors." ,
- "assumed_risk_factors": "Assumed risk factors before modeling." ,
- "explainability_implementation": "Implementation for explainable or interpretable machine learning." ,
- "research_objectives": "Research objectives." ,
- "research_design": "Research design and methodology." ,
- "key_findings": "Key research findings or outcomes." ,
- "future_implications": "Implications for future research." ,
- "performance_metrics": "Performance metrics of the prediction models." ,
- "research_gaps": "Identified research gaps." ,
- "related_works": "Related works mentioned in the paper." 

"""


In [7]:
from pathlib import Path

# Run the extraction prompt over every PDF in the current working directory
# and write each answer next to its source as "<name>.txt".
current_dir = Path.cwd()

# Sorted so the processing order is stable from run to run.
pdf_files = sorted(current_dir.glob("*.pdf"))

for pdf_file in pdf_files:
    research_paper = extract_text_from_pdf(pdf_file)

    # extract_text_from_pdf returns "" when reading fails. Sending an empty
    # document would spend an API call and save an answer about nothing.
    if not research_paper.strip():
        print(f"Skipping {pdf_file}: no text could be extracted")
        continue

    # Each paper is handled independently, so the HISTORY field is empty.
    completion, history_msg = get_completion(PROMPT.format(
        RESEARCH_PAPER=research_paper, HISTORY=[], QUESTION=QUESTION))

    # Same path as the PDF, with the suffix swapped to .txt.
    output_file = pdf_file.with_suffix(".txt")
    output_file.write_text(
        f"{pdf_file}\n==================================\n{completion}",
        encoding="utf-8",
    )