In [45]:
import os
import pdfplumber
from langchain_openai import ChatOpenAI
from openai import OpenAI

api_key = os.getenv("OPENAI_API_KEY")
# 从PDF文件中提取文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# 使用模型直接提取信息
def extract_info_with_prompting(text):
    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    
    prompt = f"""
    Extracts the captions of all images in the text, often contain "Fig", 
    which are often a paragraph. If there are only captions, then only the captions are extracted.
    Text:
    {text}
    """

    response = llm(prompt)
    return response

# 主函数
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 使用prompting提取信息
    extracted_info = extract_info_with_prompting(text)
    
    # 显示提取的信息
    print(extracted_info)

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'

main(pdf_path)

content='Here are the extracted captions from the text:\n\n1. **Fig 1.** Weekly confirmed case data for Lassa fever in Nigeria between the weeks ending 7th January 2018 until 12th July 2020.  \n   [Link to figure](https://doi.org/10.1371/journal.pntd.0011543.g001)\n\n2. **Fig 2.** Model flow chart of the transmission and population dynamics of the system of Eq 3. Blue solid arrows denote recruitment. Black solid arrows denote progression of the disease. Red dashed arrows denote disease transmission. Purple solid arrows denote mortalities. Parameters are detailed in full in Table 2 where λ and λ are defined in Eq 2 (i) and (ii) respectively, and B(t) is defined in Eq 1.  \n   [Link to figure](https://doi.org/10.1371/journal.pntd.0011543.g002)\n\n3. **Fig 3.** The epidemiological model captured 3 consecutive Lf epidemics in Nigeria. The simulated cases compared with the observed data. In orange is the 90% range of values I takes in the final generation at each time point; the median valu

In [27]:
import os
import pdfplumber
import fitz  # PyMuPDF
import base64
from langchain_openai import ChatOpenAI

api_key = os.getenv("OPENAI_API_KEY")

# 提取PDF中的图片并将其编码为Base64
def extract_images_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    image_info = []
    
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        images = page.get_images(full=True)
        
        for img_index, img in enumerate(images):
            xref = img[0]  # 图片的引用
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # 使用base64编码
            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
            
            # 记录图片信息
            image_info.append({
                "page_num": page_num + 1,
                "image_base64": image_base64,
                "image_extension": base_image["ext"]
            })
    
    return image_info

# 提取PDF文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# 使用模型提取图片说明
def extract_info_with_prompting(text):
    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    
    prompt = f"""
    Extracts the captions of all images in the text, often contain "Fig", 
    which are often a paragraph. If there are only captions, then only the captions are extracted.
    {text}
    """
    
    # 使用 generate 方法生成响应
    response = llm.generate([prompt])

    # GPT的响应返回的是一个包含多个段落的说明
    return response.generations[0][0].text.split("\n\n")  # 每个段落作为一个说明

# 将图像和说明保存在变量中
def get_images_and_captions(images_info, extracted_info):
    data = []
    for idx, image in enumerate(images_info):
        # 如果说明的数量少于图片数量，给出默认说明
        caption = extracted_info[idx] if idx < len(extracted_info) else "No caption found"
        
        # 构建保存的数据
        data.append({
            "image_base64": image["image_base64"],
            "image_extension": image["image_extension"],
            "caption": caption
        })
    
    return data

# 主函数：提取文本和图片并进行匹配
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 提取PDF中的图片
    images_info = extract_images_from_pdf(pdf_path)
    
    # 使用prompting提取图片说明
    extracted_info = extract_info_with_prompting(text)
    
    # 将图片和说明保存在变量中
    images_and_captions = get_images_and_captions(images_info, extracted_info)
    
    return images_and_captions

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'
images_and_captions = main(pdf_path)

# 可以打印出变量或者在之后使用



In [63]:
# 输出 images_and_captions 中第一个图片的信息
first_image_info = images_and_captions[1]

# 打印第一个图片的Base64编码（仅显示前100个字符）
print(f"Image Base64 (truncated): {first_image_info['image_base64'][:100]}...")

# 打印第一个图片的扩展名
print(f"Image Extension: {first_image_info['image_extension']}")

# 打印第一个图片的描述
print(f"Caption: {first_image_info['caption']}")


Image Base64 (truncated): /9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMU...
Image Extension: jpeg
Caption: 1. **Fig 1.** Weekly confirmed case data for Lassa fever in Nigeria between the weeks ending 7th January 2018 until 12th July 2020.  
   [https://doi.org/10.1371/journal.pntd.0011543.g001]


In [64]:
import requests
import time

# OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

# OpenAI API headers
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# 构建消息payload，准备批次中的每张图片和描述
def construct_payload(images_and_captions):
    messages = []
    for item in images_and_captions:
        base64_image = item["image_base64"]
        image_extension = item["image_extension"]
        caption = item["caption"]
        
        # Append the image (in Base64) and the description to the messages
        message = {
            "role": "user",
            "content": f"Here is a base64 encoded image ({image_extension}). Please describe in detail the epidemiological parameters of the figure in relation to the caption of the image: data:image/{image_extension};base64,{base64_image}. The provided caption is: {caption}"
        }
        messages.append(message)
    
    return messages

# 分批发送图片和描述
def send_images_in_batches(images_and_captions, batch_size=1, wait_time = 10):
    for i in range(0, len(images_and_captions), batch_size):
        # 提取当前批次的图片
        batch = images_and_captions[i:i + batch_size]
        
        # 构建 payload
        messages = construct_payload(batch)
        
        payload = {
            "model": "gpt-4o",
            "messages": messages,
            "max_tokens": 300
        }

        # 发送请求
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        
        # 打印或处理返回值
        print(f"Batch {i//batch_size + 1} Response:", response.json())
        
         # 等待指定时间
        time.sleep(wait_time)

# 调用批量发送函数
send_images_in_batches(images_and_captions, batch_size=1)  


Batch 1 Response: {'id': 'chatcmpl-A5EUBTpVIpiHWw6MKUYWAHiBkJHtV', 'object': 'chat.completion', 'created': 1725810687, 'model': 'gpt-4o-2024-05-13', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "I'm unable to view or analyze images directly from raw Base64 text. However, I can help you understand epidemiological parameters and how they are often represented in figures.\n\nEpidemiological parameters can include a variety of metrics and concepts such as:\n\n1. **Incidence Rate**: The number of new cases of a disease in a specific population during a certain time period.\n2. **Prevalence**: The total number of cases, both new and pre-existing, of a disease in a specific population at a particular time.\n3. **Mortality Rate**: The number of deaths due to a specific disease in a given population, usually expressed per 1000 or 100,000 people.\n4. **Case Fatality Rate**: The proportion of individuals diagnosed with a disease who die from that disease over a certain per

In [67]:
import requests
import time

# OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

# OpenAI API headers
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# 构建消息payload，准备批次中的每张图片和描述
def construct_payload(images_and_captions):
    messages = []
    for item in images_and_captions:
        base64_image = item["image_base64"]
        image_extension = item["image_extension"]
        caption = item["caption"]
        
        # Append the image (in Base64) and the description to the messages
        message = {
            "role": "user",
            "content": [
        {
          "type": "text",
          "text": f"Please describe in detail the epidemiological parameters of the figure in relation to the caption of the image, explain in detail how the value of each parameter changes in the image, don't just explain the meaning of the parameter, the provided caption is: {caption}"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": f"data:image/jpeg;base64,{base64_image}"
          }
        }
      ]
        }
        messages.append(message)
    
    return messages

# 分批发送图片和描述
def send_images_in_batches(images_and_captions, batch_size=1, wait_time = 10):
    for i in range(0, len(images_and_captions), batch_size):
        # 提取当前批次的图片
        batch = images_and_captions[i:i + batch_size]
        
        # 构建 payload
        messages = construct_payload(batch)
        
        payload = {
            "model": "gpt-4o-mini",
            "messages": messages,
            "max_tokens": 800
        }

        # 发送请求
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        
        # 打印或处理返回值
        print(f"Batch {i//batch_size + 1} Response:", response.json())
        
         # 等待指定时间（例如10秒）
        time.sleep(wait_time)

# 调用批量发送函数
send_images_in_batches(images_and_captions, batch_size=1)  

Batch 1 Response: {'id': 'chatcmpl-A5F6oGjf3Ev6ngiHv4hGOubIwNipj', 'object': 'chat.completion', 'created': 1725813082, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "I'm unable to view or interpret images directly. However, I can help explain various epidemiological parameters if you provide specific details from the figure and the caption. Typical epidemiological parameters might include:\n\n1. **Incidence**: The rate of new cases of a disease in a population over a specific time period. Changes in incidence can indicate outbreaks or improvements in public health measures.\n\n2. **Prevalence**: The total number of cases, both new and existing, in a population at a given time. Increases or decreases in prevalence may reflect effective interventions or changes in population health.\n\n3. **Mortality Rate**: The number of deaths due to a disease in a population over a certain period. A rising mortality rate can signal worsening di

Batch 4 Response: {'id': 'chatcmpl-A5F7e7tsKQmMnU5mSVQ9RDy93Xe0k', 'object': 'chat.completion', 'created': 1725813134, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "To analyze the epidemiological parameters from the provided figure in relation to the caption about Lf (Lymphatic Filariasis) epidemics in Nigeria, we can break down the parameters given in the histograms along with their relevance to the model and observed data. Here’s a detailed explanation of each parameter.\n\n### 1. **s (Susceptible Population)**\n- **Description**: This parameter represents the susceptible individuals in the population who can contract Lf.\n- **Change in Value**: In the histogram, we see a left-skewed distribution with a gradually decreasing value, indicating that as time progresses, the number of susceptible individuals decreases due to either infection or recovery. The peak around the middle suggests a significant population that is still su

Batch 6 Response: {'id': 'chatcmpl-A5F8CiDALdm5Iku4yfxNW91qItmDQ', 'object': 'chat.completion', 'created': 1725813168, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': "The figure referenced in the caption depicts the dynamics of different compartments of the population of *Mastomys natalensis* rats over a specific time period, focusing on the relationships between susceptible, infected, and recovered individuals. Here’s a detailed breakdown of the epidemiological parameters indicated in the figure, along with their changes over time:\n\n### Parameters Explained\n\n1. **Susceptible (Sᵣ)**:\n   - **Representation**: Red line in the figure.\n   - **Description**: This compartment represents the proportion of rats that are susceptible to infection.\n   - **Trends**:\n     - Initially, the proportion of susceptible rats is high, indicating a significant number of individuals that can contract the infection.\n     - As the infection spr

In [None]:
在提取图片标题和说明时需要近一步设计，这里的方法仅适用于这篇文章
在提取图片后，需要引入深度学习的方法来判断图片是否包含有意义的数据，如何自动化？
标题和图片进行匹配时，如何保证匹配正确？
传输请求时图片过大，可以用网上托管并传输链接解决
图片本质代表的往往是参数的变化，抽取后如何进行结构化的储存？