In [6]:
#zero-shot extraction
import os
import pdfplumber
from langchain_openai import ChatOpenAI
from openai import OpenAI

api_key = os.getenv("OPENAI_API_KEY")
# 从PDF文件中提取文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# 使用模型直接提取信息
def extract_info_with_prompting(text):
    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    
    prompt = f"""
    Extract the following epidemiological data from the given text:
    - Reproduction number
    - Growth rate
    - Dispersion parameter
    - Pathogen Transmission
    - Zoonotic/primary infection
    - Secondary transmission
    - Case-fatality ratio (CFR)
    - Infection-fatality ratio (IFR)
    - Nosocomial infection
    - Incubation period
    - Generation time
    - Transmission chain
    - Seroprevalence
    - Susceptibles
    - Epidemic
    - Outbreak
    - Human birth rate(Bh)
    - Human natural mortality rate(μh)
    - Reciprocal of incubation period(ν)
    - Human recovery rate(γh)
    - Probability of an infectious human being asymptomatic(p)
    - Infection induced mortality in humans(μhI)
    - Recovery rate for rats(γr)
    - Natural mortality rate for rats(μr)
    - Rat-to-rat transmission rate(βrr)
    - Rat-to-human transmission rate(βrh)
    - Human-to-human transmission rate(βhh)
    - Time of minimum recruitment for rats(φ)
    - Shape parameter for recruitment function for rats(s)


    Text:
    {text}
    """

    response = llm(prompt)
    return response

# 主函数
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 使用prompting提取信息
    extracted_info = extract_info_with_prompting(text)
    
    # 显示提取的信息
    print(extracted_info)

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'

main(pdf_path)

content='Here is the extracted epidemiological data from the provided text:\n\n- **Reproduction number (R0)**: Not explicitly stated, but the basic reproduction number for rats is indicated to be well above 1, suggesting high infectivity.\n- **Growth rate**: 7.14 × 10^-5 per capita per day.\n- **Dispersion parameter**: Not explicitly mentioned in the text.\n- **Pathogen Transmission**: Primarily zoonotic transmission from the rodent Mastomys natalensis to humans.\n- **Zoonotic/primary infection**: Yes, Lassa fever is a zoonotic disease with Mastomys natalensis as the primary reservoir.\n- **Secondary transmission**: Human-to-human transmission occurs but is less common and primarily in healthcare settings.\n- **Case-fatality ratio (CFR)**: 19.5% (derived from the proportion of deaths during the outbreak).\n- **Infection-fatality ratio (IFR)**: Not explicitly stated in the text.\n- **Nosocomial infection**: Yes, human-to-human transmission predominantly occurs in healthcare settings.\n-

In [4]:
#extract with table
import os
import pdfplumber
from langchain_openai import ChatOpenAI
from openai import OpenAI

api_key = os.getenv("OPENAI_API_KEY")

def extract_text_and_tables(pdf_path):
    combined_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # 提取普通文本
            page_text = page.extract_text()
            if page_text:
                combined_text += page_text + "\n"
            
            # 提取表格内容并格式化
            tables = page.extract_tables()
            for table in tables:
                table_text = "\n".join(["\t".join([str(cell) if cell is not None else '' for cell in row]) for row in table if row])
                combined_text += table_text + "\n"
    
    return combined_text

# 使用模型直接提取信息
def extract_info_with_prompting(text):
    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    
    prompt = f"""
    Extract the following epidemiological data from the given text:
    - Reproduction number
    - Growth rate
    - Dispersion parameter
    - Pathogen Transmission
    - Zoonotic/primary infection
    - Secondary transmission
    - Case-fatality ratio (CFR)
    - Infection-fatality ratio (IFR)
    - Nosocomial infection
    - Incubation period
    - Generation time
    - Transmission chain
    - Seroprevalence
    - Susceptibles
    - Epidemic
    - Outbreak
    - Human birth rate(Bh)
    - Human natural mortality rate(μh)
    - Reciprocal of incubation period(ν)
    - Human recovery rate(γh)
    - Probability of an infectious human being asymptomatic(p)
    - Infection induced mortality in humans(μhI)
    - Recovery rate for rats(γr)
    - Natural mortality rate for rats(μr)
    - Rat-to-rat transmission rate(βrr)
    - Rat-to-human transmission rate(βrh)
    - Human-to-human transmission rate(βhh)
    - Time of minimum recruitment for rats(φ)
    - Shape parameter for recruitment function for rats(s)

    Text:
    {text}
    """

    response = llm(prompt)
    return response

# 主函数
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_and_tables(pdf_path)
    
    # 使用prompting提取信息
    extracted_info = extract_info_with_prompting(text)
    
    # 显示提取的信息
    print(extracted_info)

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'

main(pdf_path)

content="Based on the provided text, here is the extracted epidemiological data:\n\n- **Reproduction number (R0)**: Not explicitly stated, but the basic reproduction number for rats is indicated to be well above 1, suggesting high infectivity.\n- **Growth rate**: 7.14 × 10^-5 per capita per day (for Nigeria's population growth).\n- **Dispersion parameter**: Not explicitly mentioned in the text.\n- **Pathogen Transmission**: Predominantly zoonotic, with spillover events from the rodent reservoir (Mastomys natalensis) to humans.\n- **Zoonotic/primary infection**: Yes, Lassa fever is a zoonotic disease with Mastomys natalensis as the primary reservoir.\n- **Secondary transmission**: Occurs predominantly in healthcare settings through bodily fluids.\n- **Case-fatality ratio (CFR)**: 19.5% (calculated from the data provided).\n- **Infection-fatality ratio (IFR)**: Not explicitly stated in the text.\n- **Attack rate**: Not explicitly stated in the text.\n- **Symptom onset date**: Not explici

In [7]:
#zero-shot extraction with prompt engineering
import os
import pdfplumber
from langchain_openai import ChatOpenAI
from openai import OpenAI

api_key = os.getenv("OPENAI_API_KEY")
# 从PDF文件中提取文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# 使用模型直接提取信息
def extract_info_with_prompting(text):
    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    
    prompt = f"""
    I will ask you several questions regarding epidemiological data from the following text.
    Please extract the required data points. If a specific piece of data is not present in the text, simply respond with "Data not found". If you are unsure about the data, respond with "Not certain".
    
    For each piece of data, provide a confidence level (High, Medium, Low).

    Extract the following epidemiological data:
    - Reproduction number
    - Growth rate
    - Dispersion parameter
    - Pathogen Transmission
    - Zoonotic/primary infection
    - Secondary transmission
    - Case-fatality ratio (CFR)
    - Infection-fatality ratio (IFR)
    - Nosocomial infection
    - Incubation period
    - Generation time
    - Transmission chain
    - Seroprevalence
    - Susceptibles
    - Epidemic
    - Outbreak
    - Human birth rate(Bh)
    - Human natural mortality rate(μh)
    - Reciprocal of incubation period(ν)
    - Human recovery rate(γh)
    - Probability of an infectious human being asymptomatic(p)
    - Infection induced mortality in humans(μhI)
    - Recovery rate for rats(γr)
    - Natural mortality rate for rats(μr)
    - Rat-to-rat transmission rate(βrr)
    - Rat-to-human transmission rate(βrh)
    - Human-to-human transmission rate(βhh)
    - Time of minimum recruitment for rats(φ)
    - Shape parameter for recruitment function for rats(s)

    Text:
    {text}

    Format your response as:
    - [Data type]: [extracted data or "Data not found"]
    - Confidence level: [confidence level]
    """

    response = llm(prompt)
    return response

# 主函数
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 使用prompting提取信息
    extracted_info = extract_info_with_prompting(text)
    
    # 显示提取的信息
    print(extracted_info)

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'

main(pdf_path)

content='- Reproduction number: Data not found\n- Confidence level: Not certain\n\n- Growth rate: 7.14×10−5 per capita per day\n- Confidence level: High\n\n- Dispersion parameter: Data not found\n- Confidence level: Data not found\n\n- Pathogen Transmission: Spillover from the zoonotic reservoir\n- Confidence level: High\n\n- Zoonotic/primary infection: Yes, primarily from Mastomys natalensis\n- Confidence level: High\n\n- Secondary transmission: Yes, but predominantly in healthcare settings\n- Confidence level: Medium\n\n- Case-fatality ratio (CFR): 19.5%\n- Confidence level: High\n\n- Infection-fatality ratio (IFR): Data not found\n- Confidence level: Data not found\n\n- Nosocomial infection: Yes, occurs predominantly in healthcare settings\n- Confidence level: Medium\n\n- Incubation period: 14 days\n- Confidence level: High\n\n- Generation time: Data not found\n- Confidence level: Data not found\n\n- Transmission chain: From infected rats to humans\n- Confidence level: High\n\n- Ser

In [12]:
#Extract with Langchain 
import os
import pdfplumber
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.chains import create_extraction_chain

# 设置你的OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

if api_key is None:
    raise ValueError("Please set the OPENAI_API_KEY environment variable")

# 定义架构
schema = {
    "properties": {
        "Reproduction number": {"type": "string"},
        "Growth rate": {"type": "string"},
        "Dispersion parameter": {"type": "string"},
        "Pathogen Transmission": {"type": "string"},
        "Zoonotic/primary infection": {"type": "string"},
        "Secondary transmission": {"type": "string"},
        "Case-fatality ratio (CFR)": {"type": "string"},
        "Infection-fatality ratio (IFR)": {"type": "string"},
        "Nosocomial infection": {"type": "string"},
        "Incubation period": {"type": "string"},
        "Generation time": {"type": "string"},
        "Transmission chain": {"type": "string"},
        "Seroprevalence": {"type": "string"},
        "Susceptibles": {"type": "string"},
        "Epidemic": {"type": "string"},
        "Outbreak": {"type": "string"},
        "Human birth rate (Bh)": {"type": "string"},
        "Human natural mortality rate (μh)": {"type": "string"},
        "Reciprocal of incubation period (ν)": {"type": "string"},
        "Human recovery rate (γh)": {"type": "string"},
        "Probability of an infectious human being asymptomatic (p)": {"type": "string"},
        "Infection induced mortality in humans (μhI)": {"type": "string"},
        "Recovery rate for rats (γr)": {"type": "string"},
        "Natural mortality rate for rats (μr)": {"type": "string"},
        "Rat-to-rat transmission rate (βrr)": {"type": "string"},
        "Rat-to-human transmission rate (βrh)": {"type": "string"},
        "Human-to-human transmission rate (βhh)": {"type": "string"},
        "Time of minimum recruitment for rats (φ)": {"type": "string"},
        "Shape parameter for recruitment function for rats (s)": {"type": "string"}
    }
}

# 创建信息提取链
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
chain = create_extraction_chain(schema, llm)

# 从PDF文件中提取文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# 提取信息并创建数据框
def extract_info_from_text(text):
    response = chain.run(text)
    
    # 打印模型的响应
    print("Raw response from model:", response)    
    extracted_data = response[0]
    return pd.Series(extracted_data)

# 主函数
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 提取信息
    extracted_series = extract_info_from_text(text)
    
    # 创建数据框
    df = pd.DataFrame([extracted_series])
    
    # 显示数据框
    print(df)

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'
main(pdf_path)

Raw response from model: [{'Reproduction number': 'R0', 'Growth rate': '7.14×10−5 per capita per day', 'Human birth rate (Bh)': '1.2×10−4 day−1', 'Human natural mortality rate (μh)': '1 day−1', 'Reciprocal of incubation period (ν)': '0.1 day−1', 'Human recovery rate (γh)': '0.1 day−1', 'Probability of an infectious human being asymptomatic (p)': '0.8', 'Infection induced mortality in humans (μhI)': '0.0242 day−1', 'Recovery rate for rats (γr)': '1/90 day−1', 'Natural mortality rate for rats (μr)': '0.0038 day−1', 'Rat-to-rat transmission rate (βrr)': 'LogNormal(−1.03,1)', 'Rat-to-human transmission rate (βrh)': 'LogNormal(0.347,1.5)', 'Human-to-human transmission rate (βhh)': 'LogNormal(−2.35,0.5)', 'Time of minimum recruitment for rats (φ)': 'Uniform(0,1)', 'Shape parameter for recruitment function for rats (s)': 'LogNormal(3,1)'}]
  Reproduction number                   Growth rate Human birth rate (Bh)  \
0                  R0  7.14×10−5 per capita per day        1.2×10−4 day−1   



In [13]:
#Extract with Langchain and prompt engineering
import os
import pdfplumber
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.chains import create_extraction_chain

# 设置你的OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

if api_key is None:
    raise ValueError("Please set the OPENAI_API_KEY environment variable")

# 定义架构
schema = {
    "properties": {
        "Reproduction number": {"type": "string"},
        "Growth rate": {"type": "string"},
        "Dispersion parameter": {"type": "string"},
        "Pathogen Transmission": {"type": "string"},
        "Zoonotic/primary infection": {"type": "string"},
        "Secondary transmission": {"type": "string"},
        "Case-fatality ratio (CFR)": {"type": "string"},
        "Infection-fatality ratio (IFR)": {"type": "string"},
        "Nosocomial infection": {"type": "string"},
        "Incubation period": {"type": "string"},
        "Generation time": {"type": "string"},
        "Transmission chain": {"type": "string"},
        "Seroprevalence": {"type": "string"},
        "Susceptibles": {"type": "string"},
        "Epidemic": {"type": "string"},
        "Outbreak": {"type": "string"},
        "Human birth rate (Bh)": {"type": "string"},
        "Human natural mortality rate (μh)": {"type": "string"},
        "Reciprocal of incubation period (ν)": {"type": "string"},
        "Human recovery rate (γh)": {"type": "string"},
        "Probability of an infectious human being asymptomatic (p)": {"type": "string"},
        "Infection induced mortality in humans (μhI)": {"type": "string"},
        "Recovery rate for rats (γr)": {"type": "string"},
        "Natural mortality rate for rats (μr)": {"type": "string"},
        "Rat-to-rat transmission rate (βrr)": {"type": "string"},
        "Rat-to-human transmission rate (βrh)": {"type": "string"},
        "Human-to-human transmission rate (βhh)": {"type": "string"},
        "Time of minimum recruitment for rats (φ)": {"type": "string"},
        "Shape parameter for recruitment function for rats (s)": {"type": "string"}
    }
}

# 创建信息提取链
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
chain = create_extraction_chain(schema, llm)

# 从PDF文件中提取文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

# 提取信息并创建数据框
def extract_info_from_text(text):
    # 定义自定义提示词
    custom_prompt = (
        "Extract the following information from the text below:\n"
        "Reproduction number: {Reproduction number}\n"
        "Growth rate: {Growth rate}\n"
        "Dispersion parameter: {Dispersion parameter}\n"
        "Pathogen Transmission: {Pathogen Transmission}\n"
        "Zoonotic/primary infection: {Zoonotic/primary infection}\n"
        "Secondary transmission: {Secondary transmission}\n"
        "Case-fatality ratio (CFR): {Case-fatality ratio (CFR)}\n"
        "Infection-fatality ratio (IFR): {Infection-fatality ratio (IFR)}\n"
        "Nosocomial infection: {Nosocomial infection}\n"
        "Incubation period: {Incubation period}\n"
        "Generation time: {Generation time}\n"
        "Transmission chain: {Transmission chain}\n"
        "Seroprevalence: {Seroprevalence}\n"
        "Susceptibles: {Susceptibles}\n"
        "Epidemic: {Epidemic}\n"
        "Outbreak: {Outbreak}\n"
        "Human birth rate (Bh): {Human birth rate (Bh)}\n"
        "Human natural mortality rate (μh): {Human natural mortality rate (μh)}\n"
        "Reciprocal of incubation period (ν): {Reciprocal of incubation period (ν)}\n"
        "Human recovery rate (γh): {Human recovery rate (γh)}\n"
        "Probability of an infectious human being asymptomatic (p): {Probability of an infectious human being asymptomatic (p)}\n"
        "Infection induced mortality in humans (μhI): {Infection induced mortality in humans (μhI)}\n"
        "Recovery rate for rats (γr): {Recovery rate for rats (γr)}\n"
        "Natural mortality rate for rats (μr): {Natural mortality rate for rats (μr)}\n"
        "Rat-to-rat transmission rate (βrr): {Rat-to-rat transmission rate (βrr)}\n"
        "Rat-to-human transmission rate (βrh): {Rat-to-human transmission rate (βrh)}\n"
        "Human-to-human transmission rate (βhh): {Human-to-human transmission rate (βhh)}\n"
        "Time of minimum recruitment for rats (φ): {Time of minimum recruitment for rats (φ)}\n"
        "Shape parameter for recruitment function for rats (s): {Shape parameter for recruitment function for rats (s)}\n"
        "\nText:"+text
    )
    
    response = chain.run(custom_prompt)
    
    # 打印模型的响应
    print("Raw response from model:", response)    
    extracted_data = response[0]
    return pd.Series(extracted_data)


# 主函数
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 提取信息
    extracted_series = extract_info_from_text(text)
    
    # 创建数据框
    df = pd.DataFrame([extracted_series])
    
    # 显示数据框
    print(df)

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'
main(pdf_path)

Raw response from model: [{'Reproduction number': 'R0', 'Growth rate': '7.14×10−5 per capita per day', 'Dispersion parameter': 's', 'Pathogen Transmission': 'spillover from the zoonotic reservoir', 'Zoonotic/primary infection': 'Lassa fever (Lf) is a viral zoonotic disease', 'Secondary transmission': 'human-to-human transmission occurs predominantly in healthcare settings', 'Case-fatality ratio (CFR)': '19.5%', 'Infection-fatality ratio (IFR)': 'not specified', 'Nosocomial infection': 'occurs predominantly in healthcare settings', 'Incubation period': '14 days', 'Generation time': 'not specified', 'Transmission chain': 'from infected rats to humans', 'Seroprevalence': '58.2% of residents were seropositive', 'Susceptibles': 'susceptible humans and rats', 'Epidemic': 'annual outbreaks occur from December to March', 'Outbreak': 'epidemics since 2018 being notably more severe', 'Human birth rate (Bh)': '1.2×10−4 day−1', 'Human natural mortality rate (μh)': '1 day−1', 'Reciprocal of incubat

In [14]:
#Pydantic
import os
import pdfplumber
import pandas as pd
from typing import Optional
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, ValidationError

# 设置你的OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

if api_key is None:
    raise ValueError("Please set the OPENAI_API_KEY environment variable")

# Pydantic model for epidemiological data
class EpidemiologicalData(BaseModel):
    reproduction_number: Optional[str]
    growth_rate: Optional[str]
    dispersion_parameter: Optional[str]
    pathogen_transmission: Optional[str]
    zoonotic_primary_infection: Optional[str]
    secondary_transmission: Optional[str]
    case_fatality_ratio_cfr: Optional[str]
    infection_fatality_ratio_ifr: Optional[str]
    nosocomial_infection: Optional[str]
    incubation_period: Optional[str]
    generation_time: Optional[str]
    transmission_chain: Optional[str]
    seroprevalence: Optional[str]
    susceptibles: Optional[str]
    epidemic: Optional[str]
    outbreak: Optional[str]
    human_birth_ratebh: Optional[str]
    human_natural_mortality_ratemuh: Optional[str]
    reciprocal_of_incubation_periodnu: Optional[str]
    human_recovery_rategammah: Optional[str]
    probability_of_an_infectious_human_being_asymptomaticp: Optional[str]
    infection_induced_mortality_in_humansmuhi: Optional[str]
    recovery_rate_for_ratsgammar: Optional[str]
    natural_mortality_rate_for_ratsmur: Optional[str]
    rat_to_rat_transmission_ratebetarr: Optional[str]
    rat_to_human_transmission_ratebetarh: Optional[str]
    human_to_human_transmission_ratebetahh: Optional[str]
    time_of_minimum_recruitment_for_ratsphi: Optional[str]
    shape_parameter_for_recruitment_function_for_ratss: Optional[str]


# Set up a Pydantic parser and prompt template
parser = PydanticOutputParser(pydantic_object=EpidemiologicalData)
prompt = PromptTemplate(
    template="Extract the following epidemiological data from the given text:\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# 从PDF文件中提取文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if page.extract_text():
                text += page.extract_text() + "\n"
    return text

# 使用模型直接提取信息
def extract_info_with_prompting(text):
    model = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    _input = prompt.format(query=text)
    response = model(_input)
    
    # 调试输出
    print("Type of response:", type(response))
    print("Response from model:", response)
    
    try:
        response_text = response.content  # 直接访问AIMessage对象的内容
        parsed_response = parser.parse(response_text)
        return parsed_response.model_dump()
    except ValidationError as e:
        print("Validation error:", e)
        return {}
    except Exception as e:
        print("Error:", e)
        return {}

# 处理PDF文件并提取数据
def process_pdf(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 使用prompting提取信息
    extracted_info = extract_info_with_prompting(text)
    
    # 将提取的信息转换为DataFrame
    flat_data = {key: [value] for key, value in extracted_info.items()}
    df_epi = pd.DataFrame(flat_data)
    
    return df_epi

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'
intel_df = process_pdf(pdf_path)

# 显示提取的信息
print(intel_df.head())


Type of response: <class 'langchain_core.messages.ai.AIMessage'>
Response from model: content='```json\n{\n  "reproduction_number": "R0 > 1",\n  "growth_rate": "7.14×10−5 per capita per day",\n  "dispersion_parameter": null,\n  "pathogen_transmission": "spillover events from the zoonotic reservoir",\n  "zoonotic_primary_infection": "Mastomys natalensis",\n  "secondary_transmission": "low levels of human-to-human transmission",\n  "case_fatality_ratio_cfr": "19.5%",\n  "infection_fatality_ratio_ifr": null,\n  "nosocomial_infection": "predominantly in healthcare settings",\n  "incubation_period": "14 days",\n  "generation_time": null,\n  "transmission_chain": "rat-to-human transmission",\n  "seroprevalence": "58.2% of residents were seropositive",\n  "susceptibles": "susceptible human population",\n  "epidemic": "annual outbreaks from December to March",\n  "outbreak": "epidemics since 2018",\n  "human_birth_ratebh": "1.2×10−4 day−1",\n  "human_natural_mortality_ratemuh": "1 day−1",\n  "

In [16]:
#Pydantic with prompt engineering
import os
import pdfplumber
import pandas as pd
from typing import Optional
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, ValidationError

# 设置你的OpenAI API Key
api_key = os.getenv("OPENAI_API_KEY")

if api_key is None:
    raise ValueError("Please set the OPENAI_API_KEY environment variable")

# Pydantic model for epidemiological data
class EpidemiologicalData(BaseModel):
    reproduction_number: Optional[str]
    growth_rate: Optional[str]
    dispersion_parameter: Optional[str]
    pathogen_transmission: Optional[str]
    zoonotic_primary_infection: Optional[str]
    secondary_transmission: Optional[str]
    case_fatality_ratio_cfr: Optional[str]
    infection_fatality_ratio_ifr: Optional[str]
    nosocomial_infection: Optional[str]
    incubation_period: Optional[str]
    generation_time: Optional[str]
    transmission_chain: Optional[str]
    seroprevalence: Optional[str]
    susceptibles: Optional[str]
    epidemic: Optional[str]
    outbreak: Optional[str]
    human_birth_ratebh: Optional[str]
    human_natural_mortality_ratemuh: Optional[str]
    reciprocal_of_incubation_periodnu: Optional[str]
    human_recovery_rategammah: Optional[str]
    probability_of_an_infectious_human_being_asymptomaticp: Optional[str]
    infection_induced_mortality_in_humansmuhi: Optional[str]
    recovery_rate_for_ratsgammar: Optional[str]
    natural_mortality_rate_for_ratsmur: Optional[str]
    rat_to_rat_transmission_ratebetarr: Optional[str]
    rat_to_human_transmission_ratebetarh: Optional[str]
    human_to_human_transmission_ratebetahh: Optional[str]
    time_of_minimum_recruitment_for_ratsphi: Optional[str]
    shape_parameter_for_recruitment_function_for_ratss: Optional[str]

# Set up a Pydantic parser and prompt template
parser = PydanticOutputParser(pydantic_object=EpidemiologicalData)
prompt = PromptTemplate(
    template="""
    You are an expert in epidemiology and are tasked with extracting key epidemiological data from a given text.
    Please extract the relevant epidemiological parameters as specified by the following schema.
    If any data is not found or if you are uncertain, indicate with "Data not found" or "Not certain". 
    Provide each data point along with your confidence level. Make sure response for all of the parameters.

    Text:
    {query}

    Format your response as a JSON object matching the schema below:
    {format_instructions}
    """,
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# 从PDF文件中提取文本
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if page.extract_text():
                text += page.extract_text() + "\n"
    return text

#提取数据
def extract_info_with_prompting(text):
    model = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    _input = prompt.format(query=text)
    response = model(_input)
    
    # 调试输出
    print("Type of response:", type(response))
    print("Response from model:", response)
    
    try:
        response_text = response.content  # 直接访问AIMessage对象的内容
        parsed_response = parser.parse(response_text)
        return parsed_response.model_dump()
    except ValidationError as e:
        print("Validation error:", e)
        return {}
    except Exception as e:
        print("Error:", e)
        return {}
    
# 处理PDF文件并提取数据
def process_pdf(pdf_path):
    # 提取PDF文本
    text = extract_text_from_pdf(pdf_path)
    
    # 使用prompting提取信息
    extracted_info = extract_info_with_prompting(text)
    
    # 将提取的信息转换为DataFrame
    flat_data = {key: [value] for key, value in extracted_info.items()}
    df_epi = pd.DataFrame(flat_data)
    
    return df_epi

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'
intel_df = process_pdf(pdf_path)

# 显示提取的信息
print(intel_df)

Type of response: <class 'langchain_core.messages.ai.AIMessage'>
Response from model: content='```json\n{\n  "reproduction_number": "Not certain",\n  "growth_rate": "7.14×10−5 per capita per day",\n  "dispersion_parameter": "Not certain",\n  "pathogen_transmission": "Zoonotic spillover from Mastomys natalensis to humans",\n  "zoonotic_primary_infection": "80% of infections are suspected to be spillover events",\n  "secondary_transmission": "Not certain",\n  "case_fatality_ratio_cfr": "19.5%",\n  "infection_fatality_ratio_ifr": "Not certain",\n  "nosocomial_infection": "Transmission occurs predominantly in healthcare settings",\n  "incubation_period": "14 days",\n  "generation_time": "Not certain",\n  "transmission_chain": "Rodent to human",\n  "seroprevalence": "Not certain",\n  "susceptibles": "Not certain",\n  "epidemic": "Annual outbreaks from December to March",\n  "outbreak": "2018 epidemic lasted from January to April",\n  "human_birth_ratebh": "1.2×10−4 day−1",\n  "human_natural

In [18]:
#Extract from table
import os
import pdfplumber
from langchain_openai import ChatOpenAI
from openai import OpenAI

api_key = os.getenv("OPENAI_API_KEY")

def extract_text_and_tables(pdf_path):
    combined_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
           
            
            # 提取表格内容并格式化
            tables = page.extract_tables()
            for table in tables:
                table_text = "\n".join(["\t".join([str(cell) if cell is not None else '' for cell in row]) for row in table if row])
                combined_text += table_text + "\n"
    
    return combined_text

# 使用模型直接提取信息
def extract_info_with_prompting(text):
    llm = ChatOpenAI(temperature=0, model="gpt-4o-mini", openai_api_key=api_key)
    
    prompt = f"""
    Extract the following epidemiological data from the given text:
    - Reproduction number
    - Growth rate
    - Dispersion parameter
    - Pathogen Transmission
    - Zoonotic/primary infection
    - Secondary transmission
    - Case-fatality ratio (CFR)
    - Infection-fatality ratio (IFR)
    - Nosocomial infection
    - Incubation period
    - Generation time
    - Transmission chain
    - Seroprevalence
    - Susceptibles
    - Epidemic
    - Outbreak
    - Human birth rate(Bh)
    - Human natural mortality rate(μh)
    - Reciprocal of incubation period(ν)
    - Human recovery rate(γh)
    - Probability of an infectious human being asymptomatic(p)
    - Infection induced mortality in humans(μhI)
    - Recovery rate for rats(γr)
    - Natural mortality rate for rats(μr)
    - Rat-to-rat transmission rate(βrr)
    - Rat-to-human transmission rate(βrh)
    - Human-to-human transmission rate(βhh)
    - Time of minimum recruitment for rats(φ)
    - Shape parameter for recruitment function for rats(s)
    Text:
    {text}
    """

    response = llm(prompt)
    return response

# 主函数
def main(pdf_path):
    # 提取PDF文本
    text = extract_text_and_tables(pdf_path)
    
    # 使用prompting提取信息
    extracted_info = extract_info_with_prompting(text)
    
    # 显示提取的信息
    print(extracted_info)

# 调用主函数，传入PDF文件路径
pdf_path = 'journal.pntd.0011543.pdf'

main(pdf_path)

content='Based on the provided text, here is the extracted epidemiological data:\n\n- **Reproduction number**: 2\n- **Growth rate**: 0.1 day⁻¹\n- **Dispersion parameter**: Not explicitly provided\n- **Pathogen Transmission**: Not explicitly provided\n- **Zoonotic/primary infection**: Not explicitly provided\n- **Secondary transmission**: Not explicitly provided\n- **Case-fatality ratio (CFR)**: Not explicitly provided\n- **Infection-fatality ratio (IFR)**: Not explicitly provided\n- **Nosocomial infection**: Not explicitly provided\n- **Incubation period**: 90 days\n- **Generation time**: 54 × 365 days\n- **Transmission chain**: Not explicitly provided\n- **Seroprevalence**: Not explicitly provided\n- **Susceptibles**: Not explicitly provided\n- **Epidemic**: Not explicitly provided\n- **Outbreak**: Not explicitly provided\n- **Human birth rate (Bh)**: Not explicitly provided\n- **Human natural mortality rate (μh)**: Not explicitly provided\n- **Reciprocal of incubation period (ν)**: 1