In [13]:
import fitz  # PyMuPDF
import json
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

In [14]:
# Initialize Groq Chat Model
# SECURITY: the API key must never be committed inside the notebook. It is read
# from the GROQ_API_KEY environment variable instead; a missing variable raises
# KeyError here rather than failing later on the first request.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as leaked and revoked in the Groq console.
chat = ChatGroq(
    temperature=0,  # deterministic output: references must be copied verbatim
    model="llama3-70b-8192",
    api_key=os.environ["GROQ_API_KEY"],
)

In [15]:
# Prompt pair used to pull the reference list out of raw page text.
# The system message fixes the assistant's role; the human message carries the
# page text through the `{text}` placeholder, followed by the instruction.
system_prompt = (
    "You are a helpful assistant that extracts references from provided text "
    "and returns them in the exact format they appear."
)
human_prompt_template = (
    "\n"
    "{text}\n"
    "Extract the references from the text above and ensure they are in the "
    "same format as the original document. "
    "Do not include any introductory phrases or unrelated content. "
    "Only return the references.\n"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", human_prompt_template),
    ]
)

# Pipe the rendered prompt into the chat model; invoked with {"text": ...}.
chain = prompt | chat

In [16]:
# Function to extract text from a PDF page
def extract_text_from_page(page):
    """Return the plain-text content of a single page.

    Args:
        page: An object exposing ``get_text(mode)``; callers in this notebook
            pass the result of ``pdf_document.load_page(...)``.

    Returns:
        Whatever ``page.get_text("text")`` yields for that page.
    """
    return page.get_text("text")

In [17]:
# Function to filter and clean the extracted references
def clean_references(raw_references):
    """Strip each line and drop blank lines and known model preambles.

    Args:
        raw_references: Iterable of strings (one per output line of the model).

    Returns:
        A new list with surrounding whitespace removed from every entry and
        with any entry discarded whose stripped form is empty or exactly equal
        to one of the boilerplate phrases below. Order is preserved.
    """
    boilerplate = frozenset(
        (
            "Here are the extracted references in the same format as the original document:",
            "Here are the extracted references:",
            "",
        )
    )
    stripped_lines = (line.strip() for line in raw_references)
    return [line for line in stripped_lines if line not in boilerplate]

In [18]:
def find_duplicate_pages(pdf_document):
    """Map every repeated page to the first page carrying identical text.

    Args:
        pdf_document: Object exposing ``page_count`` and ``load_page(index)``.

    Returns:
        Dict ``{duplicate_index: first_index}`` (0-based). A page is a
        duplicate when its extracted text equals, exactly, the text of an
        earlier page; pages with unique text do not appear in the result.
    """
    first_seen = {}  # page text -> index of the first page with that text
    repeats = {}

    for index in range(pdf_document.page_count):
        content = extract_text_from_page(pdf_document.load_page(index))
        # setdefault records the index only for text not seen before and
        # otherwise hands back the index of the earlier page.
        original_index = first_seen.setdefault(content, index)
        if original_index != index:
            repeats[index] = original_index

    return repeats

In [21]:
# Load the mapping {pdf file name: [1-based page numbers to scan for references]}.
with open("pdf_pages_info.json", "r", encoding="utf-8") as json_file:
    pdf_pages_info = json.load(json_file)

# Create the output directory if it doesn't exist
output_dir = "out"
os.makedirs(output_dir, exist_ok=True)

for pdf_name, pages_input in pdf_pages_info.items():
    pdf_path = f"sample-pdfs2/{pdf_name}"

    # Open the PDF inside a context manager so the file handle is released
    # after every document, even if text extraction raises.
    with fitz.open(pdf_path) as pdf_document:
        total_pages = pdf_document.page_count
        print(f"The document '{pdf_name}' has {total_pages} pages.")

        duplicate_pages = find_duplicate_pages(pdf_document)

        # Input page numbers are 1-based; page indices used below are 0-based.
        pages = [int(x) - 1 for x in pages_input]
        page_texts = []
        for page_num in pages:
            # Redirect a repeated page to the first page with the same text.
            # NOTE(review): duplicates are defined by identical text, so this
            # redirect does not change what gets extracted.
            page_num = duplicate_pages.get(page_num, page_num)
            if 0 <= page_num < total_pages:
                page = pdf_document.load_page(page_num)
                page_texts.append(extract_text_from_page(page) + "\n")
            else:
                # Previously skipped silently; surface bad entries in the JSON.
                print(f"Skipping out-of-range page {page_num + 1} for '{pdf_name}'.")
        all_text = "".join(page_texts)

    if all_text.strip():
        # Use Groq API to extract references from the combined text
        references = chain.invoke({"text": all_text})
        raw_references = references.content.strip().split("\n")
    else:
        # Nothing to send: avoid a pointless API call on an empty prompt,
        # which could also return invented content.
        print(f"No text extracted for '{pdf_name}'; skipping the API call.")
        raw_references = []

    # Display the raw output for debugging purposes
    print(f"Raw extracted references for '{pdf_name}': {raw_references}")

    # Clean the references
    cleaned_references = clean_references(raw_references)

    # Prepare the result
    result = {
        "id": pdf_name,
        "references": cleaned_references
    }

    # One JSON file per PDF, named after the PDF without its extension.
    output_filename = f"{os.path.splitext(pdf_name)[0]}.json"
    output_path = os.path.join(output_dir, output_filename)
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(result, output_file, indent=4, ensure_ascii=False)

    print(f"Saved extracted references to '{output_path}'")

The document '2211.04388.pdf' has 84 pages.
Raw extracted references for '2211.04388.pdf': ['Here are the extracted references:', '', 'Altshuler, R., Grubert, H., & Newlon, T. S. (2000). Has U.S. Investment Abroad Become More Sensitive to Tax Rates? International Taxation and Multinational Activity (pp. 9–38). NBER.', '', 'Alvarez Martinez, M. T., Barrios, S., Bettendorf, L., d’Andria, D., Gesualdo, M., Loretz, S., Pontikakis, D., & Pycroft, J. (2016). A New Calibration for CORTAX: A computable general equilibrium model for simulating corporate tax reforms. JRC Working Papers on Taxation and Structural Reforms.', '', 'Arkolakis, C., Costinot, A., & Rodriguez-Clare, A. (2012). New Trade Models, Same Old Gains? American Economic Review, 102(1), 94–130.', '', 'Arkolakis, C., Ramondo, N., Rodríguez-Clare, A., & Yeaple, S. (2018). Innovation and production in the global economy. American Economic Review, 108(8), 2128–73.', '', 'Auerbach, A. J., Devereux, M. P., Keen, M., & Vella, J. (2017).