In [1]:
import os
import json
from typing import List
import glob
from tqdm import tqdm

import pandas as pd
from IPython.display import display, HTML
from dotenv import load_dotenv
import json

import pymupdf4llm # pymupdf4llm must be added to the requirements
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.runnables.base import RunnableSequence

from constants import system_prompts

load_dotenv()

True

### Create the chains for extraction

In [2]:
# Shared LLM client and JSON output parser reused by every extraction chain
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
output_parser = JsonOutputParser()

# Convert one sample decision PDF to Markdown.
# NOTE(review): the extraction function defined further down converts its own
# `doc_path`, so this module-level `md_text` looks like it is only kept for
# interactive inspection — confirm before removing.
doc_path = "../data/2025/Decision/OROURKE_06-05-25_7891__KING_AND_GEORGE_LLC (DECISION).pdf"
md_text = pymupdf4llm.to_markdown(doc_path)


def _build_extraction_prompt(system_prompt: str) -> ChatPromptTemplate:
    """Return a chat prompt made of `system_prompt` plus a `{user_input}` user message."""
    return ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("user", "{user_input}"),
        ]
    )


# Chat prompt templates: one per extraction task, differing only in the system prompt
fact_pattern_exctraction_prompt = _build_extraction_prompt(
    system_prompts.FACT_PATTERN_EXTRACTION_PROMPT
)
applicable_rules_of_evidence_prompt = _build_extraction_prompt(
    system_prompts.APPICLABLE_RULES_OF_EVIDENCE_EXTRACTION_PROMPT
)
substantive_rules_exctraction_prompt = _build_extraction_prompt(
    system_prompts.SUBSTANTIVE_RULES_EXTRACTION_PROMPT
)
substantive_rules_with_pdf_exctraction_prompt = _build_extraction_prompt(
    system_prompts.SUBSTANTIVE_RULES_WITH_PDF_EXTRACTION_PROMPT
)


# Chains: prompt -> model -> JSON parser
fact_pattern_exctraction_chain = (
    fact_pattern_exctraction_prompt | model | output_parser
)
applicable_rules_of_evidence_exctraction_chain = (
    applicable_rules_of_evidence_prompt | model | output_parser
)
susbstantive_rules_exctraction_chain = (
    substantive_rules_exctraction_prompt | model | output_parser
)
substantive_rules_with_pdf_exctraction_chain = (
    substantive_rules_with_pdf_exctraction_prompt | model | output_parser
)

### Fact Pattern and Labels Extraction

In [3]:
def extract_and_save_as_excel_fact_pattern_and_labels(
        doc_path: str,
        output_path: str,
        fact_pattern_exctraction_chain: RunnableSequence,
        applicable_rules_of_evidence_exctraction_chain: RunnableSequence,
        substantive_rules_with_pdf_exctraction_chain: RunnableSequence
    ) -> dict:
    """
    Extracts fact patterns, rules of evidence, and substantive legal rules from a PDF
    containing a contract dispute decision/order, writes the results to an Excel file,
    and returns the extracted content.

    The input PDF is converted to Markdown and processed by three chains, in order:
      1. Fact patterns, extracted from the Markdown text.
      2. Applicable rules of evidence, extracted from the fact patterns.
      3. Substantive rules, extracted from both the Markdown text (``pdf_content``)
         and the fact patterns (``user_input``).

    Each result is saved in its own sheet ('Fact Pattern', 'Rules of Evidence',
    'Substantive Rules') of a single Excel file named after the original PDF
    (``<pdf stem>.xlsx``) inside ``output_path``. The output directory is created
    if it does not exist yet.

    Args:
        doc_path (str): Path to the input PDF document.
        output_path (str): Directory where the resulting Excel file should be saved.
        fact_pattern_exctraction_chain (RunnableSequence): Chain used to extract fact patterns.
        applicable_rules_of_evidence_exctraction_chain (RunnableSequence): Chain used to extract applicable rules of evidence.
        substantive_rules_with_pdf_exctraction_chain (RunnableSequence): Chain used to extract substantive legal rules.

    Returns:
        dict: A dictionary containing the raw chain responses:
            - 'fact_pattern' (list): Extracted fact pattern strings.
            - 'applicable_rules' (dict): Extracted applicable rules of evidence.
            - 'substantive_rules' (dict): Extracted substantive legal rules.
          (Types as expected from the prompts; the actual shape depends on what the
          model returns — TODO confirm against the system prompts.)
    """
    # Make sure the destination exists *before* spending three LLM calls, so a bad
    # output location fails fast instead of after the extraction has been paid for.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    md_text = pymupdf4llm.to_markdown(doc_path)

    # 1) Fact pattern, extracted from the full document text
    fact_pattern_exctraction_response = fact_pattern_exctraction_chain.invoke(
        {"user_input": md_text}
    )
    fact_pattern_df = pd.DataFrame(
        {
            "id": list(range(len(fact_pattern_exctraction_response))),
            "Fact": fact_pattern_exctraction_response
        }
    )

    # 2) Rules of evidence, derived from the extracted facts only
    applicable_rules_of_evidence_exctraction_response = applicable_rules_of_evidence_exctraction_chain.invoke(
        {"user_input": fact_pattern_exctraction_response}
    )
    applicable_rules_of_evidence = pd.DataFrame(applicable_rules_of_evidence_exctraction_response)

    # 3) Substantive rules, derived from both the document text and the facts
    substantive_rules_with_pdf_exctraction_response = substantive_rules_with_pdf_exctraction_chain.invoke(
        {
            "pdf_content": md_text,
            "user_input": fact_pattern_exctraction_response
        }
    )
    substantive_rules = pd.DataFrame(substantive_rules_with_pdf_exctraction_response)

    # os.path.join avoids the doubled separator produced by "<dir>/" + "/" + name
    # and does not silently target the filesystem root when output_path is empty.
    doc_stem = os.path.splitext(os.path.basename(doc_path))[0]
    excel_path = os.path.join(output_path, f"{doc_stem}.xlsx")
    with pd.ExcelWriter(excel_path) as writer:
        fact_pattern_df.to_excel(writer, sheet_name='Fact Pattern', index=False)
        applicable_rules_of_evidence.to_excel(writer, sheet_name='Rules of Evidence', index=False)
        substantive_rules.to_excel(writer, sheet_name='Substantive Rules', index=False)

    return {
        "fact_pattern": fact_pattern_exctraction_response,
        "applicable_rules": applicable_rules_of_evidence_exctraction_response,
        "substantive_rules": substantive_rules_with_pdf_exctraction_response
    }


def save_extraction_as_json(results:dict, doc_path: str, output_path: str) -> None:
    """
    Write the extraction results to a UTF-8 JSON file named after the source PDF.

    The file is saved as ``<pdf stem>.json`` inside ``output_path``; the directory
    is created if it does not exist yet. Non-ASCII characters are written as-is
    (``ensure_ascii=False``) and the JSON is indented with 4 spaces.

    Args:
        results (dict): JSON-serialisable extraction results.
        doc_path (str): Path of the source PDF; only its base name (without
            extension) is used to name the output file.
        output_path (str): Directory where the JSON file should be saved.
    """
    # Create the destination on demand so the function does not depend on the
    # caller having prepared the folder beforehand.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    doc_stem = os.path.splitext(os.path.basename(doc_path))[0]
    # os.path.join avoids a doubled separator when output_path ends with "/"
    json_path = os.path.join(output_path, f"{doc_stem}.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

### Extract Fact Pattern and Labels

In [None]:
# Input folder with the PDFs and output folders for the Excel/JSON results
year = 2025
input_pdf_folder = f"../data/{year}/Decision/"
excel_output_path = f"../data/extracted_excel/{year}/Decision/"
os.makedirs(excel_output_path, exist_ok=True)
json_output_path = f"../data/extracted_json/{year}/Decision/"
os.makedirs(json_output_path, exist_ok=True)

# glob returns paths in arbitrary order; sort so every run processes the
# documents in the same sequence and logs are comparable between runs.
pdf_files_list = sorted(glob.glob(os.path.join(input_pdf_folder, "*.pdf")))

# (path, error message) for every PDF that could not be processed, so failures
# can be inspected or retried after the loop instead of being lost in the log.
failed_docs = []

# Loop through all PDF files in the input folder
for doc_path in tqdm(pdf_files_list, desc="Processing PDFs"):
    try:
        print(f"Processing: {doc_path}")

        facts_and_rules = extract_and_save_as_excel_fact_pattern_and_labels(
            doc_path,
            excel_output_path,
            fact_pattern_exctraction_chain,
            applicable_rules_of_evidence_exctraction_chain,
            substantive_rules_with_pdf_exctraction_chain
        )

        save_extraction_as_json(facts_and_rules, doc_path, json_output_path)

    except Exception as e:
        # Best-effort batch: one bad document must not abort the whole run
        print(f"Error processing {doc_path}: {e}")
        failed_docs.append((doc_path, str(e)))

print(
    f"Processed {len(pdf_files_list) - len(failed_docs)} of {len(pdf_files_list)} PDFs; "
    f"{len(failed_docs)} failed."
)
