## Multi-agent Test

LLM-based Text Extractor
- Fetches data and stores it in a graph database (`neo4j`)



Since 2025

v. 0.2.2

## Code

In [2]:
import glob
import os
from typing import List, Optional

from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from neo4j import GraphDatabase
from pydantic import BaseModel
import json

# Load environment variables
# load_dotenv() runs first so the os.environ lookups below can see whatever it
# loads. Each lookup uses os.environ[...] rather than .get(), so a missing
# variable raises KeyError at import time instead of failing later.
load_dotenv()
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible code;
# presumably ChatOpenAI picks the key up from the environment itself and this
# line only acts as an early presence check -- confirm.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Base directory containing the CLARK library
# Relative path; it is globbed as <base>/*/*/README.pdf by
# load_readme_files_into_neo4j.
CLARK_BASE_DIR = "rag/clark_doc"

# Pydantic model for course data
class CourseData(BaseModel):
    """Structured record for one course extracted from a README.pdf.

    Only ``course_name`` and ``collection_name`` are required; every other
    field defaults to ``None`` when it is not supplied.
    """

    # Required. Used as the `name` of the Course node in Neo4j.
    course_name: str
    # Required. Used as the `name` of the Collection node the course belongs to.
    collection_name: str
    # Free-form string; no date format is enforced here.
    updated_time: Optional[str] = None
    # Each entry becomes a Contributor node linked to the course.
    contributors: Optional[List[str]] = None
    # Each entry becomes an AcademicLevel node linked to the course.
    academic_levels: Optional[List[str]] = None
    # Single topic; becomes a Topic node linked to the course.
    topic: Optional[str] = None
    # Stored as a plain string; not validated as a URL.
    url_link: Optional[str] = None
    description: Optional[str] = None
    # Each entry becomes an Outcome node linked to the course.
    outcomes: Optional[List[str]] = None
    # Each entry becomes an Alignment node linked to the course.
    alignment: Optional[List[str]] = None

# Neo4j driver setup
class Neo4jDriver:
    """Thin wrapper around a Neo4j driver that persists course graphs.

    The instance owns the underlying driver and must be released with
    ``close()``; it can also be used as a context manager, which closes the
    driver on exit.
    """

    # Upserts the course node, its scalar properties and its collection link.
    _COURSE_QUERY = """
        MERGE (c:Course {name: $course_name})
        SET c.updated_time = $updated_time,
            c.url_link = $url_link,
            c.description = $description
        WITH c
        MERGE (col:Collection {name: $collection_name})
        MERGE (c)-[:BELONGS_TO]->(col)
    """

    # (CourseData attribute, batched Cypher) pairs, in write order. Each query
    # receives $course_name and $values and creates one related node plus one
    # relationship per entry of $values.
    _LINK_QUERIES = (
        (
            "contributors",
            """
            UNWIND $values AS value
            MERGE (p:Contributor {name: value})
            MERGE (c:Course {name: $course_name})
            MERGE (p)-[:CONTRIBUTED_TO]->(c)
            """,
        ),
        (
            "academic_levels",
            """
            UNWIND $values AS value
            MERGE (l:AcademicLevel {name: value})
            MERGE (c:Course {name: $course_name})
            MERGE (c)-[:TARGETS]->(l)
            """,
        ),
        (
            "topic",
            """
            UNWIND $values AS value
            MERGE (t:Topic {name: value})
            MERGE (c:Course {name: $course_name})
            MERGE (c)-[:COVERS]->(t)
            """,
        ),
        (
            "outcomes",
            """
            UNWIND $values AS value
            MERGE (o:Outcome {description: value})
            MERGE (c:Course {name: $course_name})
            MERGE (c)-[:ACHIEVES]->(o)
            """,
        ),
        (
            "alignment",
            """
            UNWIND $values AS value
            MERGE (a:Alignment {name: value})
            MERGE (c:Course {name: $course_name})
            MERGE (c)-[:ALIGNS_WITH]->(a)
            """,
        ),
    )

    def __init__(self, uri: str, username: str, password: str):
        """Open a driver for ``uri`` using basic username/password auth."""
        self.driver = GraphDatabase.driver(uri, auth=(username, password))

    def __enter__(self):
        """Return the wrapper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver; exceptions from the block are not suppressed."""
        self.close()

    def close(self):
        """Release the underlying driver and its connections."""
        self.driver.close()

    def create_course(self, course_data: CourseData):
        """Upsert one course and all of its related nodes.

        Everything is written inside a single explicit transaction, so a
        failure part-way through leaves no half-linked course behind. Each
        multi-valued field is sent as one ``UNWIND`` query instead of one
        query per item.

        Args:
            course_data: The course record to persist. Empty or ``None``
                fields are skipped.
        """
        with self.driver.session() as session:
            # The transaction commits when the block exits cleanly and is
            # rolled back if any query raises.
            with session.begin_transaction() as tx:
                tx.run(
                    self._COURSE_QUERY,
                    {
                        "course_name": course_data.course_name,
                        "collection_name": course_data.collection_name,
                        "updated_time": course_data.updated_time,
                        "url_link": course_data.url_link,
                        "description": course_data.description,
                    },
                )
                for field_name, query in self._LINK_QUERIES:
                    values = getattr(course_data, field_name)
                    if not values:
                        continue
                    # `topic` is a single string; treat it as a one-item list
                    # so it can share the batched query shape.
                    if isinstance(values, str):
                        values = [values]
                    tx.run(
                        query,
                        {
                            "course_name": course_data.course_name,
                            "values": list(values),
                        },
                    )

# LLM setup
# Module-level client shared by extract_data_from_readme.
# NOTE(review): temperature=0 is presumably chosen to keep the extracted JSON
# as repeatable as possible between runs -- confirm.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

def clean_llm_response(response: str) -> str:
    """Return the payload of an LLM reply with Markdown code fences removed.

    Handles a fenced block at any position in the reply (for example with an
    introductory sentence before it or a closing remark after it), a language
    tag in any letter case, and a reply where only the opening or only the
    closing fence is present. Text without any fence is returned stripped of
    surrounding whitespace.

    Args:
        response: Raw text returned by the model.

    Returns:
        The unfenced text with surrounding whitespace removed.
    """
    import re

    fence = "```"
    # A language tag is either a word that ends its line (```json\n...) or
    # the literal "json" glued to the payload (```json{...}).
    language_tag = re.compile(
        r"\A(?:[A-Za-z][\w+-]*(?=[ \t]*\r?\n)|json)", re.IGNORECASE
    )

    cleaned = response.strip()
    first = cleaned.find(fence)
    if first == -1:
        return cleaned

    # Use the first and the last fence so that backticks occurring inside
    # the payload itself are left untouched.
    last = cleaned.rfind(fence)
    if last >= first + len(fence):
        inner = language_tag.sub("", cleaned[first + len(fence):last], count=1)
    else:
        # Only one fence: the payload normally follows it (unterminated
        # block); if nothing follows, the fence was a trailing one.
        inner = language_tag.sub("", cleaned[first + len(fence):], count=1)
        if not inner.strip():
            inner = cleaned[:first]
    return inner.strip()

def _as_str_list(value):
    """Normalize an LLM-provided list field: null -> [], bare string -> [string]."""
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple)):
        return [item for item in value if item is not None]
    # Anything else is passed through for the model validation to judge.
    return value


def extract_data_from_readme(pdf_path: str) -> CourseData:
    """Extract structured course data from a README.pdf using the LLM.

    The PDF text is sent to the module-level ``llm`` with a JSON-only prompt.
    If the reply cannot be parsed as a JSON object, or if it leaves the course
    or collection name empty, the names are taken from the file path, which is
    expected to look like ``<...>/<collection>/<course>/README.pdf``.

    Args:
        pdf_path: Path to the README.pdf to process.

    Returns:
        A ``CourseData`` built from the extracted fields.

    Raises:
        Whatever the PDF loader, the LLM call or the ``CourseData``
        constructor raise; none of those are caught here.
    """
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    text = "\n".join(page.page_content for page in pages)

    # Extract path-based metadata as fallback. The path is normalized first
    # because glob can return mixed "/" and "\" separators on Windows.
    parts = os.path.normpath(pdf_path).split(os.sep)
    if len(parts) >= 3:
        collection_name = parts[-3]
        course_name = parts[-2]
    else:
        collection_name = "unknown_collection"
        course_name = "unknown_course"

    # LLM prompt with strict JSON instructions
    prompt = f"""
    You are an expert data extractor. From the following README text, extract these fields into a valid JSON object:
    - course_name (string)
    - collection_name (string)
    - updated_time (string or null)
    - contributors (list of strings)
    - academic_levels (list of strings)
    - topic (string or null)
    - url_link (string or null)
    - description (string or null)
    - outcomes (list of strings)
    - alignment (list of strings)

    Rules:
    - Return ONLY a valid JSON string, no additional text or explanations.
    - Use null for missing single-value fields (e.g., updated_time).
    - Use empty lists for missing list fields (e.g., contributors).
    - For multi-line sections (e.g., Description, Outcomes), concatenate or list items as appropriate.
    - Handle multi-page content (e.g., Alignment) by combining relevant lines.

    Text:
    {text}
    """

    response = llm.invoke(prompt)
    cleaned_response = clean_llm_response(response.content)

    try:
        extracted_data = json.loads(cleaned_response)
    except json.JSONDecodeError as e:
        print(f"Error parsing LLM response for {pdf_path}: {e}")
        print(f"Cleaned LLM response: {cleaned_response}")
        # Fallback to minimal data: every field takes its default below.
        extracted_data = {}

    # Valid JSON that is not an object (e.g. a list) has no .get(); treat it
    # like an unparseable reply instead of failing with AttributeError.
    if not isinstance(extracted_data, dict):
        print(f"Error parsing LLM response for {pdf_path}: expected a JSON object")
        print(f"Cleaned LLM response: {cleaned_response}")
        extracted_data = {}

    # Ensure all fields are present and typed correctly. `or` (rather than a
    # .get default) also covers an explicit null or empty string for the two
    # required names.
    data = {
        "course_name": extracted_data.get("course_name") or course_name,
        "collection_name": extracted_data.get("collection_name") or collection_name,
        "updated_time": extracted_data.get("updated_time"),
        "contributors": _as_str_list(extracted_data.get("contributors")),
        "academic_levels": _as_str_list(extracted_data.get("academic_levels")),
        "topic": extracted_data.get("topic"),
        "url_link": extracted_data.get("url_link"),
        "description": extracted_data.get("description"),
        "outcomes": _as_str_list(extracted_data.get("outcomes")),
        "alignment": _as_str_list(extracted_data.get("alignment")),
    }

    return CourseData(**data)

def load_readme_files_into_neo4j(base_dir: str, neo4j_driver: Neo4jDriver):
    """Extract every ``<base_dir>/*/*/README.pdf`` and store it in Neo4j.

    Files are processed one at a time; a failure on one file is reported on
    stdout and does not stop the remaining files from being processed.

    Args:
        base_dir: Directory that is searched exactly two levels deep.
        neo4j_driver: Wrapper used to persist each extracted course.
    """
    pattern = f"{base_dir}/*/*/README.pdf"
    pdf_paths = glob.glob(pattern)

    if len(pdf_paths) == 0:
        print(f"No README.pdf files found in {pattern}")
        return

    print(f"Found {len(pdf_paths)} README.pdf files")
    for pdf_path in pdf_paths:
        # Best-effort per file: report the error and move on to the next one.
        try:
            neo4j_driver.create_course(extract_data_from_readme(pdf_path))
            print(f"Loaded data from {pdf_path} into Neo4j")
        except Exception as err:
            print(f"Error processing {pdf_path}: {err}")

def main():
    """Load all CLARK README files into Neo4j.

    The driver is closed in a ``finally`` block so its connections are
    released even if loading is interrupted or raises.
    """
    neo4j_driver = Neo4jDriver(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)
    try:
        load_readme_files_into_neo4j(CLARK_BASE_DIR, neo4j_driver)
    finally:
        neo4j_driver.close()


if __name__ == "__main__":
    main()

Found 10 README.pdf files
Loaded data from rag/clark_doc\ncaec\Computer and NW Security - Undergrad\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Computer Security\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\CyberSkills2Work - Incident Response (DSU-009)\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Digital Forensics\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Network Defense\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Operating System Hardening\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Privacy\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Programming\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Secure Software Development\README.pdf into Neo4j
Loaded data from rag/clark_doc\ncaec\Software Security\README.pdf into Neo4j
