In [26]:
from dotenv import load_dotenv
import os

# load neo4j credentials (and openai api key in background).
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

import neo4j
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
driver.verify_connectivity()

ex_llm=OpenAILLM(
    model_name="gpt-4o-mini",
    model_params={
        "response_format": {"type": "json_object"}, # use json_object formatting for best results
        "temperature": 0 # turning temperature down for more deterministic results
    }
)

#create text embedder
embedder = OpenAIEmbeddings()

#define node labels
basic_legal_nodes = [
    "LegalPerson",  # Individuals or legal persons (natural or juristic)
    "Organization",  # Companies, government bodies, NGOs
    "Judge",        # Judicial officers
    "Court",        # Courts or tribunals
    "Location"      # Geographical locations relevant to cases
]


# Case-specific entities
case_law_nodes = [
    "Case",                # The legal case itself
    "LegalPrinciple",      # Doctrine or principle established
    "Statute",            # Legislation cited
    "Regulation",         # Regulatory rules
    "LegalConcept",       # Abstract legal concepts
    "Precedent",          # Previous cases cited as precedent
    "Argument",           # Legal arguments made
    "Remedy",             # Legal remedies ordered
    "Opinion",            # Judicial opinions
    "DissentingOpinion"   # Dissenting judicial opinions
]

node_labels = basic_legal_nodes + case_law_nodes

# define relationship types
rel_types = [
    "CITES",               # Case cites another case/statute
    "OVERRULES",           # Case overrules a precedent
    "DISTINGUISHES",       # Case distinguishes from another case
    "FOLLOWS",             # Case follows a precedent
    "INTERPRETS",          # Case interprets a statute
    "INVOLVES",            # Case involves a particular legal concept
    "HELD_BY",             # Case was decided by a court
    "AUTHORED_BY",         # Judgment authored by judge
    "CONSIDERS",          # Case considers an argument
    "AWARDS",             # Court awards a remedy
    "APPEALS_FROM",       # Case appeals from lower court
    "CONCURS_WITH",       # Judge concurs with opinion
    "DISSENTS_FROM",      # Judge dissents from majority
    "APPLIES",            # Case applies a legal principle
    "ESTABLISHES"         # Case establishes new principle
]

prompt_template = '''
You are a legal researcher tasked with extracting information from UK case law 
and structuring it in a property graph to enable legal research and precedent analysis.

Extract the legal entities (nodes) and specify their type from the following case law text.
Also extract the relationships between these nodes, where the direction goes from the start node to the end node.


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text.  Do not add any additional information.  
- If the input text is empty, return empty Json. 
- Make sure to create as many nodes and relationships as needed to offer rich medical context for further research.
- An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed research questions. 
- Multiple documents will be ingested from different sources and we are using this property graph to connect information, so make sure entity types are fairly general. 

Use only fhe following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''


Unable to retrieve routing information


Connecting to Neo4j at neo4j+s://64d1f1c9.databases.neo4j.io with user neo4j
NEO5J_PASSWORD is set: True


ServiceUnavailable: Unable to retrieve routing information

In [16]:
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

kg_builder_pdf = SimpleKGPipeline(
    llm=ex_llm,
    driver=driver,
    text_splitter=FixedSizeSplitter(chunk_size=1000, chunk_overlap=200),
    embedder=embedder,
    entities=node_labels,
    relations=rel_types,
    prompt_template=prompt_template,
    from_pdf=False
)

Unable to retrieve routing information
Transaction failed and will be retried in 1.0679197278071986s (Unable to retrieve routing information)


Unable to retrieve routing information
Transaction failed and will be retried in 2.171531962868727s (Unable to retrieve routing information)
Unable to retrieve routing information
Transaction failed and will be retried in 4.612760836019462s (Unable to retrieve routing information)
Unable to retrieve routing information
Transaction failed and will be retried in 9.54961862985137s (Unable to retrieve routing information)


KeyboardInterrupt: 

In [9]:
from data.data_retrieval import DataDownload
from bs4 import BeautifulSoup
import re

data = DataDownload()
paths = data.get_file_paths(year = "2002")
print("paths", paths)
print(len(paths))

xmltexts = []
for path in paths:
    with open(path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'xml')  # Use 'xml' parser for XML files
        text = soup.get_text()
        # Get the main text content (excluding metadata)
        text = re.sub(r'\n{2,}', '\n', text)
        text = re.sub(r' +', ' ', text)
        text = text.strip()
        #remove any line that begins with #judgment
        text = re.sub(r'(?m)^#judgment.*\n?', '', text)
        xmltexts.append((text))
print("Number of XML texts:", len(xmltexts))
print(xmltexts)

paths ['data/court/ewca/ewca%2Fciv/2002/215.xml', 'data/tribunals/ukist/2002/tna.npsycx9q.xml', 'data/tribunals/ukist/2002/tna.npy2cx9q.xml']
3
Number of XML texts: 3
["EWCA-Civil\n2002\n215\n[2002] EWCA Civ 215\n0.26.10\nae961979561f57930ee8a8b00fc12eec3bf925fbc26f5c075ec4ce9ff5ac1bc6\n6.0.2\nC/2001/2146\nNeutral Citation Number: [2002] EWCA Civ 215\nIN THE SUPREME COURT OF JUDICATURE\nCOURT OF APPEAL (CIVIL DIVISION)\nON APPEAL FROM THE HIGH COURT OF JUSTICE\nQUEEN'S BENCH DIVISION\nADMINISTRATIVE COURT\n(DEPUTY JUDGE JACK BEATSON QC)\nRoyal Courts of Justice\nThe Strand\nLondon\nTuesday 5 February 2002\nB e f o r e:\nLORD JUSTICE LAWS\n- - - - - - - - - - - - - - \nB E T W E E N:\nIN THE MATTER OF ANDRE N'GUESSAN\nIN THE MATTER OF THE DRUG TRAFFICKING OFFENCES ACT 1986\n- - - - - - - - - - - - - - \n(Computer Aided Transcription by\nSmith Bernal, 190 Fleet Street, London EC4A 2HD\nTelephone 020 7421 4040\nOfficial Shorthand Writers to the Court)\n- - - - - - - - - - - - - - \nTHE AP

In [None]:

for xmltext in xmltexts:
    print(f"Processing : {path}")
    pdf_result = await kg_builder_pdf.run_async(text=xmltext)
    print(f"Result: {pdf_result}")