# LLM-for-Metadata-Harvesting

This notebook contains the experimental results from [P6: Groot zeegras (2023)](https://datahuiswadden.openearth.nl/geonetwork/srv/api/records/TF1TbsTxTqykP5rv6MXJEg).  
The results (the extracted metadata entities, grouped by entity type) are printed below the last code cell. Note that not all code is directly relevant to this experiment; some parts are retained for future development and elaboration.


In [1]:
from cheatsheet import CHEATSHEETS
from prompt import PROMPTS
from grobidmonkey import reader
from webutils import readWebContent, downloadAndParseXML

# Reader instance is created here but not used by the code visible in this
# part of the notebook.
monkeyReader = reader.MonkeyReader('monkey')  # or 'lxml' or 'x2d'

# Get the web content of the dataset's landing page.
url = "https://datahuiswadden.openearth.nl/geonetwork/srv/api/records/TF1TbsTxTqykP5rv6MXJEg"
soup = readWebContent(url)
if soup is None:
    raise ValueError("Failed to retrieve web content")

# Extract text from the webpage - adjust the selector based on the webpage structure.
# This takes every text node on the page, stripped, one per line; a narrower
# selector may be needed for pages with a lot of navigation chrome.
text = soup.get_text(separator='\n', strip=True)

# Append the XML-formatted metadata record to the page text.
# NOTE(review): this record id (A0h06_NlSEuNlium5OO3FA) differs from the
# landing-page id above (TF1TbsTxTqykP5rv6MXJEg) - confirm this is intended.
xml_url = "https://datahuiswadden.openearth.nl/geonetwork/srv/api/records/A0h06_NlSEuNlium5OO3FA/formatters/xml"
text_xml, _ = downloadAndParseXML(xml_url)
# Fail with a clear message instead of a TypeError on the concatenation below.
# Assumes downloadAndParseXML reports a failed download as None text, like
# readWebContent does - TODO confirm against webutils.
if text_xml is None:
    raise ValueError("Failed to retrieve XML metadata")
text += "\n" + text_xml

In [2]:
from openai import OpenAI
from dotenv import load_dotenv
from utils import (
    logger,
    clean_str,
    compute_mdhash_id,
    decode_tokens_by_tiktoken,
    encode_string_by_tiktoken,
    is_float_regex,
    normalize_extracted_info,
    pack_user_ass_to_openai_messages,
    split_string_by_multi_markers,
    use_llm_func_with_cache,
)
from collections import defaultdict

import tiktoken
import re
import os

# Model name passed to the OpenAI chat completion call in extract_entities and
# used by chunk_text to select the matching tiktoken encoding.
llm_model = "gpt-4"
# Load variables from a local .env file into the process environment;
# extract_entities reads OPENAI_API_KEY from there via os.getenv.
load_dotenv()

def chunk_text(text: str, max_tokens: int = 6000) -> list[str]:
    """Split text into consecutive chunks that fit within a token limit.

    The text is tokenized with the tiktoken encoding for the module-level
    ``llm_model``; the token list is cut into windows of ``max_tokens``
    tokens and every window is decoded back into a string.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk. Must be positive.

    Returns:
        The decoded chunks in their original order. Every chunk except
        possibly the last holds exactly ``max_tokens`` tokens. Text that
        encodes to no tokens yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` is zero or negative.
    """
    # A non-positive limit can never be satisfied; reject it up front rather
    # than emitting a stream of empty chunks.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")

    encoder = tiktoken.encoding_for_model(llm_model)
    tokens = encoder.encode(text)

    # Cuts are made purely on token counts, so a chunk may start or end in
    # the middle of a sentence.
    return [
        encoder.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

def extract_entities(
    text: str, entity_types: list[str], special_interest: str = ""
) -> tuple[defaultdict, defaultdict]:
    """Extract entities and relationships from ``text`` using the chat model.

    The text is split into 4000-token chunks. For each chunk the
    ``fill_nightly`` prompt is rendered and sent to the OpenAI chat
    completions API (model ``llm_model``, temperature 0.0, at most 2000
    completion tokens), and the reply is parsed with
    ``_process_extraction_result``. Results of all chunks are merged.

    Args:
        text: The full source text to analyse.
        entity_types: Entity types passed into the prompt template.
        special_interest: Extra focus instruction passed into the prompt
            template.

    Returns:
        A tuple ``(all_nodes, all_edges)`` of ``defaultdict(list)`` objects.
        ``all_nodes`` maps an entity name to the entity dicts found for it;
        ``all_edges`` maps a ``(src_id, tgt_id)`` pair to the relationship
        dicts found for it.
    """
    # Split text into chunks; 4000 tokens leaves room for the completion.
    chunks = chunk_text(text, max_tokens=4000)

    all_nodes = defaultdict(list)
    all_edges = defaultdict(list)

    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY")
    )

    nightly_entities_prompt = CHEATSHEETS["nightly_entity_template"].format(
        tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
        record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
    )

    # Everything except the chunk text is the same for every request, so
    # build it once outside the loop.
    base_params = {
        "language": "English",
        "tuple_delimiter": PROMPTS["DEFAULT_TUPLE_DELIMITER"],
        "record_delimiter": PROMPTS["DEFAULT_RECORD_DELIMITER"],
        "completion_delimiter": PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
        "entity_types": entity_types,
        "special_interest": special_interest,
        "nightly_entities": nightly_entities_prompt,
    }

    # Process each chunk
    for chunk in chunks:
        response = client.chat.completions.create(
            model=llm_model,
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI trained to extract entities(meta data fields) and relationships from text."
                },
                {
                    "role": "user",
                    "content": _format_prompt({**base_params, "input_text": chunk})
                }
            ],
            temperature=0.0,
            max_tokens=2000
        )

        content = response.choices[0].message.content
        print("----------------\nresponse.choices[0].message.content:\n", content)

        # The API types the message content as optional; a missing body has
        # nothing to parse and would break the string splitting downstream.
        if content is None:
            print("Skipping chunk: the model returned no message content")
            continue

        # Process the chunk results
        nodes, edges = _process_extraction_result(
            content,
            chunk_key=compute_mdhash_id(chunk),
            file_path="unknown_source"
        )

        # Merge results
        for key, value in nodes.items():
            all_nodes[key].extend(value)
        for key, value in edges.items():
            all_edges[key].extend(value)

    return all_nodes, all_edges
    

def _format_prompt(params: dict) -> str:
    """Render the ``fill_nightly`` cheat-sheet template.

    Args:
        params: Values for the template's placeholders, passed to
            ``str.format`` as keyword arguments.

    Returns:
        The filled-in prompt text.
    """
    return CHEATSHEETS["fill_nightly"].format(**params)

def _handle_single_entity_extraction(
    record_attributes: list[str],
    chunk_key: str,
    file_path: str = "unknown_source",
):
    if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
        return None

    # Clean and validate entity name
    entity_name = clean_str(record_attributes[1]).strip('"')
    if not entity_name.strip():
        logger.warning(
            f"Entity extraction error: empty entity name in: {record_attributes}"
        )
        return None

    # Normalize entity name
    entity_name = normalize_extracted_info(entity_name, is_entity=True)

    # Clean and validate entity type
    entity_type = clean_str(record_attributes[2]).strip('"')
    if not entity_type.strip() or entity_type.startswith('("'):
        logger.warning(
            f"Entity extraction error: invalid entity type in: {record_attributes}"
        )
        return None

    # Clean and validate description
    entity_description = clean_str(record_attributes[3])
    entity_description = normalize_extracted_info(entity_description)

    if not entity_description.strip():
        logger.warning(
            f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
        )
        return None

    return dict(
        entity_name=entity_name,
        entity_type=entity_type,
        description=entity_description,
        source_id=chunk_key,
        file_path=file_path,
    )


def _handle_single_relationship_extraction(
    record_attributes: list[str],
    chunk_key: str,
    file_path: str = "unknown_source",
):
    if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
        return None
    # add this record as edge
    source = clean_str(record_attributes[1])
    target = clean_str(record_attributes[2])

    # Normalize source and target entity names
    source = normalize_extracted_info(source, is_entity=True)
    target = normalize_extracted_info(target, is_entity=True)

    edge_description = clean_str(record_attributes[3])
    edge_description = normalize_extracted_info(edge_description)

    edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
    edge_source_id = chunk_key
    weight = (
        float(record_attributes[-1].strip('"').strip("'"))
        if is_float_regex(record_attributes[-1])
        else 1.0
    )
    return dict(
        src_id=source,
        tgt_id=target,
        weight=weight,
        description=edge_description,
        keywords=edge_keywords,
        source_id=edge_source_id,
        file_path=file_path,
    )

def _process_extraction_result(
    result: str, chunk_key: str, file_path: str = "unknown_source"
):
    """Process a single extraction result (either initial or gleaning).

    The result is split into records on the record delimiter, the completion
    delimiter and newlines. Each record is wrapped in parentheses if needed,
    split into fields on the tuple delimiter, and offered first to the
    entity handler and then to the relationship handler.

    Args:
        result (str): The extraction result to process
        chunk_key (str): The chunk key for source tracking
        file_path (str): The file path for citation

    Returns:
        tuple: (nodes_dict, edges_dict) containing the extracted entities
        and relationships. ``nodes_dict`` is keyed by entity name and
        ``edges_dict`` by ``(src_id, tgt_id)``; both map to lists of dicts.
    """
    tuple_delimiter = PROMPTS["DEFAULT_TUPLE_DELIMITER"]
    record_delimiter = PROMPTS["DEFAULT_RECORD_DELIMITER"]
    completion_delimiter = PROMPTS["DEFAULT_COMPLETION_DELIMITER"]

    maybe_nodes = defaultdict(list)
    maybe_edges = defaultdict(list)

    records = split_string_by_multi_markers(
        result,
        [record_delimiter, completion_delimiter, "\n"],
    )

    print(f"Extracted records: {records}\n")

    # Captures everything between the first "(" and the last ")".
    record_pattern = re.compile(r"\((.*)\)")

    for raw_record in records:
        print(f"Processing record: {raw_record}")

        # Add parentheses if they don't exist
        wrapped = raw_record
        if not wrapped.startswith('('):
            wrapped = f'({wrapped})'
        if not wrapped.endswith(')'):
            wrapped = f'{wrapped})'

        match = record_pattern.search(wrapped)
        if match is None:
            # Report the offending record text, not the failed match object.
            print(
                f"Record extraction error: invalid record format in: {raw_record}"
            )
            continue

        record_attributes = split_string_by_multi_markers(
            match.group(1), [tuple_delimiter]
        )

        if_entities = _handle_single_entity_extraction(
            record_attributes, chunk_key, file_path
        )
        if if_entities is not None:
            maybe_nodes[if_entities["entity_name"]].append(if_entities)
            continue

        if_relation = _handle_single_relationship_extraction(
            record_attributes, chunk_key, file_path
        )
        if if_relation is not None:
            maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
                if_relation
            )

    return maybe_nodes, maybe_edges

# Use the cheat sheet's focus instruction when it defines one; otherwise fall
# back to a generic instruction.
special_interest = CHEATSHEETS.get(
    "special_interests",
    "Focus on metadata fields and their relationships",
)

# Run the extraction over the text assembled in the first cell.
output_nodes, output_edges = extract_entities(
    text,
    PROMPTS["DEFAULT_ENTITY_TYPES"],
    special_interest,
)

----------------
response.choices[0].message.content:
 ("entity"<|>"Metadata Date"<|>"Metadata date"<|>"The metadata date is 2023-12-01."<|>"P6: Groot zeegras (2023)"<|>"https://open.rijkswaterstaat.nl/overige-publicaties/2024/metadatabijsluiter-zeegrasdata/")##
("entity"<|>"Metadata Language"<|>"Metadata language"<|>"The metadata language is Nederlands; Vlaams."<|>"P6: Groot zeegras (2023)"<|>"https://open.rijkswaterstaat.nl/overige-publicaties/2024/metadatabijsluiter-zeegrasdata/")##
("entity"<|>"Responsible Organization Metadata"<|>"Responsible organization metadata"<|>"The responsible organization for the metadata is Rijkswaterstaat CIV."<|>"P6: Groot zeegras (2023)"<|>"https://open.rijkswaterstaat.nl/overige-publicaties/2024/metadatabijsluiter-zeegrasdata/")##
("entity"<|>"Landing Page"<|>"Landing page"<|>"The landing page is the Wadden viewer."<|>"P6: Groot zeegras (2023)"<|>"https://open.rijkswaterstaat.nl/overige-publicaties/2024/metadatabijsluiter-zeegrasdata/")##
("entity"<|>

In [3]:
# Group the extracted entities by type:
#   entity_type -> [(entity_name, description), ...]
entity_type_map = {}

for entity_group in output_nodes.values():
    for item in entity_group:
        entity_type, entity_name, description = (
            item.get('entity_type'),
            item.get('entity_name'),
            item.get('description'),
        )
        # setdefault creates the list the first time a type is seen.
        entity_type_map.setdefault(entity_type, []).append(
            (entity_name, description)
        )

# Print one heading per entity type followed by its (name, description) pairs.
for entity_type, pairs in entity_type_map.items():
    print(f"{entity_type}:")
    for name, desc in pairs:
        print(f"  - ({name}, {desc})")

Metadata date:
  - (Metadata Date, The metadata date is 2023-12-01.)
Metadata language:
  - (Metadata Language, The metadata language is Nederlands; Vlaams.)
  - (Metadata Language, The metadata language is Dutch.)
Responsible organization metadata:
  - (Responsible Organization Metadata, The responsible organization for the metadata is Rijkswaterstaat CIV.)
Landing page:
  - (Landing Page, The landing page is the Wadden viewer.)
Title:
  - (Title, The title of the data is 'P6: Groot zeegras (2023)'.)
Description:
  - (Description, The description of the data is about the Zeegras-database which contains various data per 20x20meter surface unit.)
Unique Identifier:
  - (Unique Identifier, The unique identifier of the data is TF1TbsTxTqykP5rv6MXJEg.)
Resource type:
  - (Resource Type, The resource type is a Dataset.)
Keywords:
  - (Keywords, The keywords associated with the data are Zeegras, TMAP, Trilaterale Waddenzee, UNESCO Werelderfgoed, Natuurwaarde, Ecotopen.)
  - (Topic Category, 