# LLMs

In [1]:
from langchain_openai import AzureChatOpenAI
import os
from dotenv import load_dotenv
load_dotenv()
# Azure OpenAI connection settings, read from the process environment
# (load_dotenv() is called just above). os.getenv returns None for any
# variable that is not set.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Azure deployment names; only the nano deployment is used to build a client below.
AZURE_DEPLOYMENT_GPT41 = os.getenv("AZURE_DEPLOYMENT_GPT41")
AZURE_DEPLOYMENT_GPT41_NANO = os.getenv("AZURE_DEPLOYMENT_GPT41_NANO")

# Chat model client bound to the nano deployment. No temperature is passed here.
gpt41_nano = AzureChatOpenAI(
    azure_deployment=AZURE_DEPLOYMENT_GPT41_NANO,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY
)



In [2]:
from dotenv import load_dotenv
load_dotenv()
# Groq API key, read from the environment; None if the variable is not set.
# NOTE: `os` is not imported in this cell — it relies on the import in the first cell.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

from langchain_groq import ChatGroq

# Groq chat model client using the "llama-3.1-8b-instant" model id.
groqllm = ChatGroq(
    model="llama-3.1-8b-instant",
    api_key=GROQ_API_KEY
)

# Ontology Rules

In [3]:
import re
def extract_ontology_lexical_view(ontology_path: str) -> list:
    """
    Build a "label: comment" lexical view of a Turtle ontology file.

    The file is read as plain text and split into Turtle statements (each
    terminated by a '.' outside of string literals, IRIs and comments). Every
    rdfs:label found in a statement is paired with the rdfs:comment of that
    same statement, so an entity without a comment no longer shifts the
    comments of all following entities onto the wrong labels.

    Args:
        ontology_path: Path to the ontology file (read as UTF-8 text).

    Returns:
        A list of strings in file order, formatted as
        ['label: comment', 'label2: comment2']. Spaces are removed from each
        label; a label whose statement has no rdfs:comment yields 'label: '.
    """
    with open(ontology_path, 'r', encoding='utf-8') as f:
        content = f.read()

    label_pattern = r'rdfs:label\s+"([^"]+)"'
    comment_pattern = r'rdfs:comment\s+"([^"]+)"'

    # Scanner that skips over anything which may legitimately contain a '.'
    # (string literals, IRIs, comments) and captures only real statement
    # terminators: a '.' followed by whitespace or end of input.
    scanner = re.compile(
        r'"""(?:[^"\\]|\\.|"(?!""))*"""'   # long string literal
        r'|"(?:[^"\\]|\\.)*"'              # short string literal
        r'|<[^<>\s]*>'                     # IRI reference
        r'|#[^\n]*'                        # comment up to end of line
        r'|(\.)(?=\s|$)',                  # statement terminator
        re.DOTALL,
    )

    # Cut the text into statements at each terminator.
    statements = []
    start = 0
    for token in scanner.finditer(content):
        if token.group(1):
            statements.append(content[start:token.end()])
            start = token.end()
    statements.append(content[start:])  # trailing text without a terminator

    lexical_view = []
    for statement in statements:
        comment_match = re.search(comment_pattern, statement)
        comment = comment_match.group(1) if comment_match else ""
        for label_match in re.finditer(label_pattern, statement):
            label = label_match.group(1).replace(" ", "")
            lexical_view.append(f"{label}: {comment}")

    return lexical_view

In [None]:
extract_ontology_lexical_view('../../output/ontologies/rdb/rigor_ontology_few_fixes.ttl')

['ActivityDate: Date when the company commenced operations or activity.',
 'Address: Physical address of the company.',
 'AgentLanguage: Preferred language of the agent or representative.',
 'AIReviewState: State of AI-based review process for the application.',
 'AnnualRevenue: Total annual revenue of the company.',
 'ApplicationIdentifier: Unique identifier for the granter application.',
 'ApplicationState: Current state of the application (e.g., submitted, under review, approved).',
 'ApprovedGrantAmount: The amount of grant approved for the application.',
 'CAE: Custom attribute or code related to the company.',
 'City: City where the company is located.',
 'CodeName: Internal code name for the application.',
 'Company: Identifier linking to internal or external company references.',
 'Consortium: Associated consortium for the application, if any.',
 'Country: Country where the company is registered or located.',
 'CoverImage: URL to an image representing the company.',
 'CreationT

In [None]:
from rdflib import Graph, RDF, OWL

def extract_local_names(ttl_file_path):
    """
    Parse a Turtle ontology and list the local names of its declared terms.

    A local name is the part of the URI after the last '#', or after the last
    '/' when the URI contains no '#'.

    Args:
        ttl_file_path: Path of the Turtle file handed to rdflib's parser.

    Returns:
        A 3-tuple of alphabetically sorted lists:
        - names of subjects typed owl:Class (entities)
        - names of subjects typed owl:ObjectProperty (relationships)
        - names of subjects typed owl:DatatypeProperty (attributes)
    """
    ontology = Graph()
    ontology.parse(ttl_file_path, format="turtle")

    def local_name_of(term):
        # Prefer the fragment separator; fall back to the last path segment.
        text = str(term)
        separator = '#' if '#' in text else '/'
        return text.rpartition(separator)[2]

    def sorted_names(owl_type):
        # Every subject declared with `rdf:type owl_type`, as sorted local names.
        return sorted(
            local_name_of(subject)
            for subject, _, _ in ontology.triples((None, RDF.type, owl_type))
        )

    return (
        sorted_names(OWL.Class),
        sorted_names(OWL.ObjectProperty),
        sorted_names(OWL.DatatypeProperty),
    )

# Usage
classes, relations, attributes = extract_local_names('../../output/ontologies/rdb/rigor_ontology_few_fixes.ttl')

print(f"Found {len(classes)} classes, {len(relations)} object properties, {len(attributes)} datatype properties")
print("\nClass names:", classes)
print("\nObject properties (relations):", relations)
print("\nDatatype properties (attributes):", attributes)

Found 26 classes, 43 object properties, 137 datatype properties

Class names: ['DatabaseTable', 'ForeignKeyRelation', 'GeographicCriterion', 'Grant', 'GrantPayment', 'GrantShare', 'GranterApplication', 'GranterCompany', 'GranterCompanyMemory', 'GranterConsortium', 'GranterConsortiumPartner', 'GranterConsortiumPartnerType', 'GranterGeneralOpportunityFile', 'GranterGeneralOpportunityFileOpportunities', 'GranterMatchcheck', 'GranterMatchgroup', 'GranterOpportunity', 'GranterOpportunityFile', 'GranterTimeline', 'GrantersApplicationFile', 'Opportunity', 'PersonLocation', 'ProvenanceInfo', 'Recipient', 'granterCompanyMemory', 'provenanceInfo']

Object properties (relations): ['CompanyId', 'ConsortiumId', 'OpportunityId', 'OwnerId', 'ProfileCreatorId', 'belongsToApplication', 'belongsToCompany', 'belongsToCompanyMemory', 'belongsToEligibilityCriteria', 'belongsToOpportunity', 'belongsToProfile', 'belongsToSubscription', 'belongsToWhitelabel', 'companyId', 'fundedBy', 'granter_matchgroup', 'ha

In [14]:
classes

['DatabaseTable',
 'ForeignKeyRelation',
 'GeographicCriterion',
 'Grant',
 'GrantPayment',
 'GrantShare',
 'GranterApplication',
 'GranterCompany',
 'GranterCompanyMemory',
 'GranterConsortium',
 'GranterConsortiumPartner',
 'GranterConsortiumPartnerType',
 'GranterGeneralOpportunityFile',
 'GranterGeneralOpportunityFileOpportunities',
 'GranterMatchcheck',
 'GranterMatchgroup',
 'GranterOpportunity',
 'GranterOpportunityFile',
 'GranterTimeline',
 'GrantersApplicationFile',
 'Opportunity',
 'PersonLocation',
 'ProvenanceInfo',
 'Recipient',
 'granterCompanyMemory',
 'provenanceInfo']

In [15]:
relations

['CompanyId',
 'ConsortiumId',
 'OpportunityId',
 'OwnerId',
 'ProfileCreatorId',
 'belongsToApplication',
 'belongsToCompany',
 'belongsToCompanyMemory',
 'belongsToEligibilityCriteria',
 'belongsToOpportunity',
 'belongsToProfile',
 'belongsToSubscription',
 'belongsToWhitelabel',
 'companyId',
 'fundedBy',
 'granter_matchgroup',
 'hasApplication',
 'hasAssociatedFile',
 'hasCompanyId',
 'hasConsortiumId',
 'hasForeignKey',
 'hasForeignKeyRelation',
 'hasGeneralOpportunityFileId',
 'hasGrant',
 'hasGrantShare',
 'hasMemorySnippet',
 'hasOpportunityFile',
 'hasOpportunityId',
 'hasPartner',
 'hasPartnerTypeId',
 'hasProvenance',
 'hasProvenanceInfo',
 'hasRecipient',
 'hasTimeline',
 'includesPartner',
 'opportunityId',
 'recipient',
 'relatesTo',
 'relatesToCompany',
 'relatesToConsortium',
 'relatesToOpportunity',
 'relatesToTable',
 'wasDerivedFrom']

In [20]:
# Drop 'relatesToTable' from the allowed relations. The list is rebuilt rather
# than mutated with list.remove(), so re-running this cell is harmless: it no
# longer raises ValueError once the entry is already gone.
relations = [name for name in relations if name != 'relatesToTable']
relations

['CompanyId',
 'ConsortiumId',
 'OpportunityId',
 'OwnerId',
 'ProfileCreatorId',
 'belongsToApplication',
 'belongsToCompany',
 'belongsToCompanyMemory',
 'belongsToEligibilityCriteria',
 'belongsToOpportunity',
 'belongsToProfile',
 'belongsToSubscription',
 'belongsToWhitelabel',
 'companyId',
 'fundedBy',
 'granter_matchgroup',
 'hasApplication',
 'hasAssociatedFile',
 'hasCompanyId',
 'hasConsortiumId',
 'hasForeignKey',
 'hasForeignKeyRelation',
 'hasGeneralOpportunityFileId',
 'hasGrant',
 'hasGrantShare',
 'hasMemorySnippet',
 'hasOpportunityFile',
 'hasOpportunityId',
 'hasPartner',
 'hasPartnerTypeId',
 'hasProvenance',
 'hasProvenanceInfo',
 'hasRecipient',
 'hasTimeline',
 'includesPartner',
 'opportunityId',
 'recipient',
 'relatesTo',
 'relatesToCompany',
 'relatesToConsortium',
 'relatesToOpportunity',
 'wasDerivedFrom']

# Input text

In [42]:
from langchain_core.documents import Document
import os

input_txt_folder = "data/texts"

# Import opportunity file.
# `content` starts as None so that a missing or unreadable file is reported
# explicitly, instead of surfacing later as a NameError or silently reusing a
# value left over from an earlier run of this cell.
content = None
for filename in os.listdir(input_txt_folder):
    if not filename.endswith('opportunity_example.txt'):
        continue
    file_path = os.path.join(input_txt_folder, filename)
    print(f"Processing {filename}...")

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error processing {filename}: {e}")

if content is None:
    raise FileNotFoundError(
        f"No readable file ending in 'opportunity_example.txt' found in {input_txt_folder!r}"
    )

# Wrap the raw text in a LangChain Document for the splitter used later.
document = Document(page_content=content)
print(document)

Processing opportunity_example.txt...
<class 'str'>
page_content='Os Fundos Europeus mais próximos de si.  
           
 
          1/23 
Aviso para apresentação de  candidaturas  
Código do aviso   MAR2030 -2023 -22 
Data de publicação  14/11 /2023  
Natureza do aviso   Concurso  
Âmbito de atuação:  Operações  
Aprovado pelo SRMAR a 25/07/2023  
 
 
Designação do  aviso  
Transformação  de Produtos da Pesca e da Aquicultura no Domínio dos Investimentos Produtivos -  Região 
Autónoma da Madeira  
 
Apoio para  
Promover a comercialização, a qualidade, o valor acrescentado dos produtos da pesca e da aquicultura, assim 
como a transformação destes produtos . 
Ações abrangidas por este aviso  
São abrangidas pelo presente aviso as ações, promovidas por empresas, previstas no artigo 50.º da Portaria n.º 
559/2023, de 25 de julho , relativas a : 
a) Investimentos produtivos bem como investimentos que promovam a descarbonização, o uso de energias 
renováveis e a eficiência energética, a eco

## Chunking it

In [43]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_document(document, chunk_size=3000, chunk_overlap=50):
    """
    Split a single document into chunks with RecursiveCharacterTextSplitter.

    Args:
        document: The document to split; it is passed to the splitter as a
            one-element list.
        chunk_size: Forwarded to the splitter as `chunk_size`.
        chunk_overlap: Forwarded to the splitter as `chunk_overlap`.

    Returns:
        Whatever `split_documents` returns for the one-document list.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents([document])
    return pieces

chunks = chunk_document(document)
print(chunks[0].page_content)

Os Fundos Europeus mais próximos de si.  
           
 
          1/23 
Aviso para apresentação de  candidaturas  
Código do aviso   MAR2030 -2023 -22 
Data de publicação  14/11 /2023  
Natureza do aviso   Concurso  
Âmbito de atuação:  Operações  
Aprovado pelo SRMAR a 25/07/2023  
 
 
Designação do  aviso  
Transformação  de Produtos da Pesca e da Aquicultura no Domínio dos Investimentos Produtivos -  Região 
Autónoma da Madeira  
 
Apoio para  
Promover a comercialização, a qualidade, o valor acrescentado dos produtos da pesca e da aquicultura, assim 
como a transformação destes produtos . 
Ações abrangidas por este aviso  
São abrangidas pelo presente aviso as ações, promovidas por empresas, previstas no artigo 50.º da Portaria n.º 
559/2023, de 25 de julho , relativas a : 
a) Investimentos produtivos bem como investimentos que promovam a descarbonização, o uso de energias 
renováveis e a eficiência energética, a economia circular, a digitalização e a internacionalização,  
b) Inve

# Prompt

In [15]:
test_sentence = chunks[0].page_content
print(test_sentence)

European Funds closer to you.  
           
 
          1/23
Call for applications  
Call code   MAR2030 -2023 -22 
Date of publication  14/11/2023  
Nature of the notice   Tender  
Scope of action:  Operations  
Approved by SRMAR on 25/07/2023  
 
 
Name of the  notice  
Processing  of Fishery and Aquaculture Products in the Field of Productive Investments -  Autonomous Region 
of Madeira  
 
Support to  
promote the marketing, quality, and added value of fishery and aquaculture products, as well as 
the processing of these products . 
Actions covered by this notice  
This notice covers actions promoted by companies, as provided for in Article 50 of Ordinance No. 
559/2023 of July 25, relating to: 
a) Productive investments as well as investments that promote decarbonization, the use of renewable energy 
and energy efficiency, the circular economy, digitization, and internationalization,  
b) Investments that promote entrepreneurship by supporting the creation and development of start

In [16]:
# Few-shot examples for generate_prompt. Each entry has a 'sentence' and the
# expected 'answer': one "(subject, relation, object)" triple per line.
# NOTE(review): the relation names used in these answers (appliedFor,
# appliesFor, requires, hasEligibility, offers, hasDeadline, hasEvaluates,
# aiReviewState) do not appear in the `relations` list printed earlier in this
# notebook, and 'appliedFor' / 'appliesFor' are spelled inconsistently —
# confirm whether the examples should use the ontology's relation names.
examples = [
    {'sentence': '<id> The technology company submitted their grant application for AI research projects.', 
    'answer': '(technology company, appliedFor, grant application)'},

    {'sentence': '<id> The granter opportunity requires applicants to meet the organisational criterion of having at least 50 employees.', 
    'answer': '(granter opportunity, requires, organisational criterion)\n(applicants, hasEligibility, organisational criterion)'},

    {'sentence': '<id> The healthcare company applied for a grant that offers payments of up to $500,000.', 
    'answer': '(healthcare company, appliesFor, grant)\n(grant, offers, grant payment)'},

    {'sentence': '<id> The environmental grant has a deadline of September 30 for all project submissions.', 
    'answer': '(environmental grant, hasDeadline, September 30)\n(environmental grant, requires, project submissions)'},

    {'sentence': '<id> The granter application file was evaluated by an AI review state before final approval.', 
    'answer': '(AI review state, hasEvaluates, granter application file)\n(granter application file, aiReviewState, AI review state)'},
]


In [17]:
# - Allowed Attributes (datatype properties): {attributes_str}
# - Attribute triple: (entity, attribute, literal_value)

def generate_prompt(concepts, relations, attributes, examples, test_sentence):
    """
    Build an ontology-guided extraction prompt for LLMs.

    Args:
        concepts (list[str]): Ontology classes (entities), listed in the prompt.
        relations (list[str]): Ontology object properties (relationships),
            listed in the prompt.
        attributes (list[str]): Ontology datatype properties (attributes).
            Accepted for interface compatibility only: the prompt does not
            include attributes, so this value is never read.
        examples (list[dict]): Few-shot examples with keys
            {"sentence": str, "answer": str}. May be empty.
        test_sentence (str): The new sentence to extract triples from.

    Returns:
        str: The formatted LLM prompt, ending with "Test answer:".
    """
    # Format ontology context (attributes are intentionally left out of the prompt)
    concepts_str = ", ".join(concepts)
    relations_str = ", ".join(relations)

    # Format examples, separated by a blank line
    examples_str = "\n\n".join(
        f"Example sentence: {ex['sentence']}\nExample answer: {ex['answer']}"
        for ex in examples
    )

    # Build complete prompt
    return f"""Given the following ontology and sentence, please extract the triples from the sentence according to the relations in the ontology.
In the output, only include the triples in the given output format

Context:
- Ontology Entities (classes): {concepts_str}
- Ontology Relations (object properties): {relations_str}

For each input sentence, output triples in the following formats:
- Relationship triple: (subject_entity, relationship, object_entity)

{examples_str}

Given these examples, give the correct answer for the following sentence:
Test sentence:\n{test_sentence}\n
Test answer:"""

print(generate_prompt(classes, relations, attributes, examples, test_sentence))

Given the following ontology and sentence, please extract the triples from the sentence according to the relations in the ontology.
In the output, only include the triples in the given output format

Context:
- Ontology Entities (classes): DatabaseTable, ForeignKeyRelation, GeographicCriterion, Grant, GrantPayment, GrantShare, GranterApplication, GranterCompany, GranterCompanyMemory, GranterConsortium, GranterConsortiumPartner, GranterConsortiumPartnerType, GranterGeneralOpportunityFile, GranterGeneralOpportunityFileOpportunities, GranterMatchcheck, GranterMatchgroup, GranterOpportunity, GranterOpportunityFile, GranterTimeline, GrantersApplicationFile, Opportunity, PersonLocation, ProvenanceInfo, Recipient, granterCompanyMemory, provenanceInfo
- Ontology Relations (object properties): CompanyId, ConsortiumId, OpportunityId, OwnerId, ProfileCreatorId, belongsToApplication, belongsToCompany, belongsToCompanyMemory, belongsToEligibilityCriteria, belongsToOpportunity, belongsToProfile, bel

# Trying without structured output

In [33]:
answer = gpt41_nano.invoke(generate_prompt(classes, relations, attributes, examples, test_sentence))

In [34]:
print(answer.content)

(points to note: The sentence is about "European Funds" which are not explicitly linked to specific entities in the ontology, but the context suggests a funding source. Since the ontology includes the relation "fundedBy," and "European Funds" implies a funding entity, the relevant triple would link the funding source to the related project or entity. However, the sentence doesn't specify a project or entity explicitly. Given the data, the best extraction is that "European Funds" fund some activity or project related to the notice about fishery and aquaculture products and investments.)

*(European Funds, fundedBy, project or activity related to fishery and aquaculture investments)*

But since no specific entity in the ontology directly corresponds to "European Funds," and no explicit project or activity is named, a minimal valid triple based on the context is:

(European Funds, hasScope, closer to you)

or more precisely, considering the sentence's emphasis:

(European Funds, relatesTo

# With LLMGraphTransformer

In [44]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
graph_transformer = LLMGraphTransformer(llm = gpt41_nano,
                                        allowed_nodes = classes,
                                        allowed_relationships = relations)

In [45]:
graph_documents = await graph_transformer.aconvert_to_graph_documents(chunks)

In [46]:
graph_documents

[GraphDocument(nodes=[Node(id='Mar2030-2023-22', type='Grant', properties={}), Node(id='Transformação De Produtos Da Pesca E Da Aquicultura No Domínio Dos Investimentos Produtivos - Região Autónoma Da Madeira', type='Grant', properties={}), Node(id='14/11/2023', type='Provenanceinfo', properties={}), Node(id='Operações', type='Geographiccriterion', properties={}), Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={})], relationships=[Relationship(source=Node(id='Mar2030-2023-22', type='Grant', properties={}), target=Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={}), type='HASAPPLICATION', properties={}), Relationship(source=Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={}), target=Node(id='Transformação De Produtos Da Pesca E Da Aquicultura No Domínio Dos Investimentos Produtivos - Região Autónoma Da Madeira', type='Grant', properties={}), type='BELONGSTOAPPLICAT

In [47]:
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

Nodes:[Node(id='Mar2030-2023-22', type='Grant', properties={}), Node(id='Transformação De Produtos Da Pesca E Da Aquicultura No Domínio Dos Investimentos Produtivos - Região Autónoma Da Madeira', type='Grant', properties={}), Node(id='14/11/2023', type='Provenanceinfo', properties={}), Node(id='Operações', type='Geographiccriterion', properties={}), Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={})]
Relationships:[Relationship(source=Node(id='Mar2030-2023-22', type='Grant', properties={}), target=Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={}), type='HASAPPLICATION', properties={}), Relationship(source=Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={}), target=Node(id='Transformação De Produtos Da Pesca E Da Aquicultura No Domínio Dos Investimentos Produtivos - Região Autónoma Da Madeira', type='Grant', properties={}), type='BELONGSTOAPPLICATION', properties

In [48]:
def merge_graph_documents(graph_documents):
    """
    Merge all nodes and relationships from several graph documents into one.

    Nodes are de-duplicated by `id` and relationships by the key
    (source id, target id, type); in both cases the first occurrence is kept
    and the original order is preserved. The merged document is built with the
    same class as the first input document and reuses that document's `source`.

    Args:
        graph_documents: Sequence of graph documents exposing `nodes`,
            `relationships` and `source`.

    Returns:
        A one-element list containing the merged document, or an empty list
        when `graph_documents` is empty (previously this raised IndexError).
    """
    if not graph_documents:
        print("Merged: 0 nodes, 0 relationships")
        return []

    # dicts keep insertion order, and setdefault keeps the first value seen
    # for a key, which gives "first occurrence wins" de-duplication.
    nodes_by_id = {}
    relationships_by_key = {}
    for doc in graph_documents:
        for node in doc.nodes:
            nodes_by_id.setdefault(node.id, node)
        for rel in doc.relationships:
            rel_key = (rel.source.id, rel.target.id, rel.type)
            relationships_by_key.setdefault(rel_key, rel)

    unique_nodes = list(nodes_by_id.values())
    unique_relationships = list(relationships_by_key.values())

    # Create merged document using the first document's class and source
    first_doc = graph_documents[0]
    merged_doc = type(first_doc)(
        nodes=unique_nodes,
        relationships=unique_relationships,
        source=first_doc.source
    )

    print(f"Merged: {len(unique_nodes)} nodes, {len(unique_relationships)} relationships")
    return [merged_doc]

# Use it
graph_documents = merge_graph_documents(graph_documents)
graph_documents

Merged: 132 nodes, 43 relationships


[GraphDocument(nodes=[Node(id='Mar2030-2023-22', type='Grant', properties={}), Node(id='Transformação De Produtos Da Pesca E Da Aquicultura No Domínio Dos Investimentos Produtivos - Região Autónoma Da Madeira', type='Grant', properties={}), Node(id='14/11/2023', type='Provenanceinfo', properties={}), Node(id='Operações', type='Geographiccriterion', properties={}), Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={}), Node(id='Região_Autónoma_Da_Madeira', type='Geographiccriterion', properties={}), Node(id='Programa Mar 2030', type='Grant', properties={}), Node(id='+ 351 291 203 250', type='Personlocation', properties={}), Node(id='Drp@Madeira.Gov.Pt', type='Personlocation', properties={}), Node(id='Fundos Europeus Mais Próximos De Si', type='Grant', properties={}), Node(id='Promover A Comercialização, A Qualidade, O Valor Acrescentado Dos Produtos Da Pesca E Da Aquicultura, Assim Como A Transformação Destes Produtos', type='Grant', properties={})

In [49]:
print(f"Nodes:{graph_documents[0].nodes}")
print(f"Relationships:{graph_documents[0].relationships}")

Nodes:[Node(id='Mar2030-2023-22', type='Grant', properties={}), Node(id='Transformação De Produtos Da Pesca E Da Aquicultura No Domínio Dos Investimentos Produtivos - Região Autónoma Da Madeira', type='Grant', properties={}), Node(id='14/11/2023', type='Provenanceinfo', properties={}), Node(id='Operações', type='Geographiccriterion', properties={}), Node(id='Aviso Para Apresentação De Candidaturas', type='Granterapplication', properties={}), Node(id='Região_Autónoma_Da_Madeira', type='Geographiccriterion', properties={}), Node(id='Programa Mar 2030', type='Grant', properties={}), Node(id='+ 351 291 203 250', type='Personlocation', properties={}), Node(id='Drp@Madeira.Gov.Pt', type='Personlocation', properties={}), Node(id='Fundos Europeus Mais Próximos De Si', type='Grant', properties={}), Node(id='Promover A Comercialização, A Qualidade, O Valor Acrescentado Dos Produtos Da Pesca E Da Aquicultura, Assim Como A Transformação Destes Produtos', type='Grant', properties={}), Node(id='Prog

## Visualize

In [50]:
from pyvis.network import Network

def visualize_graph(graph_documents):
    """
    Render the first graph document as an interactive pyvis network.

    Code adapted from https://github.com/thu-vu92/knowledge-graph-llms/tree/main

    Only relationships whose source and target ids are both present in the
    document's node list are drawn, and only nodes that take part in such a
    relationship are added. The graph is written to "knowledge_graph.html" in
    the current working directory, and the function then tries to open that
    file in the default browser.

    Args:
        graph_documents: Sequence of graph documents; only element 0 is used.
    """
    import webbrowser
    from pathlib import Path

    # Create network
    net = Network(height="1200px", width="100%", directed=True,
                      notebook=False, bgcolor="#222222", font_color="white")

    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships

    # Build lookup for valid nodes
    node_dict = {node.id: node for node in nodes}

    # Filter out invalid edges and collect valid node IDs
    valid_edges = []
    valid_node_ids = set()
    for rel in relationships:
        if rel.source.id in node_dict and rel.target.id in node_dict:
            valid_edges.append(rel)
            valid_node_ids.update([rel.source.id, rel.target.id])

    # Add valid nodes; a node pyvis rejects is reported and skipped
    for node_id in valid_node_ids:
        node = node_dict[node_id]
        try:
            net.add_node(node.id, label=node.id, title=node.type, group=node.type)
        except Exception as exc:
            print(f"Skipping node {node.id!r}: {exc}")

    # Add valid edges; an edge pyvis rejects is reported and skipped
    for rel in valid_edges:
        try:
            net.add_edge(rel.source.id, rel.target.id, label=rel.type.lower())
        except Exception as exc:
            print(f"Skipping edge {rel.source.id!r} -> {rel.target.id!r}: {exc}")

    # Configure physics
    net.set_options("""
            {
                "physics": {
                    "forceAtlas2Based": {
                        "gravitationalConstant": -100,
                        "centralGravity": 0.01,
                        "springLength": 200,
                        "springConstant": 0.08
                    },
                    "minVelocity": 0.75,
                    "solver": "forceAtlas2Based"
                }
            }
            """)

    output_file = "knowledge_graph.html"
    net.save_graph(output_file)
    output_path = os.path.abspath(output_file)
    print(f"Graph saved to {output_path}")

    # Try to open in browser. Path.as_uri() builds a well-formed file URI
    # (handles Windows drive letters, backslashes and spaces), which plain
    # "file://" + path concatenation does not.
    try:
        opened = webbrowser.open(Path(output_path).as_uri())
    except Exception:
        opened = False
    if not opened:
        print("Could not open browser automatically")
        
# Run the function
visualize_graph(graph_documents)

Graph saved to c:\Users\tiago\Documents\Granter Ai Internship\Implementation\KGs_for_Vertical_AI\knowledge_graph.html


# Saving to CSV

In [None]:
import re
from rdflib import Graph, RDF, OWL
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from pyvis.network import Network
import pandas as pd

import asyncio

from langchain_openai import AzureChatOpenAI
import os
from dotenv import load_dotenv
load_dotenv()
# Azure OpenAI connection settings, read from the process environment
# (load_dotenv() is called just above). os.getenv returns None for any
# variable that is not set.
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION")

# Azure deployment names; only the nano deployment is used to build a client below.
AZURE_DEPLOYMENT_GPT41 = os.getenv("AZURE_DEPLOYMENT_GPT41")
AZURE_DEPLOYMENT_GPT41_NANO = os.getenv("AZURE_DEPLOYMENT_GPT41_NANO")

# Chat model client bound to the nano deployment, created with temperature 0.0
# (the client built in the first cell of this notebook passes no temperature).
gpt41_nano = AzureChatOpenAI(
    azure_deployment=AZURE_DEPLOYMENT_GPT41_NANO,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    temperature = 0.0
)


# ---------------------------------- TOOLS ----------------------------------


def extract_ontology_lexical_view(ontology_path: str) -> list:
    """
    Build a "label: comment" lexical view of a Turtle ontology file.

    The file is read as plain text and split into Turtle statements (each
    terminated by a '.' outside of string literals, IRIs and comments). Every
    rdfs:label found in a statement is paired with the rdfs:comment of that
    same statement, so an entity without a comment no longer shifts the
    comments of all following entities onto the wrong labels.

    Args:
        ontology_path: Path to the ontology file (read as UTF-8 text).

    Returns:
        A list of strings in file order, formatted as
        ['label: comment', 'label2: comment2']. Spaces are removed from each
        label; a label whose statement has no rdfs:comment yields 'label: '.
    """
    with open(ontology_path, 'r', encoding='utf-8') as f:
        content = f.read()

    label_pattern = r'rdfs:label\s+"([^"]+)"'
    comment_pattern = r'rdfs:comment\s+"([^"]+)"'

    # Scanner that skips over anything which may legitimately contain a '.'
    # (string literals, IRIs, comments) and captures only real statement
    # terminators: a '.' followed by whitespace or end of input.
    scanner = re.compile(
        r'"""(?:[^"\\]|\\.|"(?!""))*"""'   # long string literal
        r'|"(?:[^"\\]|\\.)*"'              # short string literal
        r'|<[^<>\s]*>'                     # IRI reference
        r'|#[^\n]*'                        # comment up to end of line
        r'|(\.)(?=\s|$)',                  # statement terminator
        re.DOTALL,
    )

    # Cut the text into statements at each terminator.
    statements = []
    start = 0
    for token in scanner.finditer(content):
        if token.group(1):
            statements.append(content[start:token.end()])
            start = token.end()
    statements.append(content[start:])  # trailing text without a terminator

    lexical_view = []
    for statement in statements:
        comment_match = re.search(comment_pattern, statement)
        comment = comment_match.group(1) if comment_match else ""
        for label_match in re.finditer(label_pattern, statement):
            label = label_match.group(1).replace(" ", "")
            lexical_view.append(f"{label}: {comment}")

    return lexical_view




def extract_local_names(ttl_file_path):
    """
    Parse a Turtle ontology and list the local names of its declared terms.

    A local name is the part of the URI after the last '#', or after the last
    '/' when the URI contains no '#'.

    Args:
        ttl_file_path: Path of the Turtle file handed to rdflib's parser.

    Returns:
        A 3-tuple of alphabetically sorted lists:
        - names of subjects typed owl:Class (entities)
        - names of subjects typed owl:ObjectProperty (relationships)
        - names of subjects typed owl:DatatypeProperty (attributes)
    """
    ontology = Graph()
    ontology.parse(ttl_file_path, format="turtle")

    def local_name_of(term):
        # Prefer the fragment separator; fall back to the last path segment.
        text = str(term)
        separator = '#' if '#' in text else '/'
        return text.rpartition(separator)[2]

    def sorted_names(owl_type):
        # Every subject declared with `rdf:type owl_type`, as sorted local names.
        return sorted(
            local_name_of(subject)
            for subject, _, _ in ontology.triples((None, RDF.type, owl_type))
        )

    return (
        sorted_names(OWL.Class),
        sorted_names(OWL.ObjectProperty),
        sorted_names(OWL.DatatypeProperty),
    )




def chunk_document(document, chunk_size=3000, chunk_overlap=50):
    """
    Split a single document into chunks with RecursiveCharacterTextSplitter.

    Args:
        document: The document to split; it is passed to the splitter as a
            one-element list.
        chunk_size: Forwarded to the splitter as `chunk_size`.
        chunk_overlap: Forwarded to the splitter as `chunk_overlap`.

    Returns:
        Whatever `split_documents` returns for the one-document list.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents([document])
    return pieces




def merge_graph_documents(graph_documents):
    """
    Merge all nodes and relationships from several graph documents into one.

    Nodes are de-duplicated by `id` and relationships by the key
    (source id, target id, type); in both cases the first occurrence is kept
    and the original order is preserved. The merged document is built with the
    same class as the first input document and reuses that document's `source`.

    Args:
        graph_documents: Sequence of graph documents exposing `nodes`,
            `relationships` and `source`.

    Returns:
        A one-element list containing the merged document, or an empty list
        when `graph_documents` is empty (previously this raised IndexError).
    """
    if not graph_documents:
        print("Merged: 0 nodes, 0 relationships")
        return []

    # dicts keep insertion order, and setdefault keeps the first value seen
    # for a key, which gives "first occurrence wins" de-duplication.
    nodes_by_id = {}
    relationships_by_key = {}
    for doc in graph_documents:
        for node in doc.nodes:
            nodes_by_id.setdefault(node.id, node)
        for rel in doc.relationships:
            rel_key = (rel.source.id, rel.target.id, rel.type)
            relationships_by_key.setdefault(rel_key, rel)

    unique_nodes = list(nodes_by_id.values())
    unique_relationships = list(relationships_by_key.values())

    # Create merged document using the first document's class and source
    first_doc = graph_documents[0]
    merged_doc = type(first_doc)(
        nodes=unique_nodes,
        relationships=unique_relationships,
        source=first_doc.source
    )

    print(f"Merged: {len(unique_nodes)} nodes, {len(unique_relationships)} relationships")
    return [merged_doc]




def visualize_graph(graph_documents):
    """
    Render the first graph document as an interactive pyvis network.

    Code from https://github.com/thu-vu92/knowledge-graph-llms/tree/main

    Only relationships whose source and target ids are both present in the
    document's node list are drawn, and only nodes that take part in such a
    relationship are added. The graph is written to "knowledge_graph.html" in
    the current working directory, and the function then tries to open that
    file in the default browser.

    Args:
        graph_documents: Sequence of graph documents; only element 0 is used.
    """
    import webbrowser
    from pathlib import Path

    # Create network
    net = Network(height="1200px", width="100%", directed=True,
                      notebook=False, bgcolor="#222222", font_color="white")

    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships

    # Build lookup for valid nodes
    node_dict = {node.id: node for node in nodes}

    # Filter out invalid edges and collect valid node IDs
    valid_edges = []
    valid_node_ids = set()
    for rel in relationships:
        if rel.source.id in node_dict and rel.target.id in node_dict:
            valid_edges.append(rel)
            valid_node_ids.update([rel.source.id, rel.target.id])

    # Add valid nodes; a node pyvis rejects is reported and skipped
    for node_id in valid_node_ids:
        node = node_dict[node_id]
        try:
            net.add_node(node.id, label=node.id, title=node.type, group=node.type)
        except Exception as exc:
            print(f"Skipping node {node.id!r}: {exc}")

    # Add valid edges; an edge pyvis rejects is reported and skipped
    for rel in valid_edges:
        try:
            net.add_edge(rel.source.id, rel.target.id, label=rel.type.lower())
        except Exception as exc:
            print(f"Skipping edge {rel.source.id!r} -> {rel.target.id!r}: {exc}")

    # Configure physics
    net.set_options("""
            {
                "physics": {
                    "forceAtlas2Based": {
                        "gravitationalConstant": -100,
                        "centralGravity": 0.01,
                        "springLength": 200,
                        "springConstant": 0.08
                    },
                    "minVelocity": 0.75,
                    "solver": "forceAtlas2Based"
                }
            }
            """)

    output_file = "knowledge_graph.html"
    net.save_graph(output_file)
    output_path = os.path.abspath(output_file)
    print(f"Graph saved to {output_path}")

    # Try to open in browser. Path.as_uri() builds a well-formed file URI
    # (handles Windows drive letters, backslashes and spaces), which plain
    # "file://" + path concatenation does not.
    try:
        opened = webbrowser.open(Path(output_path).as_uri())
    except Exception:
        opened = False
    if not opened:
        print("Could not open browser automatically")




def save_graph_to_csv(graph_documents, output_file="knowledge_graph.csv"):
    """
    Save the knowledge graph to CSV format with structure (object, relation, subject).

    Only the relationships of the first graph document (``graph_documents[0]``)
    are exported, one row per relationship. The header row is always written,
    even when there are no relationships, so the file keeps a stable schema.

    Args:
        graph_documents: The graph documents from LLMGraphTransformer
        output_file: Output CSV filename

    Returns:
        None. The CSV is written to ``output_file`` and a short summary is printed.
    """
    # Explicit column order: without it, an empty relationship list would give
    # a DataFrame with no columns and therefore a CSV with no header.
    columns = ['object', 'relation', 'subject']

    relationships = graph_documents[0].relationships

    # Extract triples in (object, relation, subject) format.
    # NOTE(review): the relationship *source* is stored under 'object' and the
    # *target* under 'subject'; this follows the documented format, but it is the
    # reverse of the usual subject/object naming — confirm it is intended.
    triples = [
        {
            'object': rel.source.id,      # source node as object
            'relation': rel.type,         # relationship type
            'subject': rel.target.id      # target node as subject
        }
        for rel in relationships
    ]

    # Create DataFrame and save to CSV
    df = pd.DataFrame(triples, columns=columns)
    df.to_csv(output_file, index=False)

    print(f"Graph saved to CSV: {os.path.abspath(output_file)}")
    print(f"Total triples: {len(triples)}")


# ---------------------------------- RUNNING ----------------------------------


classes, relations, attributes = extract_local_names('results/ontologies/rdb/rigor_ontology_few_fixes.ttl')

input_txt_folder = "data/texts"
for filename in os.listdir(input_txt_folder):
    if filename.endswith('opportunity_example_eng.txt'):
        file_path = os.path.join(input_txt_folder, filename)
        print(f"Processing {filename}...")

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except Exception as e:
            print(f"Error processing {filename}: {e}")


document = Document(page_content=content)
chunks = chunk_document(document)

graph_transformer = LLMGraphTransformer(llm = gpt41_nano,
                                    allowed_nodes = classes,
                                    allowed_relationships=relations)


graph_documents = await graph_transformer.aconvert_to_graph_documents(chunks)

graph_documents = merge_graph_documents(graph_documents)

Processing opportunity_example_eng.txt...
Merged: 81 nodes, 65 relationships


In [35]:
# Render the merged graph to knowledge_graph.html and try to open it in the browser.
visualize_graph(graph_documents)

Graph saved to c:\Users\tiago\Documents\Granter Ai Internship\Implementation\KGs_for_Vertical_AI\approaches\02_knowledge_graphs\knowledge_graph.html


In [None]:
def save_graph(graph_documents, output_file="graph.csv"):
    """
    Save the first graph document as a two-section CSV with integer node IDs.

    The file contains a node table (``node_id,node_attr``), an empty separator
    line, and an edge table (``src,edge_attr,dst``). Node IDs are 1-based
    integers assigned in first-seen order. Text fields are always wrapped in
    double quotes, and embedded double quotes are doubled so that values
    containing ``"`` still form valid CSV rows.

    Args:
        graph_documents: Sequence whose first element exposes ``nodes`` and
            ``relationships``; only ``graph_documents[0]`` is written.
        output_file: Output CSV filename.

    Returns:
        dict: Mapping from original node ID to the integer ID used in the file.
    """
    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships

    def _quoted(value):
        # Always-quoted CSV field; a literal '"' is escaped by doubling it.
        return '"' + str(value).replace('"', '""') + '"'

    # Create a mapping from node IDs to sequential integers (first occurrence wins)
    node_id_mapping = {}
    for node in nodes:
        if node.id not in node_id_mapping:
            node_id_mapping[node.id] = len(node_id_mapping) + 1

    # Nodes section. Format of node_attr: "Type: NodeName" or just "NodeName" if no type
    csv_data = ["node_id,node_attr"]
    for node in nodes:
        if hasattr(node, 'type') and node.type:
            node_attr = f"{node.type}: {node.id}"
        else:
            node_attr = node.id
        csv_data.append(f'{node_id_mapping[node.id]},{_quoted(node_attr)}')

    # Empty line separates the node table from the edge table
    csv_data.append("")

    # Edges section
    csv_data.append("src,edge_attr,dst")
    edges_written = 0
    for rel in relationships:
        src_id = node_id_mapping.get(rel.source.id)
        dst_id = node_id_mapping.get(rel.target.id)

        # Skip edges whose endpoints are not in the node list
        if src_id is None or dst_id is None:
            continue

        csv_data.append(f'{src_id},{_quoted(rel.type)},{dst_id}')
        edges_written += 1

    # Write to file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(csv_data))

    print(f"Graph saved to: {os.path.abspath(output_file)}")
    print(f"Total nodes: {len(nodes)}")
    # Report the edges actually written, not the number of input relationships
    print(f"Total edges: {edges_written}")
    skipped = len(relationships) - edges_written
    if skipped:
        print(f"Skipped edges with unknown endpoints: {skipped}")
    print(f"Node ID mapping: {node_id_mapping}")

    return node_id_mapping

In [None]:
# Write the merged graph to the default "graph.csv". The returned node-ID mapping
# is the cell's last expression, so the notebook displays it as the cell output.
save_graph(graph_documents)

Graph saved to: c:\Users\tiago\Documents\Granter Ai Internship\Implementation\KGs_for_Vertical_AI\approaches\02_knowledge_graphs\gretriever_graph.csv
Total nodes: 93
Total edges: 70
Node ID mapping: {'Mar2030 -2023 -22': 1, 'Transformação De Produtos Da Pesca E Da Aquicultura Na Região Autónoma Da Madeira': 2, '14/11/2023': 3, 'Concurso': 4, 'Operações': 5, 'Srmar': 6, '25/07/2023': 7, 'Article 50.º Da Portaria N.º 559/2023': 8, 'Região Autónoma Da Madeira': 9, 'Programa Mar 2030': 10, '+ 351 291 203 250': 11, 'Drp@Madeira.Gov.Pt': 12, 'Fundos Europeus Mais Próximos De Si.': 13, 'Promover A Comercialização, A Qualidade, O Valor Acrescentado Dos Produtos Da Pesca E Da Aquicultura, Assim Como A Transformação Destes Produtos.': 14, 'Programa  Mar 2030': 15, '2 - Fomentar Atividades De Aquicultura Sustentáveis E A Transformação E Comercialização De Produtos Da Pesca E Da Aquicultura, Contribuindo Assim Para A Segurança Alimentar Da União': 16, 'Fso2.2': 17, 'Fso2.2 -01-Investimentos Produt

{'Mar2030 -2023 -22': 1,
 'Transformação De Produtos Da Pesca E Da Aquicultura Na Região Autónoma Da Madeira': 2,
 '14/11/2023': 3,
 'Concurso': 4,
 'Operações': 5,
 'Srmar': 6,
 '25/07/2023': 7,
 'Article 50.º Da Portaria N.º 559/2023': 8,
 'Região Autónoma Da Madeira': 9,
 'Programa Mar 2030': 10,
 '+ 351 291 203 250': 11,
 'Drp@Madeira.Gov.Pt': 12,
 'Fundos Europeus Mais Próximos De Si.': 13,
 'Promover A Comercialização, A Qualidade, O Valor Acrescentado Dos Produtos Da Pesca E Da Aquicultura, Assim Como A Transformação Destes Produtos.': 14,
 'Programa  Mar 2030': 15,
 '2 - Fomentar Atividades De Aquicultura Sustentáveis E A Transformação E Comercialização De Produtos Da Pesca E Da Aquicultura, Contribuindo Assim Para A Segurança Alimentar Da União': 16,
 'Fso2.2': 17,
 'Fso2.2 -01-Investimentos Produtivos Na Transformação': 18,
 'Fso2.2.02 - Investimentos Em Eficiência Energética, A Economia Circular, A Digitalização E A Internacionalização': 19,
 'Fso2.2 -01-01-Investimentos Pro