In [1]:
import os
from serka.fetchers import EIDCFetcher
from serka.graph.extractors import (
	AuthorExtractor,
	DatasetExtractor,
	OrganisationExtractor,
	RelationshipExtractor,
	TextExtractor,
)
from haystack import Pipeline
from haystack.components.preprocessors import DocumentSplitter
from haystack_integrations.components.embedders.ollama import OllamaDocumentEmbedder
from dotenv import load_dotenv

load_dotenv()
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

In [2]:
from serka.graph.embedders import OllamaNodeEmbedder
from serka.graph.joiners import NodeJoiner
from serka.graph.writers import Neo4jGraphWriter

p = Pipeline()
p.add_component("fetcher", EIDCFetcher())
p.add_component("author_extractor", AuthorExtractor())
p.add_component("orgs_extractor", OrganisationExtractor())
p.add_component("dataset_extractor", DatasetExtractor())
p.add_component("text_extractor", TextExtractor(["description", "lineage"]))
p.add_component(
	"splitter", DocumentSplitter(split_by="word", split_length=150, split_overlap=50)
)
p.add_component(
	"doc_emb", OllamaDocumentEmbedder(meta_fields_to_embed=["title", "field"])
)
p.add_component("joiner", NodeJoiner())
p.add_component("rel_extractor", RelationshipExtractor())
p.add_component("node_emb", OllamaNodeEmbedder())
p.add_component(
	"graph_writer", Neo4jGraphWriter(username=NEO4J_USERNAME, password=NEO4J_PASSWORD)
)

p.connect("fetcher", "author_extractor")
p.connect("fetcher", "orgs_extractor")
p.connect("fetcher", "dataset_extractor")
p.connect("fetcher", "rel_extractor")
p.connect("fetcher", "text_extractor")

p.connect("author_extractor", "joiner.authors")
p.connect("orgs_extractor", "joiner.orgs")
p.connect("dataset_extractor", "joiner.datasets")

p.connect("text_extractor", "splitter")
p.connect("splitter", "doc_emb")
p.connect("doc_emb", "graph_writer.docs")

p.connect("joiner", "node_emb")
p.connect("node_emb", "graph_writer.nodes")
p.connect("rel_extractor", "graph_writer.relations")

p.run(data={"fetcher": {"rows": 10}})

2025-04-28 15:42:20,344 - haystack.core.pipeline.base - INFO - Warming up component splitter...
2025-04-28 15:42:20,345 - haystack.core.pipeline.pipeline - INFO - Running component fetcher
2025-04-28 15:42:20,481 - haystack.core.pipeline.pipeline - INFO - Running component author_extractor
2025-04-28 15:42:20,482 - haystack.core.pipeline.pipeline - INFO - Running component orgs_extractor
2025-04-28 15:42:20,483 - haystack.core.pipeline.pipeline - INFO - Running component dataset_extractor
2025-04-28 15:42:20,484 - haystack.core.pipeline.pipeline - INFO - Running component rel_extractor
2025-04-28 15:42:20,485 - haystack.core.pipeline.pipeline - INFO - Running component text_extractor
2025-04-28 15:42:20,485 - haystack.core.pipeline.pipeline - INFO - Running component joiner
2025-04-28 15:42:20,486 - haystack.core.pipeline.pipeline - INFO - Running component splitter
2025-04-28 15:42:20,487 - haystack.core.pipeline.pipeline - INFO - Running component node_emb
Calculating embeddings:   0

[]


2025-04-28 15:42:22,665 - neo4j.notifications - INFO - Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.CartesianProduct} {category: PERFORMANCE} {title: This query builds a cartesian product between disconnected patterns.} {description: If a part of a query contains multiple disconnected patterns, this will build a cartesian product between all those parts. This may produce a large amount of data and slow down query processing. While occasionally intended, it may often be possible to reformulate the query that avoids the use of this cross product, perhaps by adding a relationship between the different parts or by using OPTIONAL MATCH (identifier is: (b))} {position: line: 1, column: 31, offset: 30} for query: 'UNWIND $relations as relation MATCH (a), (b) WHERE a.uri = relation[0] AND b.uri = relation[1] MERGE (a)-[:AUTHORED_BY]->(b) RETURN COUNT(*)'
2025-04-28 15:42:22,723 - neo4j.notifications - INFO - Received notification from 

{'doc_emb': {'meta': {'model': 'nomic-embed-text'}},
 'graph_writer': {'nodes_created': {'Person': 22,
   'Organisation': 2,
   'Dataset': 10,
   'Document': 29},
  'relations_created': {'AUTHORED_BY': 35,
   'AFFILIATED_WITH': 9,
   'CONTRIBUTED_TO': 19,
   'DESCRIPTION_OF': 16,
   'LINEAGE_OF': 13}}}

In [3]:
GRAPH_PROMPT = """
# Overview
You are a helpful assistant. 
You have access to a knowledge graph containing information about datasets contained in the EIDC (Environmental Information Data Centre).
You will be given a query inputted by a user, and you need to find the most relevant information in the graph to answer it.
You will be given a list of nodes and relationships from the knowledge graph that are most relevant to the query.
The list of nodes and relationships will be in the format:
```
[{
    'start_node': {}, # Node and it's properties that are relevant to the query
    'start_labels': [], # A list of labels for the start node representing the type of entity it is, e.g. ['Dataset', 'Organisation', 'Person', 'TextChunk']
    'relationship_type': "", # A string representing the type of relationship between the start_node and the connected_node
    'connected_node': {}, # The node the start_node is related to, and it's properties
    'connected_labels': [] # A list of labels for the connected_node representing the type of entity it is, e.g. ['Dataset', 'Organisation', 'Person', 'TextChunk']
    'score': 0.0 # A float representing how relevant the start_node is to the query
}]
```

If there is not enough information to answer the query, you should say that you cannot answer the query, instead provide a set of links to pany potentially relevant sources from the knowledge graph.
If you can answer the question you should provide a short answer to the question, followed by list of URIs from the nodes in the knowledge graph that helped you answer the question.
Your answer should be in the following markdown format:
```
# Query
Who is the author of the smog monitoring dataset?

# Answer
The "Smog Monitoring Dataset"[1] was authored by "John Doe"[2], "Jane Smith"[3], and "Alice Johnson"[4].

# References
[1] https://doi.org/10.1234/a1235-1234-1234-6789-123456789
[2] https://orcid.org/0000-0001-0459-506X
[3] https://orcid.org/0000-0002-1234-306B
[4] https://orcid.org/0000-0003-9876-302C
```

# Relevant Information
The list of most relevant nodes and relationships from the knowledge graph is:
```
{{nodes}}
```

# User Query
The query is: {{query}}
"""

In [6]:
from haystack_integrations.components.embedders.ollama import OllamaTextEmbedder
from serka.graph.readers import Neo4jGraphReader
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.ollama.generator import OllamaGenerator
from haystack.components.builders.answer_builder import AnswerBuilder

query = "What are some datasets available in the EIDC?"

p = Pipeline()

p.add_component("embedder", OllamaTextEmbedder())
p.add_component(
	"reader", Neo4jGraphReader(username=NEO4J_USERNAME, password=NEO4J_PASSWORD)
)
p.add_component("prompt_builder", PromptBuilder(GRAPH_PROMPT))
p.add_component(
	"llm",
	OllamaGenerator(
		model="llama3.1",
		generation_kwargs={"num_ctx": 16384, "temperature": 0.0},
	),
)
p.add_component("answer_builder", AnswerBuilder())

p.connect("embedder", "reader")
p.connect("reader", "prompt_builder.nodes")
p.connect("prompt_builder", "llm")
p.connect("llm.replies", "answer_builder.replies")
p.connect("prompt_builder.prompt", "answer_builder.query")

p.run({"embedder": {"text": query}, "prompt_builder": {"query": query}})

2025-04-28 15:45:46,630 - haystack.core.pipeline.pipeline - INFO - Running component embedder


2025-04-28 15:45:46,656 - httpx - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-04-28 15:45:46,657 - haystack.core.pipeline.pipeline - INFO - Running component reader
2025-04-28 15:45:46,680 - haystack.core.pipeline.pipeline - INFO - Running component prompt_builder
2025-04-28 15:45:46,680 - haystack.core.pipeline.pipeline - INFO - Running component llm
2025-04-28 15:45:55,890 - httpx - INFO - HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
2025-04-28 15:45:55,893 - haystack.core.pipeline.pipeline - INFO - Running component answer_builder


{'embedder': {'meta': {'model': 'nomic-embed-text'}},
 'llm': {'meta': [{'model': 'llama3.1',
    'created_at': '2025-04-28T14:45:55.884403274Z',
    'done': True,
    'done_reason': 'stop',
    'total_duration': 9202234847,
    'load_duration': 19054308,
    'prompt_eval_count': 9154,
    'prompt_eval_duration': 3729632953,
    'eval_count': 283,
    'eval_duration': 5452729648,
    'context': [128006,
     882,
     128007,
     1432,
     2,
     35907,
     198,
     2675,
     527,
     264,
     11190,
     18328,
     13,
     720,
     2675,
     617,
     2680,
     311,
     264,
     6677,
     4876,
     8649,
     2038,
     922,
     30525,
     13282,
     304,
     279,
     469,
     926,
     34,
     320,
     83166,
     8245,
     2956,
     14821,
     4390,
     2675,
     690,
     387,
     2728,
     264,
     3319,
     1988,
     6702,
     555,
     264,
     1217,
     11,
     323,
     499,
     1205,
     311,
     1505,
     279,
     1455,
     9959,
