In [1]:
import os
from serka.fetchers import EIDCFetcher
from serka.graph.extractors import (
	AuthorExtractor,
	DatasetExtractor,
	OrganisationExtractor,
	RelationshipExtractor,
	TextExtractor,
)
from haystack import Pipeline
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.ollama import OllamaDocumentEmbedder
from neo4j_haystack import Neo4jDocumentStore
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment
# before the credentials are read below.
load_dotenv()

# Neo4j credentials; each is None when the variable is not set.
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

In [None]:
from serka.graph.embedders import OllamaNodeEmbedder
from serka.graph.joiners import NodeJoiner
from serka.graph.writers import Neo4jGraphWriter

# Graph-ingestion pipeline: the fetcher feeds three extractors, their outputs
# are merged by the joiner, passed through the node embedder and handed to the
# Neo4j graph writer.
p = Pipeline()

# (name, factory) pairs. Each component is constructed immediately before it
# is registered, in the order listed.
_graph_components = (
	("fetcher", EIDCFetcher),
	("author_extractor", AuthorExtractor),
	("orgs_extractor", OrganisationExtractor),
	("dataset_extractor", DatasetExtractor),
	("joiner", NodeJoiner),
	("node_emb", OllamaNodeEmbedder),
	(
		"graph_writer",
		lambda: Neo4jGraphWriter(username=NEO4J_USERNAME, password=NEO4J_PASSWORD),
	),
)
for _component_name, _component_factory in _graph_components:
	p.add_component(_component_name, _component_factory())

# (sender, receiver) connections, wired in the order listed.
_graph_connections = (
	("fetcher", "author_extractor"),
	("fetcher", "orgs_extractor"),
	("fetcher", "dataset_extractor"),
	("author_extractor", "joiner.authors"),
	("orgs_extractor", "joiner.orgs"),
	("dataset_extractor", "joiner.datasets"),
	("joiner", "node_emb"),
	("node_emb", "graph_writer"),
)
for _sender, _receiver in _graph_connections:
	p.connect(_sender, _receiver)

# Kept as the final expression so the run result is shown as the cell output.
p.run(data={"fetcher": {"rows": 100}})

2025-04-23 15:13:42,064 - haystack.core.pipeline.pipeline - INFO - Running component fetcher
2025-04-23 15:13:42,128 - haystack.core.pipeline.pipeline - INFO - Running component author_extractor
2025-04-23 15:13:42,129 - haystack.core.pipeline.pipeline - INFO - Running component orgs_extractor
2025-04-23 15:13:42,130 - haystack.core.pipeline.pipeline - INFO - Running component dataset_extractor
2025-04-23 15:13:42,131 - haystack.core.pipeline.pipeline - INFO - Running component joiner
2025-04-23 15:13:42,132 - haystack.core.pipeline.pipeline - INFO - Running component node_emb
Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-23 15:13:47,633 - httpx - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
Calculating embeddings: 100%|██████████| 1/1 [00:05<00:00,  5.50s/it]
Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-23 15:13:47,995 - httpx - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
Ca

{'graph_writer': {'nodes_created': {'Person': 22,
   'Organisation': 2,
   'Dataset': 10}}}

In [None]:
# Neo4j-backed document store that the DocumentWriter below writes to.
# NOTE(review): embedding_dim=768 presumably matches the output size of the
# "nomic-embed-text" model used by the embedder below - confirm.
_doc_store_options = {
	"url": "bolt://localhost:7687",
	"username": NEO4J_USERNAME,
	"password": NEO4J_PASSWORD,
	"database": "neo4j",
	"index": "text-chunk-embeddings",
	"node_label": "TextChunk",
	"embedding_field": "embedding",
	"embedding_dim": 768,
}
doc_store = Neo4jDocumentStore(**_doc_store_options)

# Text-chunk pipeline: fetcher -> text_extractor -> splitter -> embedder -> writer.
p = Pipeline()

# (name, factory) pairs. Each component is constructed immediately before it
# is registered, in the order listed.
_chunk_components = (
	("fetcher", EIDCFetcher),
	("author_extractor", AuthorExtractor),
	("dataset_extractor", DatasetExtractor),
	("org_extractor", OrganisationExtractor),
	("rel_extractor", RelationshipExtractor),
	("text_extractor", lambda: TextExtractor(["description"])),
	(
		"splitter",
		lambda: DocumentSplitter(split_by="word", split_length=50, split_overlap=20),
	),
	(
		"embedder",
		lambda: OllamaDocumentEmbedder(
			model="nomic-embed-text", url="http://localhost:11434"
		),
	),
	# NOTE(review): no duplicate policy is passed to the writer - confirm that
	# re-running this cell against an already populated store behaves as intended.
	("writer", lambda: DocumentWriter(doc_store)),
)
for _component_name, _component_factory in _chunk_components:
	p.add_component(_component_name, _component_factory())

# (sender, receiver) connections, wired in the order listed. The author,
# dataset, org and rel extractors are fed by the fetcher, but no downstream
# component is connected to them in this pipeline.
_chunk_connections = (
	("fetcher", "author_extractor"),
	("fetcher", "dataset_extractor"),
	("fetcher", "org_extractor"),
	("fetcher", "rel_extractor"),
	("fetcher", "text_extractor"),
	("text_extractor", "splitter"),
	("splitter", "embedder"),
	("embedder", "writer"),
)
for _sender, _receiver in _chunk_connections:
	p.connect(_sender, _receiver)

# Kept as the final expression so the run result is shown as the cell output.
p.run(data={"fetcher": {"rows": 1}})