In [4]:
import os
from serka.fetchers import EIDCFetcher
from serka.graph.extractors import (
	AuthorExtractor,
	DatasetExtractor,
	OrganisationExtractor,
	RelationshipExtractor,
	TextExtractor,
)
from haystack import Pipeline
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.ollama import OllamaDocumentEmbedder
from neo4j_haystack import Neo4jDocumentStore
from dotenv import load_dotenv

# load_dotenv() runs first so that the credential lookups below see any
# variables it adds to the process environment.
load_dotenv()
# Either value is None when the corresponding variable is not set.
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

# Connection and index settings for the Neo4j-backed document store that the
# pipeline's writer component persists documents into.
_store_settings = {
	"url": "bolt://localhost:7687",
	"database": "neo4j",
	"username": NEO4J_USERNAME,
	"password": NEO4J_PASSWORD,
	"node_label": "TextChunk",
	"index": "text-chunk-embeddings",
	"embedding_field": "embedding",
	# NOTE(review): presumably the vector size produced by the embedding model
	# configured on the pipeline's embedder -- confirm if the model changes.
	"embedding_dim": 768,
}
doc_store = Neo4jDocumentStore(**_store_settings)

# Build the pipeline and register every component under the name used by the
# connect() calls and by the run() input mapping.
p = Pipeline()
for _component_name, _component in (
	("fetcher", EIDCFetcher()),
	("author_extractor", AuthorExtractor()),
	("dataset_extractor", DatasetExtractor()),
	("org_extractor", OrganisationExtractor()),
	("rel_extractor", RelationshipExtractor()),
	("text_extractor", TextExtractor(["description"])),
	# Word-based chunks of 50 words, with 20 words shared between neighbours.
	(
		"splitter",
		DocumentSplitter(split_by="word", split_length=50, split_overlap=20),
	),
	(
		"embedder",
		OllamaDocumentEmbedder(
			model="nomic-embed-text", url="http://localhost:11434"
		),
	),
	# The writer stores documents in the Neo4j document store created earlier.
	("writer", DocumentWriter(doc_store)),
):
	p.add_component(_component_name, _component)

# Wire the graph: the fetcher feeds every extractor, and the text branch
# continues through splitter -> embedder -> writer.
for _sender, _receiver in (
	("fetcher", "author_extractor"),
	("fetcher", "dataset_extractor"),
	("fetcher", "org_extractor"),
	("fetcher", "rel_extractor"),
	("fetcher", "text_extractor"),
	("text_extractor", "splitter"),
	("splitter", "embedder"),
	("embedder", "writer"),
):
	p.connect(_sender, _receiver)

# NOTE(review): "rows" presumably caps how many records the fetcher requests --
# confirm against EIDCFetcher.
_run_inputs = {"fetcher": {"rows": 1}}
# Kept as the final bare expression so the notebook displays the returned dict.
p.run(data=_run_inputs)

2025-04-22 15:19:46,532 - haystack.core.pipeline.base - INFO - Warming up component splitter...
2025-04-22 15:19:46,533 - haystack.core.pipeline.pipeline - INFO - Running component fetcher
2025-04-22 15:19:46,662 - haystack.core.pipeline.pipeline - INFO - Running component author_extractor
2025-04-22 15:19:46,663 - haystack.core.pipeline.pipeline - INFO - Running component dataset_extractor
2025-04-22 15:19:46,663 - haystack.core.pipeline.pipeline - INFO - Running component org_extractor
2025-04-22 15:19:46,663 - haystack.core.pipeline.pipeline - INFO - Running component rel_extractor
2025-04-22 15:19:46,664 - haystack.core.pipeline.pipeline - INFO - Running component text_extractor
2025-04-22 15:19:46,664 - haystack.core.pipeline.pipeline - INFO - Running component splitter
2025-04-22 15:19:46,665 - haystack.core.pipeline.pipeline - INFO - Running component embedder
Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-22 15:19:47,712 - httpx - INFO - HTTP Request: POST

{'author_extractor': {'authors': [{'forename': 'C.',
    'surname': 'Marston',
    'uri': 'https://orcid.org/0000-0002-2070-2187'},
   {'forename': 'C.S.',
    'surname': 'Rowland',
    'uri': 'https://orcid.org/0000-0002-0459-506X'},
   {'forename': 'A.W.',
    'surname': "O'Neil",
    'uri': 'https://orcid.org/0000-0003-3591-1034'},
   {'forename': 'R.D.',
    'surname': 'Morton',
    'uri': 'https://orcid.org/0000-0003-3947-6463'}]},
 'dataset_extractor': {'datasets': [{'uri': 'https://doi.org/10.5285/abe1f414-6168-4e04-9dc9-4a658a3136ca',
    'title': 'Land Cover Map 2021 (land parcels, N. Ireland)'}]},
 'org_extractor': {'organisations': [{'name': 'UK Centre for Ecology & Hydrology',
    'uri': 'https://ror.org/00pggkr55'},
   {'name': 'NERC EDS Environmental Information Data Centre',
    'uri': 'https://ror.org/04xw4m193'}]},
 'rel_extractor': {'relationships': {'AUTHORED_BY': [('https://doi.org/10.5285/abe1f414-6168-4e04-9dc9-4a658a3136ca',
     'https://orcid.org/0000-0002-2070