In [2]:
import os
from serka.fetchers import EIDCFetcher
from serka.graph.extractors import (
	AuthorExtractor,
	DatasetExtractor,
	OrganisationExtractor,
	RelationshipExtractor,
	TextExtractor,
)
from haystack import Pipeline
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.ollama import OllamaDocumentEmbedder
from neo4j_haystack import Neo4jDocumentStore
from dotenv import load_dotenv

load_dotenv()
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

doc_store = Neo4jDocumentStore(
	url="bolt://localhost:7687",
	username=NEO4J_USERNAME,
	password=NEO4J_PASSWORD,
	index="text-chunk-embeddings",
	embedding_field="embedding",
	database="neo4j",
	embedding_dim=768,
	node_label="TextChunk",
)

p = Pipeline()
p.add_component("fetcher", EIDCFetcher())
p.add_component("author_extractor", AuthorExtractor())
p.add_component("dataset_extractor", DatasetExtractor())
p.add_component("org_extractor", OrganisationExtractor())
p.add_component("rel_extractor", RelationshipExtractor())
p.add_component("text_extractor", TextExtractor(["description"]))
p.add_component(
	"splitter", DocumentSplitter(split_by="word", split_length=50, split_overlap=20)
)
p.add_component(
	"embedder",
	OllamaDocumentEmbedder(model="nomic-embed-text", url="http://localhost:11434"),
)
p.add_component("writer", DocumentWriter(doc_store))

p.connect("fetcher", "author_extractor")
p.connect("fetcher", "dataset_extractor")
p.connect("fetcher", "org_extractor")
p.connect("fetcher", "rel_extractor")
p.connect("fetcher", "text_extractor")
p.connect("text_extractor", "splitter")
p.connect("splitter", "embedder")
p.connect("embedder", "writer")

p.run(data={"fetcher": {"rows": 1}})

2025-04-23 09:12:30,360 - haystack.core.pipeline.base - INFO - Warming up component splitter...
2025-04-23 09:12:30,363 - haystack.core.pipeline.pipeline - INFO - Running component fetcher
2025-04-23 09:12:30,422 - haystack.core.pipeline.pipeline - INFO - Running component author_extractor
2025-04-23 09:12:30,425 - haystack.core.pipeline.pipeline - INFO - Running component dataset_extractor
2025-04-23 09:12:30,427 - haystack.core.pipeline.pipeline - INFO - Running component org_extractor
2025-04-23 09:12:30,429 - haystack.core.pipeline.pipeline - INFO - Running component rel_extractor
2025-04-23 09:12:30,431 - haystack.core.pipeline.pipeline - INFO - Running component text_extractor
2025-04-23 09:12:30,433 - haystack.core.pipeline.pipeline - INFO - Running component splitter
2025-04-23 09:12:30,436 - haystack.core.pipeline.pipeline - INFO - Running component embedder
Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-23 09:12:39,404 - httpx - INFO - HTTP Request: POST

{'author_extractor': {'authors': [{'forename': 'C.',
    'surname': 'Marston',
    'uri': 'https://orcid.org/0000-0002-2070-2187'},
   {'forename': 'C.S.',
    'surname': 'Rowland',
    'uri': 'https://orcid.org/0000-0002-0459-506X'},
   {'forename': 'A.W.',
    'surname': "O'Neil",
    'uri': 'https://orcid.org/0000-0003-3591-1034'},
   {'forename': 'R.D.',
    'surname': 'Morton',
    'uri': 'https://orcid.org/0000-0003-3947-6463'}]},
 'dataset_extractor': {'datasets': [{'uri': 'https://doi.org/10.5285/e44ae9bd-fa32-4aab-9524-fbb11d34a20a',
    'title': 'Land Cover Map 2021 (10m classified pixels, N. Ireland)'}]},
 'org_extractor': {'organisations': [{'name': 'UK Centre for Ecology & Hydrology',
    'uri': 'https://ror.org/00pggkr55'},
   {'name': 'NERC EDS Environmental Information Data Centre',
    'uri': 'https://ror.org/04xw4m193'}]},
 'rel_extractor': {'relationships': {'AUTHORED_BY': [('https://doi.org/10.5285/e44ae9bd-fa32-4aab-9524-fbb11d34a20a',
     'https://orcid.org/0000-

In [1]:
from serka.graph.embedders import OllamaNodeEmbedder
from serka.fetchers import EIDCFetcher
from serka.graph.joiners import NodeJoiner
from serka.graph.extractors import (
	AuthorExtractor,
	DatasetExtractor,
	OrganisationExtractor,
	RelationshipExtractor,
	TextExtractor,
)
from haystack import Pipeline

p = Pipeline()
p.add_component("fetcher", EIDCFetcher())
p.add_component("author_extractor", AuthorExtractor())
p.add_component("orgs_extractor", OrganisationExtractor())
p.add_component("dataset_extractor", DatasetExtractor())
p.add_component("joiner", NodeJoiner())
p.add_component("node_emb", OllamaNodeEmbedder())

p.connect("fetcher", "author_extractor")
p.connect("fetcher", "orgs_extractor")
p.connect("fetcher", "dataset_extractor")
p.connect("author_extractor", "joiner.authors")
p.connect("orgs_extractor", "joiner.orgs")
p.connect("dataset_extractor", "joiner.datasets")
p.connect("joiner", "node_emb")

p.run(data={"fetcher": {"rows": 1}})

2025-04-23 11:33:24,480 - haystack.core.pipeline.pipeline - INFO - Running component fetcher
2025-04-23 11:33:24,514 - haystack.core.pipeline.pipeline - INFO - Running component author_extractor
2025-04-23 11:33:24,515 - haystack.core.pipeline.pipeline - INFO - Running component orgs_extractor
2025-04-23 11:33:24,516 - haystack.core.pipeline.pipeline - INFO - Running component dataset_extractor
2025-04-23 11:33:24,518 - haystack.core.pipeline.pipeline - INFO - Running component joiner
2025-04-23 11:33:24,519 - haystack.core.pipeline.pipeline - INFO - Running component node_emb
Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-23 11:33:25,548 - httpx - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
Calculating embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-23 11:33:25,884 - httpx - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
Ca

{'node_emb': {'node_embeddings': {'Person': [{'forename': 'C.',
     'surname': 'Marston',
     'uri': 'https://orcid.org/0000-0002-2070-2187',
     'embedding': [0.011244291,
      -0.0005762915,
      -0.1906709,
      -0.06378972,
      -0.0025900851,
      0.014205662,
      -0.013065181,
      -0.0073776995,
      -0.026925655,
      -0.032691084,
      -0.094102345,
      -0.004674986,
      0.053881355,
      0.0065762424,
      -1.2002888e-05,
      -0.021469064,
      0.02688503,
      -0.05555874,
      0.021485388,
      -0.02605809,
      -0.040760238,
      -0.036669914,
      0.050697885,
      0.03890607,
      0.09657331,
      -0.017475653,
      0.050466415,
      0.028966412,
      -0.008917176,
      -0.038961355,
      0.013268768,
      0.041216142,
      -0.049912766,
      -0.04070744,
      -0.06864177,
      -0.036028042,
      0.013568889,
      0.040504612,
      -0.027537897,
      0.0077529037,
      -0.0022356752,
      -0.010646258,
      0.009515752,
  