In [1]:
import os
from serka.fetchers import EIDCFetcher
from serka.graph.extractors import (
	AuthorExtractor,
	DatasetExtractor,
	OrganisationExtractor,
	RelationshipExtractor,
	TextExtractor,
)
from haystack import Pipeline
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.ollama import OllamaDocumentEmbedder
from neo4j_haystack import Neo4jDocumentStore
from dotenv import load_dotenv

# Load variables from a local .env file (if any) before reading credentials.
load_dotenv()

# Neo4j credentials; each is None when the variable is not set.
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD")

In [None]:
from serka.graph.embedders import OllamaNodeEmbedder
from serka.graph.joiners import NodeJoiner
from serka.graph.writers import Neo4jGraphWriter

# Graph ingestion pipeline: the fetcher feeds the entity, relationship and
# text extractors; entity nodes are joined and embedded, text is split and
# embedded, and all three streams end at the graph writer.
p = Pipeline()

# Components keyed by their registered name. Dict insertion order is the
# registration order.
graph_components = {
	"fetcher": EIDCFetcher(),
	"author_extractor": AuthorExtractor(),
	"orgs_extractor": OrganisationExtractor(),
	"dataset_extractor": DatasetExtractor(),
	"text_extractor": TextExtractor(["description", "lineage"]),
	"splitter": DocumentSplitter(
		split_by="word", split_length=150, split_overlap=50
	),
	"doc_emb": OllamaDocumentEmbedder(),
	"joiner": NodeJoiner(),
	"rel_extractor": RelationshipExtractor(),
	"node_emb": OllamaNodeEmbedder(),
	"graph_writer": Neo4jGraphWriter(
		username=NEO4J_USERNAME, password=NEO4J_PASSWORD
	),
}
for name, component in graph_components.items():
	p.add_component(name, component)

# (sender, receiver) pairs, connected in the order listed.
graph_connections = [
	# Fan the fetched records out to every extractor.
	("fetcher", "author_extractor"),
	("fetcher", "orgs_extractor"),
	("fetcher", "dataset_extractor"),
	("fetcher", "rel_extractor"),
	("fetcher", "text_extractor"),
	# Collect the entity nodes into a single joiner.
	("author_extractor", "joiner.authors"),
	("orgs_extractor", "joiner.orgs"),
	("dataset_extractor", "joiner.datasets"),
	# Text path: extract -> split -> embed -> writer's `docs` input.
	("text_extractor", "splitter"),
	("splitter", "doc_emb"),
	("doc_emb", "graph_writer.docs"),
	# Node path: join -> embed -> writer's `nodes` input; relationships go
	# straight to the writer's `relations` input.
	("joiner", "node_emb"),
	("node_emb", "graph_writer.nodes"),
	("rel_extractor", "graph_writer.relations"),
]
for sender, receiver in graph_connections:
	p.connect(sender, receiver)

# Kept as the final expression so the notebook displays the run result.
p.run(data={"fetcher": {"rows": 1}})

2025-04-25 11:26:06,446 - haystack.core.pipeline.base - INFO - Warming up component splitter...
2025-04-25 11:26:06,447 - haystack.core.pipeline.pipeline - INFO - Running component fetcher
2025-04-25 11:26:06,670 - haystack.core.pipeline.pipeline - INFO - Running component author_extractor
2025-04-25 11:26:06,675 - haystack.core.pipeline.pipeline - INFO - Running component orgs_extractor
2025-04-25 11:26:06,681 - haystack.core.pipeline.pipeline - INFO - Running component dataset_extractor
2025-04-25 11:26:06,687 - haystack.core.pipeline.pipeline - INFO - Running component rel_extractor
2025-04-25 11:26:06,693 - haystack.core.pipeline.pipeline - INFO - Running component text_extractor
2025-04-25 11:26:06,696 - haystack.core.pipeline.pipeline - INFO - Running component joiner
2025-04-25 11:26:06,698 - haystack.core.pipeline.pipeline - INFO - Running component splitter
2025-04-25 11:26:06,714 - haystack.core.pipeline.pipeline - INFO - Running component node_emb
Calculating embeddings:   0

{'doc_emb': {'meta': {'model': 'nomic-embed-text'}},
 'graph_writer': {'nodes_created': {'Person': 215,
   'Organisation': 57,
   'Dataset': 100,
   'Document': 305},
  'relations_created': {'AUTHORED_BY': 318,
   'AFFILIATED_WITH': 182,
   'CONTRIBUTED_TO': 410,
   'DESCRIPTION_OF': 139,
   'LINEAGE_OF': 166}}}

In [None]:
# Call the fetcher directly, outside any pipeline, to inspect what it returns
# for rows=10. The result is the cell's displayed value.
EIDCFetcher().run(rows=10)

{'records': [{'publicationDate': '2022-08-02T00:00:00.000+00:00',
   'authorAffiliation': ['UK Centre for Ecology & Hydrology'],
   'authorGivenName': ['C.', 'C.S.', 'A.W.', 'R.D.'],
   'authorFamilyName': ['Marston', 'Rowland', "O'Neil", 'Morton'],
   'authorFullName': ['Marston, C.',
    'Rowland, C.S.',
    "O'Neil, A.W.",
    'Morton, R.D.'],
   'authorOrcid': ['https://orcid.org/0000-0002-2070-2187',
    'https://orcid.org/0000-0002-0459-506X',
    'https://orcid.org/0000-0003-3591-1034',
    'https://orcid.org/0000-0003-3947-6463'],
   'authorRor': ['https://ror.org/00pggkr55'],
   'catalogue': 'eidc',
   'description': 'This is a 10m pixel data set representing the land surface of Northern Ireland, classified into 21 UKCEH land cover classes, based upon Biodiversity Action Plan broad habitats. It is a two-band raster in GeoTiff format. The first band gives the most likely land cover type; the second band gives the per-parcel probability of the land cover. A full description of t

In [5]:
# Neo4j-backed document store for the embedded text chunks.
doc_store = Neo4jDocumentStore(
	url="bolt://localhost:7687",
	username=NEO4J_USERNAME,
	password=NEO4J_PASSWORD,
	index="text-chunk-embeddings",
	embedding_field="embedding",
	database="neo4j",
	# Presumably has to match the embedder's vector size; TODO confirm that
	# nomic-embed-text produces 768-dimensional vectors.
	embedding_dim=768,
	node_label="TextChunk",
)

# Document ingestion pipeline: fetch -> extract description text -> split ->
# embed -> write to `doc_store`.
p = Pipeline()

# Components keyed by their registered name. Dict insertion order is the
# registration order.
doc_components = {
	"fetcher": EIDCFetcher(),
	"author_extractor": AuthorExtractor(),
	"dataset_extractor": DatasetExtractor(),
	"org_extractor": OrganisationExtractor(),
	"rel_extractor": RelationshipExtractor(),
	"text_extractor": TextExtractor(["description"]),
	"splitter": DocumentSplitter(
		split_by="word", split_length=50, split_overlap=20
	),
	"embedder": OllamaDocumentEmbedder(
		model="nomic-embed-text", url="http://localhost:11434"
	),
	"writer": DocumentWriter(doc_store),
}
for name, component in doc_components.items():
	p.add_component(name, component)

# (sender, receiver) pairs, connected in the order listed.
doc_connections = [
	# These four extractors receive the fetched records, but their outputs
	# are not wired to any downstream component in this cell.
	("fetcher", "author_extractor"),
	("fetcher", "dataset_extractor"),
	("fetcher", "org_extractor"),
	("fetcher", "rel_extractor"),
	# Text path ending at the document writer.
	("fetcher", "text_extractor"),
	("text_extractor", "splitter"),
	("splitter", "embedder"),
	("embedder", "writer"),
]
for sender, receiver in doc_connections:
	p.connect(sender, receiver)

# Kept as the final expression so the notebook displays the run result.
p.run(data={"fetcher": {"rows": 1}})

2025-04-24 10:08:20,849 - haystack.core.pipeline.base - INFO - Warming up component splitter...
2025-04-24 10:08:20,850 - haystack.core.pipeline.pipeline - INFO - Running component fetcher
2025-04-24 10:08:20,958 - haystack.core.pipeline.pipeline - INFO - Running component author_extractor
2025-04-24 10:08:20,958 - haystack.core.pipeline.pipeline - INFO - Running component dataset_extractor
2025-04-24 10:08:20,959 - haystack.core.pipeline.pipeline - INFO - Running component org_extractor
2025-04-24 10:08:20,959 - haystack.core.pipeline.pipeline - INFO - Running component rel_extractor
2025-04-24 10:08:20,960 - haystack.core.pipeline.pipeline - INFO - Running component text_extractor
2025-04-24 10:08:20,960 - haystack.core.pipeline.pipeline - INFO - Running component splitter
2025-04-24 10:08:20,961 - haystack.core.pipeline.pipeline - INFO - Running component embedder
Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]2025-04-24 10:08:21,004 - httpx - INFO - HTTP Request: POST

{'author_extractor': {'authors': [{'forename': 'C.',
    'surname': 'Marston',
    'uri': 'https://orcid.org/0000-0002-2070-2187'},
   {'forename': 'C.S.',
    'surname': 'Rowland',
    'uri': 'https://orcid.org/0000-0002-0459-506X'},
   {'forename': 'A.W.',
    'surname': "O'Neil",
    'uri': 'https://orcid.org/0000-0003-3591-1034'},
   {'forename': 'R.D.',
    'surname': 'Morton',
    'uri': 'https://orcid.org/0000-0003-3947-6463'}]},
 'dataset_extractor': {'datasets': [{'uri': 'https://doi.org/10.5285/e44ae9bd-fa32-4aab-9524-fbb11d34a20a',
    'title': 'Land Cover Map 2021 (10m classified pixels, N. Ireland)'}]},
 'org_extractor': {'organisations': [{'name': 'UK Centre for Ecology & Hydrology',
    'uri': 'https://ror.org/00pggkr55'},
   {'name': 'NERC EDS Environmental Information Data Centre',
    'uri': 'https://ror.org/04xw4m193'}]},
 'rel_extractor': {'relationships': {'AUTHORED_BY': [('https://doi.org/10.5285/e44ae9bd-fa32-4aab-9524-fbb11d34a20a',
     'https://orcid.org/0000-