In [1]:
import pandas as pd

pd.set_option("max_colwidth", None)
pd.set_option("max_seq_items", None)

In [2]:
from dotenv import load_dotenv

from autoflow.knowledge_graph.programs.extract_graph import (
    KnowledgeGraphExtractor,
    KnowledgeGraphOutput,
)
from autoflow.models.llms import LLM
from autoflow.models.llms.dspy import get_dspy_lm_by_llm

load_dotenv()

llm = LLM()
dspy_lm = get_dspy_lm_by_llm(llm)
extractor = KnowledgeGraphExtractor(dspy_lm=dspy_lm)

In [3]:
from pathlib import Path

text = Path("../tests/fixtures/tidb-overview.md").read_text()
actual_knowledge = extractor.forward(text)
df = actual_knowledge.to_pandas()

In [4]:
df["entities"]

Unnamed: 0,name,description
0,TiDB,"TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. It is MySQL compatible and features horizontal scalability, strong consistency, and high availability. TiDB provides users with a one-stop database solution covering OLTP, OLAP, and HTAP services."
1,TiDB Self-Managed,TiDB Self-Managed is a product option of TiDB that allows users or organizations to deploy and manage TiDB on their own infrastructure with complete flexibility. Users can enjoy the power of open-source distributed SQL while retaining full control over their environment.
2,Hybrid Transactional and Analytical Processing (HTAP),HTAP is a database processing technique that enables both transactional and analytical workloads to be performed on the same database system without the need for separate systems or data duplication.
3,OLTP,"OLTP (Online Transactional Processing) is a class of systems that facilitate and manage transaction-oriented applications, typically for data entry and retrieval transactions in a database."
4,OLAP,OLAP (Online Analytical Processing) is a category of software tools that provide analysis of data for business decisions. It allows users to analyze multidimensional data interactively from multiple perspectives.
5,TiKV,TiKV is a row-based storage engine provided by TiDB for storing data in a distributed manner.
6,TiFlash,"TiFlash is a columnar storage engine provided by TiDB for real-time replication of data from TiKV, ensuring consistent data between row-based and columnar storage."
7,TiDB Operator,"TiDB Operator is a tool that helps manage TiDB on Kubernetes and automates tasks related to operating the TiDB cluster, making TiDB easier to deploy on any cloud that provides managed Kubernetes."
8,TiDB Cloud,"TiDB Cloud is a fully-managed TiDB service that provides the easiest, most economical, and most resilient way to deploy and run TiDB clusters in the cloud."
9,MySQL Protocol,"The MySQL Protocol is the protocol used by MySQL database systems for communication between clients and servers. TiDB is compatible with the MySQL protocol, allowing seamless integration with the MySQL ecosystem."


In [5]:
df["relationships"]

Unnamed: 0,source_entity,relationship_desc,target_entity
0,TiDB,TiDB Self-Managed is a product option of TiDB that allows users or organizations to deploy and manage TiDB on their own infrastructure with complete flexibility.,TiDB Self-Managed
1,TiDB,"TiDB provides users with a one-stop database solution covering OLTP, OLAP, and HTAP services.",OLTP
2,TiDB,"TiDB provides users with a one-stop database solution covering OLTP, OLAP, and HTAP services.",OLAP
3,TiDB,"TiDB provides users with a one-stop database solution covering OLTP, OLAP, and HTAP services.",Hybrid Transactional and Analytical Processing (HTAP)
4,TiDB,"TiDB provides two storage engines: TiKV, a row-based storage engine, and TiFlash, a columnar storage engine.",TiKV
5,TiDB,"TiDB provides two storage engines: TiKV, a row-based storage engine, and TiFlash, a columnar storage engine.",TiFlash
6,TiDB,"TiDB is compatible with the MySQL protocol, allowing seamless integration with the MySQL ecosystem.",MySQL Protocol
7,TiDB Cloud,"TiDB Cloud is a fully-managed TiDB service that provides the easiest, most economical, and most resilient way to deploy and run TiDB clusters in the cloud.",TiDB
8,TiDB Operator,"TiDB Operator helps manage TiDB on Kubernetes and automates tasks related to operating the TiDB cluster, making TiDB easier to deploy on any cloud that provides managed Kubernetes.",TiDB


In [6]:
expected_knowledge = KnowledgeGraphOutput(
    entities=[
        {"name": "TiDB", "description": "A distributed SQL database"},
        {"name": "TiKV", "description": "TiKV is a row-based storage engine."},
        {"name": "TiFlash", "description": "TiFlash is a column-based storage engine."},
        {
            "name": "OLTP",
            "description": "OLTP is a type of database that is optimized for transaction processing.",
        },
        {
            "name": "OLAP",
            "description": "OLAP is a type of database that is optimized for analytics.",
        },
        {
            "name": "HTAP",
            "description": "HTAP is a type of database that is optimized for both transaction processing and analytics.",
        },
        {
            "name": "TiDB Self-Managed",
            "description": "TiDB Self-Managed is a product option of TiDB.",
        },
        {
            "name": "TiDB Cloud",
            "description": "TiDB Cloud is a fully-managed TiDB service.",
        },
        {
            "name": "TiDB Operator",
            "description": "TiDB Operator is a tool that helps manage TiDB on Kubernetes.",
        },
    ],
    relationships=[
        {
            "source_entity": "TiDB",
            "target_entity": "TiKV",
            "relationship_desc": "TiDB uses TiKV as its storage engine",
        },
        {
            "source_entity": "TiDB",
            "target_entity": "TiFlash",
            "relationship_desc": "TiDB uses TiFlash as its analytics engine",
        },
        {
            "source_entity": "TiDB",
            "target_entity": "OLTP",
            "relationship_desc": "TiDB supports OLTP workloads",
        },
        {
            "source_entity": "TiDB",
            "target_entity": "OLAP",
            "relationship_desc": "TiDB supports OLAP workloads",
        },
        {
            "source_entity": "TiDB",
            "target_entity": "HTAP",
            "relationship_desc": "TiDB supports HTAP workloads",
        },
        {
            "source_entity": "TiDB Self-Managed",
            "target_entity": "TiDB",
            "relationship_desc": "TiDB Self-Managed is a product option of TiDB",
        },
        {
            "source_entity": "TiDB Cloud",
            "target_entity": "TiDB",
            "relationship_desc": "TiDB Cloud is a fully-managed TiDB service.",
        },
        {
            "source_entity": "TiDB Operator",
            "target_entity": "TiDB Cloud",
            "relationship_desc": "TiDB Operator is a tool that helps manage TiDB on Kubernetes.",
        },
    ],
)

In [7]:
from autoflow.knowledge_graph.programs.eval_graph import KnowledgeGraphEvaluator

evaluator = KnowledgeGraphEvaluator(dspy_lm=dspy_lm)

evaluation_result = evaluator.forward(expected_knowledge, actual_knowledge)
evaluation_result.score

0.6111111111111112

In [9]:
from autoflow.knowledge_graph.programs.extract_covariates import (
    EntityCovariateExtractor,
)

metadata_extractor = EntityCovariateExtractor(dspy_lm=dspy_lm)
actual_knowledge.entities = metadata_extractor.forward(text, actual_knowledge.entities)

for entity in actual_knowledge.entities:
    print(entity.name, entity.metadata)

ValueError: "EntityOutput" object has no field "metadata"