In [1]:
import os
from typing import Optional

import instructor
import requests
from dotenv import load_dotenv
from openai import OpenAI
from trafilatura import extract

load_dotenv()

In [27]:
def fetch_text(url: str, timeout: float = 30.0) -> str:
    """Download a web page and return the main text extracted from its HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed to
            ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns:
        The text that ``extract`` pulls out of the downloaded HTML.

    Raises:
        Exception: If the server responds with any status code other than 200.
        ValueError: If ``extract`` yields no text for the page.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise Exception(
            f"Failed to fetch the HTML content. Status code: {r.status_code}"
        )

    extracted = extract(r.text)
    # `extract` can come back empty/None; fail here rather than handing a
    # non-string to downstream code that formats it into API requests.
    if not extracted:
        raise ValueError(f"No text could be extracted from {url}")
    return extracted


def fetch_diffbot(content, timeout: float = 60.0):
    """Send text to the Diffbot Natural Language API and return the parsed JSON.

    Args:
        content: Text to analyse; sent as the ``content`` field of the JSON body
            together with ``lang="en"``.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        RuntimeError: If the ``DIFFBOT_API_KEY`` environment variable is unset
            or empty.
        requests.HTTPError: If the API answers with an error status
            (via ``raise_for_status``).
    """
    token = os.getenv("DIFFBOT_API_KEY")
    if not token:
        # Without this check the request would be sent with "token=None".
        raise RuntimeError("DIFFBOT_API_KEY environment variable is not set")

    fields = "entities,sentiment,facts,records,categories,sentences,language,summary"
    r = requests.post(
        "https://nl.diffbot.com/v1/",
        # Let requests build the query string so the token is URL-encoded
        # instead of being spliced into the URL verbatim.
        params={"fields": fields, "token": token},
        json={"content": content, "lang": "en"},
        timeout=timeout,
    )
    r.raise_for_status()
    return r.json()

In [28]:
# Download the article text, then run it through Diffbot's NL API.
# Both results are kept as notebook-level variables for the cells below.
doc = fetch_text(url="https://arxiv.org/html/1706.03762v7")
parsed = fetch_diffbot(content=doc)

In [35]:
from google.cloud import language_v2


def sample_analyze_entities(
    text_content: str,
) -> "language_v2.AnalyzeEntitiesResponse":
    """Run Google Cloud Natural Language entity analysis and print the results.

    A new ``LanguageServiceClient`` is created on every call. The text is sent
    as a plain-text English document with UTF-8 encoding. Every entity, its
    metadata and its mentions are printed, followed by the response language.

    Args:
        text_content: The text to analyse.

    Returns:
        The response object from ``client.analyze_entities``, so callers can
        inspect the entities programmatically as well as read the printout.
    """
    client = language_v2.LanguageServiceClient()
    document_type_in_plain_text = language_v2.Document.Type.PLAIN_TEXT
    language_code = "en"
    document = {
        "content": text_content,
        "type_": document_type_in_plain_text,
        "language_code": language_code,
    }
    encoding_type = language_v2.EncodingType.UTF8

    response = client.analyze_entities(
        request={"document": document, "encoding_type": encoding_type}
    )

    for entity in response.entities:
        print(f"Representative name for the entity: {entity.name}")
        # Convert the raw enum value into its symbolic name for display.
        print(f"Entity type: {language_v2.Entity.Type(entity.type_).name}")
        for metadata_name, metadata_value in entity.metadata.items():
            print(f"{metadata_name}: {metadata_value}")

        for mention in entity.mentions:
            print(f"Mention text: {mention.text.content}")
            print(f"Mention type: {language_v2.EntityMention.Type(mention.type_).name}")
            print(f"Probability score: {mention.probability}")
    print(f"Language of the text: {response.language_code}")
    return response

In [38]:
from typing import Any, Dict, List

from pydantic import BaseModel, Field


class Metadata(BaseModel):
    # Header information for a generated knowledge graph; every field is a
    # required string. Documented with `#` comments on purpose: a class
    # docstring would be copied into the schema generated from this model.

    # Creation date of the graph.
    createdDate: str = Field(
        default=...,
        description="The date the knowledge graph was created",
    )
    # Date of the most recent update.
    lastUpdated: str = Field(
        default=...,
        description="The date the knowledge graph was last updated",
    )
    # Free-text summary of what the graph represents.
    description: str = Field(
        default=...,
        description="Description of the knowledge graph",
    )


class Node(BaseModel):
    # One vertex of the knowledge graph. `id`, `label`, `type` and `color` are
    # required strings; `properties` is optional and defaults to an empty dict.
    # `#` comments are used so the schema generated from the model is unchanged.
    id: str = Field(default=..., description="Unique identifier for the node")
    label: str = Field(default=..., description="Label for the node")
    type: str = Field(default=..., description="Type of the node")
    color: str = Field(default=..., description="Color for the node")
    properties: Dict[str, Any] = Field(
        default={},
        description="Additional attributes for the node",
    )


class Edge(BaseModel):
    # One relationship between two nodes, referenced by their node IDs.
    # WARNING: the Python attribute is `from_`, not `from` (`from` is a
    # reserved keyword); the alias exposes it under the "from" key.
    from_: str = Field(
        default=...,
        alias="from",
        description="Origin node ID",
    )
    to: str = Field(default=..., description="Destination node ID")
    relationship: str = Field(
        default=...,
        description="Type of relationship between the nodes",
    )
    direction: str = Field(default=..., description="Direction of the relationship")
    color: str = Field(default=..., description="Color for the edge")
    # Optional free-form attributes; defaults to an empty dict.
    properties: Dict[str, Any] = Field(
        default={},
        description="Additional attributes for the edge",
    )


class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships.
    Use the colors to help differentiate between different node or edge types/categories.
    Always provide light pastel colors that work well with black font.
    """

    # NOTE: the docstring above is written as instructions to the model and is
    # kept verbatim; it is part of the structure passed as `response_model`.

    # Required header block describing the graph.
    metadata: Metadata = Field(
        default=...,
        description="Metadata for the knowledge graph",
    )
    # Required list of vertices.
    nodes: List[Node] = Field(
        default=...,
        description="List of nodes in the knowledge graph",
    )
    # Required list of relationships between the vertices.
    edges: List[Edge] = Field(
        default=...,
        description="List of edges in the knowledge graph",
    )

In [39]:
# Notebook-level LLM client: an OpenAI client wrapped by instructor. It is used
# below with a `response_model` argument to get a KnowledgeGraph back.
# NOTE(review): OpenAI() is given no explicit key; presumably it picks up the
# credentials loaded by load_dotenv() in the first cell - confirm.
client = instructor.from_openai(OpenAI())


def generate_graph(
    document: Optional[str] = None, model: str = "gpt-4o"
) -> KnowledgeGraph:
    """Ask the LLM to describe a document as a knowledge graph.

    Args:
        document: Text to turn into a graph. When omitted, the notebook-level
            ``doc`` variable is used, so existing ``generate_graph()`` calls
            behave as before.
        model: Name of the chat model to use.

    Returns:
        The ``KnowledgeGraph`` returned by the notebook-level ``client`` for
        ``response_model=KnowledgeGraph``.
    """
    if document is None:
        # Backward-compatible fallback to the notebook global. Passing the
        # text explicitly avoids depending on cell execution order.
        document = doc

    return client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"Help me understand the provided document by describing it as a detailed knowledge graph.\n DOCUMENT:\n----------\n{document}",
            }
        ],
        response_model=KnowledgeGraph,
    )

In [40]:
# Build the knowledge graph once and keep it for the inspection cells below.
kg = generate_graph()

In [41]:
# Print every node of the generated graph, one per line.
for node in kg.nodes:
    print(node)

id='1' label='Attention Is All You Need' type='Paper' color='#FFDDC1' properties={}
id='2' label='Transformer' type='Model' color='#FFEE93' properties={}
id='3' label='Attention Mechanism' type='Concept' color='#FFABAB' properties={}
id='4' label='Self-Attention' type='Concept' color='#FFABAB' properties={}
id='5' label='Multi-Head Attention' type='Concept' color='#FFABAB' properties={}
id='6' label='Encoder' type='Component' color='#B5EAD7' properties={}
id='7' label='Decoder' type='Component' color='#B5EAD7' properties={}
id='8' label='Scaled Dot-Product Attention' type='Concept' color='#FFABAB' properties={}
id='9' label='Recurrent Neural Networks' type='Model' color='#FFEE93' properties={}
id='10' label='Convolutional Neural Networks' type='Model' color='#FFEE93' properties={}
id='11' label='Sequence Transduction' type='Research Area' color='#BFC0C0' properties={}
id='12' label='Machine Translation' type='Task' color='#BFC0C0' properties={}
id='13' label='English-to-German Translat

In [42]:
# Print every edge of the generated graph, one per line.
for edge in kg.edges:
    print(edge)

from_='1' to='2' relationship='proposes' direction='directed' color='#FFA69E' properties={}
from_='1' to='3' relationship='utilizes' direction='directed' color='#FF686B' properties={}
from_='2' to='4' relationship='relies on' direction='directed' color='#FF686B' properties={}
from_='2' to='5' relationship='includes' direction='directed' color='#FF686B' properties={}
from_='2' to='6' relationship='contains' direction='directed' color='#55CBCD' properties={}
from_='2' to='7' relationship='contains' direction='directed' color='#55CBCD' properties={}
from_='2' to='12' relationship='applied to' direction='directed' color='#A6C4A2' properties={}
from_='2' to='22' relationship='includes' direction='directed' color='#55CBCD' properties={}
from_='3' to='4' relationship='special case of' direction='directed' color='#FF686B' properties={}
from_='3' to='8' relationship='includes' direction='directed' color='#FF686B' properties={}
from_='4' to='5' relationship='enhanced by' direction='directed' col