# Loading a document

This notebook demonstrates the use of the TrustGraph librarian API to submit a document (here, a PDF) for processing.

The API optionally allows arbitrary extra metadata to be submitted with the document; this metadata is associated with the document and added to the triple store.  In this example, we add metadata structured in line with the schema.org Organization, PublicationEvent and DigitalDocument schemas.

The additional metadata is optional; the metadata element can be omitted.  However, if additional metadata is known, it can be integrated with TrustGraph processing.

This particular processing uses the following API calls:
- Load a document into the library
- Create a new flow
- Submit the document for processing in that flow

This will fail if the document, flow and flow submission already exist, so don't execute this notebook more than once.

In [1]:
import requests
import json
import base64
import time

In [2]:
# Open a PDF document from this repo.  PDFs are binary blobs, so the file is
# read in binary mode.  The context manager closes the file handle as soon as
# the content has been read, instead of leaving it open in the kernel.
with open("../sources/Challenger-Report-Vol1.pdf", "rb") as pdf_file:
    blob = pdf_file.read()

# Human-readable title of the document
title = "Challenger Report Volume 1"

In [3]:
# Base address of the TrustGraph REST API; endpoint paths are appended to it
base_url = "http://localhost:8088"

# ID of flow
flow = "my-flow2"

In [4]:
# URL of the TrustGraph librarian API
url = f"{base_url}/api/v1/librarian"

In [5]:
# Some random identifiers, one per entity described in the metadata below.

# The doc ID is important, as extracted knowledge is linked back to this identifier
doc_id = "https://trustgraph.ai/doc/72ef3374-af7a-40c4-8c7b-45050aef5b90"

# Identifier of the publication event
pub_id = "https://trustgraph.ai/pubev/59012ae1-65d4-441f-8288-b6f3c6c15333"

# Identifier of the publishing organization
org_id = "https://trustgraph.ai/org/1dd51ece-8bd3-48b8-98ce-1ac9164c5214"

In [6]:
# Organization metadata: every triple has org_id as its subject, so only the
# (predicate, object) pairs are spelled out and the subject is filled in.
org_facts = [
    [org_id, predicate, value]
    for predicate, value in (
        ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "https://schema.org/Organization"),
        ("http://www.w3.org/2000/01/rdf-schema#label", "NASA"),
        ("https://schema.org/name", "NASA"),
    )
]

In [7]:
# Publication metadata: every triple has pub_id as its subject.  Note how the
# "publishedBy" entry links the publication event to the Organization.
pub_facts = [
    [pub_id, predicate, value]
    for predicate, value in (
        ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "https://schema.org/PublicationEvent"),
        ("https://schema.org/description", "Uploading to Github"),
        ("https://schema.org/endDate", "1986-06-06"),
        ("https://schema.org/publishedBy", org_id),
        ("https://schema.org/startDate", "1986-06-06"),
    )
]

In [8]:
# Document metadata: every triple has doc_id as its subject.  Note how the
# "publication" entry links the document to the publication event.
doc_facts = [
    [doc_id, predicate, value]
    for predicate, value in (
        ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "https://schema.org/DigitalDocument"),
        ("http://www.w3.org/2000/01/rdf-schema#label", "Challenger Report Volume 1"),
        ("https://schema.org/copyrightHolder", "US Government"),
        ("https://schema.org/copyrightNotice", "Work of the US Gov. Public Use Permitted"),
        ("https://schema.org/copyrightYear", "1986"),
        ("https://schema.org/description", "The findings of the Presidential Commission regarding the circumstances surrounding the Challenger accident are reported and recommendations for corrective action are outlined"),
        # The same predicate is repeated, once per keyword
        ("https://schema.org/keywords", "nasa"),
        ("https://schema.org/keywords", "challenger"),
        ("https://schema.org/keywords", "space-shuttle"),
        ("https://schema.org/keywords", "shuttle"),
        ("https://schema.org/keywords", "orbiter"),
        ("https://schema.org/name", "Challenger Report Volume 1"),
        ("https://schema.org/publication", pub_id),
        ("https://schema.org/url", "https://ntrs.nasa.gov/citations/19860015255"),
    )
]

In [9]:
# Convert the above metadata into the right form: one dict per triple, with
# "s", "p" and "o" entries that each carry a value "v" and a flag "e".
# "e" is always True for subjects and predicates; for objects it is True only
# when the value starts with "http" (presumably marking a URI rather than a
# literal -- confirm against the API documentation).
metadata = [
    {
        "s": {"v": subj, "e": True},
        "p": {"v": pred, "e": True},
        "o": {"v": obj, "e": obj.startswith("http")},
    }
    for subj, pred, obj in org_facts + pub_facts + doc_facts
]

In [10]:
# The input: request body for the librarian's add-document operation
input = {
    "operation": "add-document",
    "document-metadata": {
        # Document identifier.  Knowledge derived by TrustGraph is linked to this
        # identifier, so the additional metadata specified above is linked to the
        # derived knowledge, and users of the knowledge graph can see information
        # about the source of that knowledge.
        "id": doc_id,
        # Current time as whole seconds since the epoch
        "time": int(time.time()),
        # MIME type of the submitted content
        "kind": "application/pdf",
        "title": title,
        "comments": "This is some more test text",
        # Additional metadata in the form of RDF triples
        "metadata": metadata,
        "user": "trustgraph",
    },
    # The PDF document is presented as a base64-encoded string
    "content": base64.b64encode(blob).decode("utf-8"),
}

In [11]:
# Invoke the API, input is passed as JSON.
# A timeout is given because requests otherwise waits indefinitely when the
# server does not answer.  The request body carries the whole base64-encoded
# PDF, so the limit (in seconds) is deliberately generous.
resp = requests.post(url, json=input, timeout=300)

In [12]:
# HTTP status code of the response just received.  Should be a 200 status code
resp.status_code

200

In [13]:
# The document load returns an empty JSON object rather than any data.  A 200 response shows the submitted PDF is queued to enter processing flows

In [14]:
# Show the JSON-decoded body of the response
resp.json()

{}

In [15]:
# Stop here if the response body contains an "error" entry
assert "error" not in resp.json()

# Start a flow

In [16]:
# URL of the TrustGraph flow API
url = f"{base_url}/api/v1/flow"

In [17]:
# The input: request body for the flow API's start-flow operation
input = {
    "operation": "start-flow",
    # Identifier under which the new flow is created
    "flow-id": flow,
    "class-name": "document-rag+graph-rag",
    "description": "My new flow",
}

In [18]:
# Invoke the API, input is passed as JSON.
# A timeout (in seconds) is given because requests otherwise waits
# indefinitely when the server does not answer.
resp = requests.post(url, json=input, timeout=60)

In [19]:
# HTTP status code of the response just received.  Should be a 200 status code
resp.status_code

200

In [20]:
# Show the JSON-decoded body of the response
resp.json()

{}

In [21]:
# Stop here if the response body contains an "error" entry
assert "error" not in resp.json()

# Submit document for processing

In [22]:
# URL of the TrustGraph librarian API
url = f"{base_url}/api/v1/librarian"

In [23]:
# The input: request body for the librarian's add-processing operation
input = {
    "operation": "add-processing",
    "processing-metadata": {
        # Identifier of this processing request
        "id": "proc02",
        # The document to process, referenced by its identifier
        "document-id": doc_id,
        # Current time as whole seconds since the epoch
        "time": int(time.time()),
        # Identifier of the flow the document is submitted to
        "flow": flow,
        "user": "trustgraph",
        "collection": "default",
        "tags": ["my document", "processing test"],
    },
}

In [24]:
# Invoke the API, input is passed as JSON.
# A timeout (in seconds) is given because requests otherwise waits
# indefinitely when the server does not answer.
resp = requests.post(url, json=input, timeout=60)

In [25]:
# HTTP status code of the response just received.  Should be a 200 status code
resp.status_code

200

In [26]:
# Show the JSON-decoded body of the response
resp.json()

{}

In [27]:
# Stop here if the response body contains an "error" entry
assert "error" not in resp.json()