In [None]:
#%pip install --upgrade --quiet langchain langchain-core langchain-community langchain-experimental langchain-openai neo4j wikipedia python-dotenv requests

#### Initialize environment and set example target

In [1]:
import dotenv
import os
import requests
import json
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer

In [2]:
# Load any dotenv-provided variables first, then read both API keys with
# hard lookups so a missing key raises KeyError in this cell rather than later.
dotenv.load_dotenv()
environ = os.environ
DIFFBOT_API_KEY = environ['DIFFBOT_API_KEY']
OPENAI_API_KEY = environ['OPENAI_API_KEY']

In [3]:
# Request settings. API and TARGET are the fallbacks used by get_data();
# FIELDS and HOST are not referenced by any other cell visible in this notebook.
FIELDS = "entities, sentiment, facts"
HOST = "nl.diffbot.com"
API = "analyze"
TARGET = "https://www.dell.com/en-us/dt/corporate/about-us/leadership/michael-dell.htm"

# Transformer that later turns LangChain documents into graph documents.
DIFFBOT_NLP = DiffbotGraphTransformer(diffbot_api_key=DIFFBOT_API_KEY)

In [25]:
from langchain_core.documents import Document

def transform_diffbot_to_langchain(diffbot_json):
    """Convert a parsed Diffbot response into a one-element list of Documents.

    Only the first entry of ``diffbot_json['objects']`` is used. Its ``text``
    becomes ``page_content``; the metadata carries the requested page URL,
    the object-level sentiment, the object's tags (stored under the
    ``'entities'`` key), its categories and its image URLs.

    Args:
        diffbot_json: Parsed JSON response as a dict, or ``None`` when the
            request did not yield JSON.

    Returns:
        ``[Document]`` on success, or ``[]`` (after printing an error) when
        the response is missing or contains no objects.
    """
    # get_data() returns None for a non-JSON response; treat that like an
    # empty result instead of failing on None.get(...).
    if not diffbot_json or not diffbot_json.get('objects'):
        print("ERROR: No Objects in Diffbot Return")
        return []

    first_object = diffbot_json['objects'][0]
    request_info = diffbot_json.get('request') or {}

    # Optional fields are read with .get() so a partial response yields
    # None/empty values rather than a KeyError.
    entities = [
        {
            'label': tag.get('label'),
            'uri': tag.get('uri'),
            'sentiment': tag.get('sentiment'),
            'rdfTypes': tag.get('rdfTypes', []),
        }
        for tag in first_object.get('tags') or []
    ]

    categories = [
        {
            'name': category.get('name'),
            'score': category.get('score'),
            'id': category.get('id', ''),
        }
        for category in first_object.get('categories') or []
    ]

    # Images without a URL carry nothing useful for this list; skip them.
    images = [
        img['url'] for img in first_object.get('images') or [] if img.get('url')
    ]

    metadata = {
        'source': request_info.get('pageUrl'),
        'sentiment': first_object.get('sentiment'),
        'entities': entities,
        'categories': categories,
        # NOTE(review): 'tags' is always left empty; the response's tags are
        # stored under 'entities' above -- confirm this is intended.
        'tags': [],
        'images': images,
    }

    return [Document(
        page_content=first_object.get('text', ''),
        metadata=metadata,
    )]

# transformed_document = transform_diffbot_to_langchain(diffbot_json)


In [22]:
def get_data(target=None, api=None, timeout=60):
    """Call a Diffbot v3 endpoint for a page and return the parsed JSON body.

    Args:
        target: URL of the page to process; falls back to the module-level
            TARGET when omitted or empty.
        api: Name of the v3 endpoint (e.g. 'article'); falls back to the
            module-level API when omitted or empty.
        timeout: Seconds passed to requests as the request timeout; ``None``
            waits indefinitely.

    Returns:
        The decoded JSON body, or ``None`` (after printing the raw body,
        status code and headers) when the body is not valid JSON.
    """
    api = api or API
    target = target or TARGET
    # Pass the page URL and token as params so requests percent-encodes them;
    # interpolating the target into the query string breaks on '?' or '&'.
    res = requests.get(
        f"https://api.diffbot.com/v3/{api}",
        params={"url": target, "token": DIFFBOT_API_KEY},
        timeout=timeout,
    )
    try:
        return res.json()
    except ValueError:
        # JSON decoding failures are ValueError subclasses; anything else
        # (e.g. KeyboardInterrupt) should propagate instead of being swallowed.
        print("Bad response: " + res.text)
        print(res.status_code)
        print(res.headers)
        return None

In [23]:
# Fetch the example page through the 'article' endpoint rather than the default API.
diffbot_json = get_data(target=TARGET, api='article')


In [26]:
# Wrap the Diffbot response in a list of LangChain Documents and print it for inspection.
raw_documents = transform_diffbot_to_langchain(diffbot_json)
print(raw_documents)

[Document(page_content="Michael Dell is chairman and chief executive officer of Dell Technologies, an innovator and technology leader providing the essential infrastructure for organizations to build their digital future, transform IT and protect their most important information.\nWith revenue of more than $102B, Dell Technologies is one of the world’s largest IT companies serving the needs of global corporations and governments to small businesses and consumers. Dell Technologies’ commitment to ethics and privacy is a source of pride and inspiration within the company. Dell has been recognized as one of the World’s Most Ethical Companies by the Ethisphere Institute, and has also been recognized by Fortune as a Most Admired Company and Best Place to Work, by Forbes as one of America’s Best Employers For Women and by Newsweek as a Most Loved Workplace.\nMichael’s story started when he founded Dell Technologies with $1000 in 1984 at the age of 19. Notably quoted as saying that “technolog

In [27]:
# Run the documents through the Diffbot graph transformer to obtain graph documents.
graph_documents = DIFFBOT_NLP.convert_to_graph_documents(raw_documents)

In [28]:
# Print the resulting graph documents (nodes and their properties) for inspection.
print(graph_documents)

[GraphDocument(nodes=[Node(id='http://www.wikidata.org/entity/Q218630', type='Person', properties={'name': 'Michael Dell', 'positionHeld': 'chairman', 'age': '19'}), Node(id='http://www.wikidata.org/entity/Q913740', type='Organization', properties={'name': 'Catalyst'}), Node(id='http://www.wikidata.org/entity/Q258', type='Location', properties={'name': 'South Africa'}), Node(id='http://www.wikidata.org/entity/Q1622260', type='Organization', properties={'name': 'United Nations Foundation'}), Node(id='http://www.wikidata.org/entity/Q16955', type='Organization', properties={'name': 'Tsinghua University'}), Node(id='http://www.wikidata.org/entity/Q668', type='Location', properties={'name': 'India'}), Node(id='http://www.wikidata.org/entity/Q30873', type='Organization', properties={'name': 'Dell', 'foundingDate': '1984'}), Node(id='Susan Dell', type='Person', properties={'name': 'Susan Dell'}), Node(id='http://www.wikidata.org/entity/Q956', type='Location', properties={'name': 'Beijing'}), 

In [None]:
# re-inspect the raw documents built from the Diffbot response
print(raw_documents)

#### Load a docker container to explore graph via GUI

In [None]:
# get a docker going via CLI
# docker run \
#    --name neo4j \
#    -p 7474:7474 -p 7687:7687 \
#    -d \
#    -e NEO4J_AUTH=neo4j/pleaseletmein \
#    -e NEO4J_PLUGINS=\[\"apoc\"\]  \
#    neo4j:latest

In [None]:
from langchain_community.graphs import Neo4jGraph

# Connection settings default to the local Docker container described above,
# but each can be overridden through an environment variable so hosts and
# credentials do not have to be edited into the notebook.
# if running docker in WSL, command wsl hostname -I to get WSL IP
# neo4j gui port is 7474
docker_ip = os.environ.get("NEO4J_HOST", "localhost")
url = os.environ.get("NEO4J_URI", f"bolt://{docker_ip}:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "pleaseletmein")

graph = Neo4jGraph(url=url, username=username, password=password)

Load GraphDocuments into the docker knowledge graph then refresh graph schema

In [None]:
# Load the graph documents into the Neo4j instance connected above.
graph.add_graph_documents(graph_documents)

In [None]:
# Refresh the graph schema now that documents have been loaded.
graph.refresh_schema()

In [None]:
# Display the refreshed schema.
print(graph.schema)

In [None]:
# test a graph query manually to verify connectivity before burning OpenAI calls
# Run one fixed Cypher statement by hand to confirm the connection works
# before spending OpenAI calls on the QA chain.
connectivity_check = """
MATCH p=()-[r:CHIEF_EXECUTIVE_OFFICER]->() RETURN p LIMIT 25
"""
graph.query(connectivity_check)

Log into Neo4j by navigating to http://\{docker ip}:7474 and logging in with the credentials set above. To view the graph, you can run a simple query such as `MATCH (n) RETURN n LIMIT 25`.

#### Query the graph

In [None]:
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI

# Separate chat models are supplied for the chain's cypher_llm and qa_llm roles.
cypher_model = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)
answer_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.3)

chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=cypher_model,
    qa_llm=answer_model,
    verbose=True,
)

In [None]:
# Ask the QA chain a natural-language question about the graph.
# NOTE(review): the documents loaded above come from Michael Dell's bio page, so this
# Elon Musk question is only answerable if the graph already holds other data -- confirm.
chain.run("Does Elon Musk have kids?")
