<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/llm/llm_graph_speed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet langchain-experimental langchain-openai

In [2]:

import getpass
import os

# Ask for the key only when it is not already configured, so re-running this
# cell (or starting with the variable pre-set) does not prompt again.
# getpass reads the value without echoing it into the notebook output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

··········


In [3]:

import pandas as pd

# Load the sample news-article dataset (a CSV hosted on GitHub) into a DataFrame.
news = pd.read_csv(
    filepath_or_buffer=(
        "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv"
    )
)

In [4]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# Chat model used for graph extraction (temperature 0, model "gpt-4o").
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# Graph transformer driven by the model above. In addition to nodes and
# relationships it is asked for a "description" property on both.
llm_transformer = LLMGraphTransformer(
    relationship_properties=["description"],
    node_properties=["description"],
    llm=llm,
)

In [5]:
from langchain_core.documents import Document

# Wrap the first 20 article texts as Documents. Slicing the column before the
# comprehension avoids building a Document for every row only to discard it.
documents = [Document(page_content=t) for t in news["text"].iloc[:20]]

In [6]:
# Baseline: time a single synchronous call that converts every document.
%time llm_transformer.convert_to_graph_documents(documents)

CPU times: user 2.27 s, sys: 172 ms, total: 2.45 s
Wall time: 2min 23s


[GraphDocument(nodes=[Node(id='Jhvephoto', type='Company'), Node(id='Chevron', type='Company'), Node(id='Cvx', type='Stock'), Node(id='Q2 Consensus Earnings Estimates', type='Financial metric')], relationships=[Relationship(source=Node(id='Chevron', type='Company'), target=Node(id='Cvx', type='Stock'), type='HAS_STOCK'), Relationship(source=Node(id='Cvx', type='Stock'), target=Node(id='Q2 Consensus Earnings Estimates', type='Financial metric'), type='HAS_METRIC', properties={'description': 'risen sharply (~25%) during the past 90-days'})], source=Document(page_content='JHVEPhoto Like many companies in the O&G sector, the stock of Chevron (NYSE:CVX) has declined about 10% over the past 90-days despite the fact that Q2 consensus earnings estimates have risen sharply (~25%) during that same time frame. Over the years, Chevron has kept a very strong balance sheet. That allowed the...')),
 GraphDocument(nodes=[Node(id='Firstenergy', type='Organization', properties={'description': 'utilities

In [7]:
import time

# time magic doesnt like async
start = time.time()
async_data = await llm_transformer.aconvert_to_graph_documents(documents)
end = time.time()
print(end - start)

20.44025182723999


In [8]:
# Display the graph documents returned by the async run.
async_data

[GraphDocument(nodes=[Node(id='Jhvephoto', type='Entity'), Node(id='Chevron', type='Company'), Node(id='Nyse:Cvx', type='Stock'), Node(id='Q2 Consensus Earnings Estimates', type='Financial_metric')], relationships=[Relationship(source=Node(id='Chevron', type='Company'), target=Node(id='Nyse:Cvx', type='Stock'), type='HAS'), Relationship(source=Node(id='Chevron', type='Company'), target=Node(id='Q2 Consensus Earnings Estimates', type='Financial_metric'), type='HAS')], source=Document(page_content='JHVEPhoto Like many companies in the O&G sector, the stock of Chevron (NYSE:CVX) has declined about 10% over the past 90-days despite the fact that Q2 consensus earnings estimates have risen sharply (~25%) during that same time frame. Over the years, Chevron has kept a very strong balance sheet. That allowed the...')),
 GraphDocument(nodes=[Node(id='Firstenergy', type='Organization', properties={'description': 'utilities provider'}), Node(id='Nyse:Fe', type='Stock_ticker'), Node(id='Rtt News',

In [9]:
from concurrent.futures import ThreadPoolExecutor

# Number of worker threads in the pool created by multi_thread().
MAX_WORKERS = 10

def process_document(text):
    """Convert a single document into graph documents, tolerating failures.

    Args:
        text: The document to convert. It is passed to
            ``llm_transformer.convert_to_graph_documents`` wrapped in a
            one-element list.

    Returns:
        Whatever ``convert_to_graph_documents`` returns for ``[text]``, or an
        empty list if the conversion raised an exception (the error is
        printed rather than propagated).
    """
    try:
        return llm_transformer.convert_to_graph_documents([text])
    except Exception as e:
        # Look up an identifier defensively. The previous `text['id']` raised
        # inside this handler for any non-subscriptable document, masking the
        # real error and defeating the "return []" fallback.
        doc_id = getattr(text, "id", None)
        if doc_id is None:
            metadata = text if isinstance(text, dict) else getattr(text, "metadata", None)
            if isinstance(metadata, dict):
                doc_id = metadata.get("id")
        if doc_id is None:
            doc_id = "<unknown>"
        print(f"Error processing document with ID {doc_id}: {e}")
        return []

def multi_thread():
    """Convert ``documents`` concurrently using a thread pool.

    Every entry of the module-level ``documents`` list is handed to
    ``process_document`` on a pool of ``MAX_WORKERS`` threads.

    Returns:
        A flat list holding the items of every per-document result, in the
        same order as ``documents``. Previously the list was built but never
        returned, so callers always received ``None``.
    """
    graph_documents = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # map() schedules all calls up front and yields their results in
        # input order, matching the original submit-then-collect loop.
        for graph_document in executor.map(process_document, documents):
            graph_documents.extend(graph_document)
    return graph_documents

# Time the thread-pool variant for comparison with the runs above.
%time multi_thread()

CPU times: user 1.63 s, sys: 87.7 ms, total: 1.72 s
Wall time: 23.3 s
