In [4]:
# pip install openai==0.28


## Load Libraries and API Configurations

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Neo4jVector
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
import pandas as pd
from langchain.graphs import Neo4jGraph
from langchain.vectorstores import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings
import os
import openai
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience

# Connection settings and API keys are read from the environment first so that real
# secrets never have to be typed into (or saved with) the notebook. Each fallback is
# the literal value this cell used before, so behavior is unchanged when a variable
# is not set.
url = os.environ.get("NEO4J_URI", "neo4j+s://483c47f7.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")

# Shared Neo4j handle used by the Cypher cells further down.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

# Prefer a key that is already exported. The previous hardcoded empty string
# overwrote any real OPENAI_API_KEY in the environment on the assignment below.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = openai_api_key


os.environ['OPENAI_API_KEY'] = openai_api_key

## Load the Text

In [None]:
# Reference Link:  https://sec-api.io/resources/extract-textual-data-from-edgar-10-k-filings-using-python
# Load 1 section

# Read the sec-api.io key from the environment instead of hardcoding it in the
# notebook; the empty-string fallback matches the previous placeholder value.
SEC_API_KEY = os.environ.get('SEC_API_KEY', '')
from sec_api import ExtractorApi

# Client used below to pull individual sections out of a filing.
extractorApi = ExtractorApi(SEC_API_KEY)

# helper function to pretty print long, single-line text to multi-line text
def pprint(text, line_length=100):
  """Print `text` wrapped so that each line holds at most `line_length` characters.

  The text is split on single spaces only, so any other whitespace inside a word
  (for example an embedded newline) is kept as-is. A single word longer than
  `line_length` is printed on a line of its own rather than being cut.

  Args:
    text: The string to wrap.
    line_length: Maximum number of characters per printed line.

  Returns:
    None. The wrapped text is written to stdout.
  """
  lines = []
  current_line = ''
  for word in text.split(' '):
    # Only put a separating space in front of the word when the line already has
    # content; otherwise the width check would count a space that is never printed.
    candidate = f'{current_line} {word}' if current_line else word
    if len(candidate) <= line_length:
      current_line = candidate
    else:
      # Skip empty lines: they would otherwise be emitted whenever an over-long
      # word arrives while the current line is still empty.
      if current_line.strip():
        lines.append(current_line.strip())
      current_line = word
  if current_line.strip():
    lines.append(current_line.strip())
  print('\n'.join(lines))

# URL of Tesla's 10-K filing
filing_10_k_url = 'https://www.sec.gov/Archives/edgar/data/1318605/000156459021004599/tsla-10k_20201231.htm'

# extract text section "Item 1 - Business" from 10-K
item_1_text = extractorApi.get_section(filing_10_k_url, '1', 'text')

# Preview the first 1000 characters. pprint() prints directly and returns None, so
# its result is not captured or echoed (doing so displayed nothing).
pprint(item_1_text[0:1000])

In [None]:
# Load several sections
# Items 1, 2 and 3 are requested from the same filing as plain text, in that order.
item_1_text, item_2_text, item_3_text = [
    extractorApi.get_section(filing_10_k_url, item_code, 'text')
    for item_code in ('1', '2', '3')
]

In [None]:
class Document:
    """Minimal container pairing a piece of text with a metadata mapping.

    Note: defining this class rebinds the name ``Document``, which the import
    cell also binds to LangChain's own ``Document`` class.

    Attributes:
        page_content: The text held by this document.
        metadata: The mapping supplied by the caller, or a new empty dict when
            none (``None``) was given.
    """

    def __init__(self, page_content, metadata=None):
        # A fresh dict is created per instance only when the caller passed None;
        # any mapping that was supplied is stored as-is (not copied).
        self.metadata = {} if metadata is None else metadata
        self.page_content = page_content

# Wrap each extracted section in a Document. Every document carries a `source`
# entry in its metadata: LangChain's "with sources" QA chain formats each retrieved
# document with a `{source}` placeholder and raises when that key is missing, which
# is what happened with the previous empty metadata dicts.
doc1 = Document(page_content=item_1_text, metadata={"source": f"{filing_10_k_url} (Item 1)"})
doc2 = Document(page_content=item_2_text, metadata={"source": f"{filing_10_k_url} (Item 2)"})
doc3 = Document(page_content=item_3_text, metadata={"source": f"{filing_10_k_url} (Item 3)"})

# Combine the documents into a list
combined_docs = [doc1, doc2, doc3]

# Split the sections into chunks of roughly 2000 characters with a 20-character
# overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=20)
docs = text_splitter.split_documents(combined_docs)



## Create a Graph Index

In [None]:
# Create the vector index only when it does not exist yet. Calling the creation
# procedure a second time fails because the index is already there, which made this
# cell break on every re-run of the notebook.
existing_index = graph.query(
    "SHOW INDEXES YIELD name WHERE name = 'TeslaEmbeddings' RETURN name"
)
if not existing_index:
    graph.query("""
CALL db.index.vector.createNodeIndex(
    'TeslaEmbeddings', //index name
    'Chunk', //node label
    'tesla_embedding', //property name
    1536, //vector size
    'cosine' //similarity metric
)
""")

## Use new index to store embeddings

In [None]:
# Exploratory aid: show the built-in help text for Neo4jVector.from_documents
# before it is called in the next cell. Not required for the pipeline to run.
help(Neo4jVector.from_documents)

In [None]:
# Embed the chunks in `docs` with OpenAI embeddings and store them in Neo4j under
# the "TeslaEmbeddings" index, requesting the "hybrid" search type.
hybrid_db = Neo4jVector.from_documents(
    docs,
    OpenAIEmbeddings(),
    index_name="TeslaEmbeddings",
    search_type="hybrid",
    url=url,
    username=username,
    password=password,
)

## Load the existing index as a vector store

In [None]:
# Name of the vector index created earlier in this notebook.
index_name = "TeslaEmbeddings"

# Open the existing index as a vector store instead of re-embedding the documents.
store = Neo4jVector.from_existing_index(
    OpenAIEmbeddings(),
    index_name=index_name,
    url=url,
    username=username,
    password=password,
)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Question-answering chain that retrieves from `store`; the "stuff" chain type is
# selected and the chat model runs with temperature 0.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    ChatOpenAI(temperature=0),
    retriever=store.as_retriever(),
    chain_type="stuff",
)

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

# Conversation memory stored under the "chat_history" key, returned as message objects.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Conversational retrieval chain over `store`; it shares the memory above, so every
# question asked through `qa` is added to the same chat history.
qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0),
    store.as_retriever(),
    memory=memory,
)

In [None]:
# Ask for a high-level list of topics and print only the answer text.
print(
    qa(
        {"question": "What are the topics covered in these documents?"}
    )["answer"]
)

In [None]:
# Keep the answer in `legal`; it is written to the graph in a later cell.
legal = qa(
    {"question": "What legal proceedings and potential penalties are discussed in these documents?"}
)["answer"]
print(legal)

In [None]:
# Keep the answer in `business_overview`; it is written to the graph in a later cell.
business_overview = qa(
    {"question": "Provide a business overview."}
)["answer"]
print(business_overview)

In [None]:
# Keep the answer in `regulatory`; it is written to the graph in a later cell.
regulatory = qa(
    {"question": "Which regulatory impacts and changes are discussed?"}
)["answer"]
print(regulatory)

In [None]:
# Keep the answer in `competition`; it is written to the graph in a later cell.
competition = qa(
    {"question": "What is discussed regarding competition?  Who are the competitors?"}
)["answer"]
print(competition)

## Update Neo4j

In [None]:
# Ensure a Company node for Tesla exists, then connect it to every Chunk node with
# an EMBEDDING relationship. MERGE is used for both the node and the relationships.
# The query returns the number of relationships matched or created.
q = """
MERGE (t:Company {company_name: 'Tesla'})
WITH t

MATCH (c:Chunk)
WITH c,t

MERGE (c)<-[l:EMBEDDING]-(t)
RETURN count(l)

"""
graph.query(q)

In [None]:
# Attach the generated legal summary to Tesla as a LEGAL_DESCRIPTION relationship
# pointing at a Legal node.
# The text is sent as the $description query parameter rather than interpolated
# into the Cypher string: generated answers can contain double quotes, backslashes
# or newlines, which would break (or inject into) a query built with an f-string.
q = """
MATCH (t:Company)
WHERE t.company_name = 'Tesla'
WITH t

MERGE (legal:Legal)
WITH legal, t

MERGE (legal)<-[l:LEGAL_DESCRIPTION {description: $description}]-(t)
RETURN count(l)

"""
graph.query(q, params={"description": legal})

In [None]:
# Attach the generated business overview to Tesla as a relationship pointing at a
# BusinessOverview node.
# The text is sent as the $description query parameter rather than interpolated
# into the Cypher string: generated answers can contain double quotes, backslashes
# or newlines, which would break (or inject into) a query built with an f-string.
# NOTE(review): the relationship type is spelled BUSIENSS_OVERVIEW. It is kept
# unchanged here because existing graph data and queries may rely on that
# spelling — confirm before renaming it to BUSINESS_OVERVIEW.
q = """
MATCH (t:Company)
WHERE t.company_name = 'Tesla'
WITH t

MERGE (BusinessOverview:BusinessOverview)
WITH BusinessOverview, t

MERGE (BusinessOverview)<-[l:BUSIENSS_OVERVIEW {description: $description}]-(t)
RETURN count(l)

"""
graph.query(q, params={"description": business_overview})

In [None]:
# Attach the generated regulatory summary to Tesla as a REGULATORY_IMPACTS
# relationship pointing at a RegulatoryImpacts node.
# The text is sent as the $description query parameter rather than interpolated
# into the Cypher string: generated answers can contain double quotes, backslashes
# or newlines, which would break (or inject into) a query built with an f-string.
q = """
MATCH (t:Company)
WHERE t.company_name = 'Tesla'
WITH t

MERGE (RegulatoryImpacts:RegulatoryImpacts)
WITH RegulatoryImpacts, t

MERGE (RegulatoryImpacts)<-[l:REGULATORY_IMPACTS {description: $description}]-(t)
RETURN count(l)

"""
graph.query(q, params={"description": regulatory})

In [None]:
# Attach the generated competition summary to Tesla as a COMPETITION_INFORMATION
# relationship pointing at a Competition node.
# The text is sent as the $description query parameter rather than interpolated
# into the Cypher string: generated answers can contain double quotes, backslashes
# or newlines, which would break (or inject into) a query built with an f-string.
q = """
MATCH (t:Company)
WHERE t.company_name = 'Tesla'
WITH t

MERGE (Competition:Competition)
WITH Competition, t

MERGE (Competition)<-[l:COMPETITION_INFORMATION {description: $description}]-(t)
RETURN count(l)

"""
graph.query(q, params={"description": competition})