## Articles used to help create this file

In [None]:
# https://medium.com/neo4j/langchain-library-adds-full-support-for-neo4j-vector-index-fa94b8eab334
# https://medium.com/towards-data-science/efficient-semantic-search-over-unstructured-text-in-neo4j-8179ad7ff451
# https://frodnar.github.io/posts/2023-09-30_building_llm_chatbot_neo4j/
# https://python.langchain.com/docs/integrations/vectorstores/neo4jvector
# https://python.langchain.com/docs/use_cases/more/graph/graph_cypher_qa

In [None]:
# FIRST STEP - Load the database dump into Neo4j Aura DB 

## Load Required Libraries & Connect to APIs

In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Neo4jVector
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
import pandas as pd
from langchain.graphs import Neo4jGraph
import os
import openai
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience
from sklearn.decomposition import PCA
import plotly.express as px
import umap
from sklearn.cluster import KMeans
from langchain.text_splitter import RecursiveCharacterTextSplitter



In [2]:
# Connection settings are read from the environment so credentials are never
# stored in the notebook; the literals below are only fallbacks.
url = os.environ.get("NEO4J_URI", "neo4j+s://483c47f7.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "")

# LangChain graph wrapper. It verifies connectivity on construction, so a bad
# url or credential surfaces here as a ValueError.
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password,
)

  self._driver.verify_connectivity()


ValueError: Could not connect to Neo4j database. Please ensure that the url is correct

In [None]:
# Read the OpenAI key from the environment instead of hard-coding it in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = openai_api_key
# openai.Model.list()  # uncomment to check that the key is accepted

In [None]:
# Export the key through the environment as well; OpenAIEmbeddings() is later
# constructed without an explicit key.
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
# Raw Neo4j driver; get_dataframe and the summary write-back loop open their
# sessions from it.
driver = GraphDatabase.driver(url, auth=(username, password))
# NOTE(review): this module-level session is never closed, and no visible cell
# reads it (later code opens its own via `with driver.session()`). It looks
# safe to drop — confirm nothing outside this view depends on it.
session = driver.session()

# Begin Analysis

#### Grab a document and check to see if it's chunked or if the whole document has been embedded

In [None]:
def get_dataframe(query, params=None):
    """Run a Cypher query on the module-level ``driver`` and return a DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional dict of query parameters, referenced as ``$name`` in
            the Cypher text. Defaults to None (no parameters).

    Returns:
        pandas.DataFrame with one row per returned record and one column per
        returned key. An empty result yields an empty frame that still carries
        the column names.
    """
    # One short-lived session per call; the context manager closes it even if
    # the query raises.
    with driver.session() as session:
        result = session.run(query, params)
        rows = [record.values() for record in result]
        columns = result.keys()
    return pd.DataFrame(rows, columns=columns)

# Smoke test: fetch the internal id of one arbitrary node to confirm queries work.
df = get_dataframe("MATCH (a) RETURN id(a) LIMIT 1")
# The driver is deliberately left open: later cells reuse it, and using a
# driver after close() is deprecated in the Neo4j Python driver.
display(df)

In [None]:
# Total number of Document nodes in the graph.
q = "MATCH (a:Document) RETURN count(a) as Number_Documents LIMIT 1"

df = get_dataframe(q)
display(df)

In [None]:
# Documents per year, where the year is the first four characters of a.date.
q = """
MATCH (a:Document)
RETURN substring(toString(a.date), 0, 4) AS Year, count(a) AS Number_Documents_by_Year
ORDER BY Year
"""

df = get_dataframe(q)
display(df)

# Explore the topics

In [None]:
# The five topics with the most documents attached via IS_IN.
q = """
MATCH (a:Topic)<-[r:IS_IN]-(b:Document)
RETURN a.name as Topic_Name, count(b) as topic_count
ORDER BY topic_count DESC
LIMIT 5
"""

df = get_dataframe(q)
display(df)

In [None]:
# The five assignees with the most documents.
q = """
MATCH (a:Document)-[r:ASSIGNED_TO]->(b)
//WHERE b.name CONTAINS ('Bank')
RETURN b.name as Assignee_Name, count(a) as document_count
ORDER BY document_count DESC
LIMIT 5
"""

df = get_dataframe(q)
display(df)

In [None]:
# The five (assignee, topic) pairs with the most documents.
q = """
MATCH (c:Topic)<-[r2:IS_IN]-(a:Document)-[r:ASSIGNED_TO]->(b)
//WHERE b.name CONTAINS ('Bank')
RETURN b.name as Assignee_Name, c.name as Topic_Name, count(a) as document_count
ORDER BY document_count DESC
LIMIT 5
"""

df = get_dataframe(q)
display(df)

In [None]:
# Abstracts of documents assigned to Microsoft Technology Licensing.
# The pattern also matches the topic, so a document yields one row per topic it is in.
q = """
MATCH (c:Topic)<-[r2:IS_IN]-(a:Document)-[r:ASSIGNED_TO]->(b)
WHERE b.name = 'MICROSOFT TECHNOLOGY LICENSING, LLC'
RETURN a.abstract
"""

# Result is kept in df without being shown; the driver stays open for later cells.
df = get_dataframe(q)

In [None]:
# Show the first returned abstract in full.
print(df.at[0, 'a.abstract'])

# Gather data to send to ChatGPT

In [None]:
# Up to 300 'Machine Learning' patents with id, title, owner and abstract.
q = """
MATCH (c:Topic)<-[r2:IS_IN]-(a:Document)-[r:ASSIGNED_TO]->(b:Assignee)
WHERE c.name = 'Machine Learning'
RETURN id(a) as ida
, a.title as Patent_Title
, b.name as Patent_Owner
, a.abstract as Patent_Abstract
LIMIT 300
"""

# The driver is left open; the write-back and embedding cells still need it.
df = get_dataframe(q)
display(df.head())

In [None]:
def summarize_abstract(abstract_text, model="text-davinci-002", max_tokens=100):
    """Ask the OpenAI completion endpoint for a plain-language summary of a patent abstract.

    Args:
        abstract_text: Abstract to summarize.
        model: Model name passed to ``openai.Completion.create``.
        max_tokens: Upper bound on generated tokens; the same number is quoted
            in the prompt.

    Returns:
        The stripped completion text, or None when ``abstract_text`` is missing
        (None, NaN or blank). No API call is made in that case.
    """
    # A missing abstract would otherwise be interpolated into the prompt as
    # "None"/"nan" and summarized as if it were real text.
    if abstract_text is None or pd.isna(abstract_text) or not str(abstract_text).strip():
        return None

    response = openai.Completion.create(
        model=model,
        prompt=f"Summarize the following patent abstract in laymen's terms in fewer than {max_tokens} tokens: {abstract_text}",
        max_tokens=max_tokens,
    )
    return response.choices[0].text.strip()

# Summarize every abstract, one row at a time, and store the results in a new column.
df['Summary'] = [summarize_abstract(text) for text in df['Patent_Abstract']]

# Preview the first rows, now including the summaries.
display(df.head())

In [None]:
# Print the generated summaries without pandas truncating long text.
pd.options.display.max_colwidth = None
print(df['Summary'])

# Write the summaries back to Neo4j

In [None]:
def update_node_summary(session, node_id, summary):
    """Store ``summary`` on the Document node whose internal id equals ``node_id``.

    Args:
        session: Open Neo4j session used to run the statement.
        node_id: Value compared against ``id(a)`` to select the node.
        summary: Value written to the node's ``Summary`` property.
    """
    cypher = (
        "\n"
        "    MATCH (a:Document) \n"
        "    WHERE id(a) = $node_id\n"
        "    SET a.Summary = $summary\n"
        "    "
    )
    bindings = {"node_id": node_id, "summary": summary}
    session.run(cypher, **bindings)

# Write each generated summary back onto its Document node.
# One session is reused for all rows. The driver itself stays open because
# later cells still query through it.
with driver.session() as session:
    for node_id, summary in zip(df['ida'], df['Summary']):
        update_node_summary(session, node_id, summary)

# Create Embeddings

In [None]:
# Read back every Document that has a Summary, together with its internal id.
q = """MATCH (a:Document)
    WHERE a.Summary is not null
    RETURN id(a) as ida, a.Summary as Summary
    """

df = get_dataframe(q)
display(df)

In [None]:
# Plain Python list of summary strings, in DataFrame row order.
summaries = df['Summary'].to_list()

In [None]:
# Positions of summaries that are falsy (None, empty string) or NaN.
empty_indices = [
    position
    for position, text in enumerate(summaries)
    if not text or pd.isna(text)
]
print(empty_indices)

In [None]:
# Spot-check a single summary.
# NOTE(review): 140 is hard-coded — presumably a position noted during an
# earlier run; confirm. Raises IndexError when fewer than 141 summaries exist.
print(summaries[140])

In [None]:
# Splitter for the summaries: chunks of at most 10,000 characters (measured
# with len) and no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=0,
    length_function=len,
    add_start_index=True,
)

In [None]:
# Run the summaries through the splitter configured above; the result is what
# gets embedded and stored further down.
texts = text_splitter.create_documents(summaries)

In [None]:
# Number of items produced by the splitter.
len(texts)

In [None]:
# Number of input summaries, for comparison with len(texts).
len(summaries)

In [None]:
# Create the vector index only when it is not there yet, so this cell can be
# re-run: the create procedure errors if the index already exists.
existing_index = graph.query(
    "SHOW INDEXES YIELD name WHERE name = 'patent_summary_embeddings' RETURN name"
)
if not existing_index:
    graph.query("""
CALL db.index.vector.createNodeIndex(
  'patent_summary_embeddings', // index name
  'Chunk',                     // node label
  'embedding',                 // node property
   1536,                       // vector size
   'cosine'                    // similarity metric
)
""")

In [None]:
# Embed the summary chunks and store them as Chunk nodes.
# Index name, label and property are passed explicitly so they match the
# 'patent_summary_embeddings' index created on Chunk.embedding, instead of
# relying on library defaults.
neo4j_vector = Neo4jVector.from_documents(
    texts,
    OpenAIEmbeddings(),
    url=url,
    username=username,
    password=password,
    index_name="patent_summary_embeddings",
    node_label="Chunk",
    embedding_node_property="embedding",
)

# Query / Return Embeddings

In [None]:
# Read back every Chunk node: internal id, embedding vector and source text.
q = (
    "MATCH (a:Chunk) \n"
    "    RETURN id(a) as ida\n"
    "    , a.embedding as Embedding\n"
    "    , a.text as Text\n"
    "    "
)

df = get_dataframe(q)
# Release the driver's connections.
driver.close()
print(df)