In [1]:
# LangChain's core runnables for orchestrating tasks in workflows
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
# LangChain's core components for building custom prompts, handling messages, and parsing outputs
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser

# Typing Imports
from typing import Tuple, List

# Integrating LangChain with Neo4j, which can be useful for tasks like combining graph databases and vector stores for advanced AI workflows.
# For example:
# We can use Neo4jGraph to retrieve structured graph data from Neo4j
# We can store and query document embeddings using Neo4jVector
# We can leverage LLMGraphTransformer to help the LLM reason about relationships within the graph
# We can use remove_lucene_chars to ensure that queries passed into Neo4j are well-formatted and don’t cause issues with search.
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Document Loaders and Text Splitters
# from langchain.document_loaders import WikipediaLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

# LangChain components that interface with OpenAI models
# ChatOpenAI handles interactive conversations with a language model
# OpenAIEmbeddings turns text into vectors so that the semantic meaning of user inputs or documents can be stored and compared in a vector store such as Neo4jVector.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Neo4j & Graph Visualization
# To establish a connection with a Neo4j database and work with the graph by running Cypher queries against its nodes and relationships
from neo4j import GraphDatabase
# To visually represent the graph data retrieved from Neo4j
from yfiles_jupyter_graphs import GraphWidget

# FAISS (Facebook AI Similarity Search) stores text embeddings and then retrieves similar documents based on a query
from langchain.vectorstores import FAISS

# Chains for QA by combining a retrieval mechanism (like FAISS) with a language model
from langchain.chains import RetrievalQA

# Miscellaneous
import os
import warnings
import textwrap

# Colab-only setup.
# Outside Colab the `google.colab` package is not installed, so ImportError is
# the one failure that means "not running in Colab". The previous bare
# `except:` also swallowed KeyboardInterrupt/SystemExit and hid real errors.
# IN_COLAB records the outcome so later cells can branch on it if they need to.
try:
    import google.colab
    from google.colab import output
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    try:
        # Lets Colab render third-party Jupyter widgets
        # (presumably needed for GraphWidget - TODO confirm).
        output.enable_custom_widget_manager()
    except Exception as exc:
        # Still best-effort, as before, but the failure is no longer silent.
        print(f"Could not enable the Colab custom widget manager: {exc!r}")

# Silence every warning from here on. Note this also hides deprecation
# notices from the libraries imported above.
warnings.filterwarnings("ignore")


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


## Prepare: API credentials and Neo4j connection

In [None]:
# Credentials come from the environment instead of being typed into the
# notebook: values that are already set are kept (the previous version
# overwrote them with empty strings), and anything missing is requested
# interactively so no secret ends up saved in the .ipynb file.
import getpass

# "neo4j" is the default user name; it is only applied when none is set.
if not os.environ.get("NEO4J_USERNAME"):
    os.environ["NEO4J_USERNAME"] = "neo4j"

# Secrets are read with getpass (input is not echoed); the URI is not secret.
for _name, _ask in (
    ("OPENAI_API_KEY", getpass.getpass),
    ("NEO4J_URI", input),
    ("NEO4J_PASSWORD", getpass.getpass),
):
    if not os.environ.get(_name):
        os.environ[_name] = _ask(f"{_name}: ")

# Create a connection to the Neo4j database
# graph = Neo4jGraph()
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
)

In [3]:
import pandas as pd
import numpy as np
# Read the course catalogue CSV (relative path, resolved against the current
# working directory) into a DataFrame and show it as the cell output.
file_path = "Combined_course_data.csv"
course = pd.read_csv(filepath_or_buffer=file_path)
course

Unnamed: 0,Title,Description,Subject
0,Introduction to Business Analytics,This course provides students with an introduc...,Computer Science
1,Business Analytics Immersion Programme,This course aims to equip students with a firs...,Computer Science
2,Econometrics Modeling for Business Analytics,This course provides the foundations to econom...,Computer Science
3,Data Management and Visualisation,This course aims to provide students with prac...,Computer Science
4,Feature Engineering for Machine Learning,This course covers topics that are important f...,Computer Science
...,...,...,...
1911,Introduction to Hyperledger Sovereign Identity...,"To the surprise of absolutely no one, trust is...",Computer Science
1912,A System View of Communications: From Signals ...,Have you ever wondered how information is tran...,Computer Science
1913,Scripting and Programming Foundations,Computer programs are abundant in many people'...,Computer Science
1914,Using GPUs to Scale and Speed-up Deep Learning,Training acomplex deep learning model with a v...,Data Science


In [4]:
# Read the Wikipedia article dump (relative path, resolved against the current
# working directory) into a DataFrame and show it as the cell output.
# Note: this rebinds `file_path`, which previously pointed at the course CSV.
file_path = "wikidata.csv"
wikidata = pd.read_csv(filepath_or_buffer=file_path)
wikidata

Unnamed: 0,text,url,title
0,"Becurtovirus is a genus of viruses, in the fam...",https://en.wikipedia.org/wiki/Becurtovirus,Becurtovirus
1,Cyprinivirus is a genus of viruses in the orde...,https://en.wikipedia.org/wiki/Cyprinivirus,Cyprinivirus
2,"Glossinavirus is a genus of viruses, in the fa...",https://en.wikipedia.org/wiki/Glossinavirus,Glossinavirus
3,"Ichtadenovirus is a genus of viruses, in the f...",https://en.wikipedia.org/wiki/Ichtadenovirus,Ichtadenovirus
4,"Lambdatorquevirus is a genus of viruses, in th...",https://en.wikipedia.org/wiki/Lambdatorquevirus,Lambdatorquevirus
...,...,...,...
131044,A non-blanching rash (NBR) is a skin rash that...,https://en.wikipedia.org/wiki/Non-blanching%20...,Non-blanching rash
131045,"In organic chemistry, the term cyanomethyl (cy...",https://en.wikipedia.org/wiki/Cyanomethyl,Cyanomethyl
131046,Remaiten is malware which infects Linux on emb...,https://en.wikipedia.org/wiki/Remaiten,Remaiten
131047,Gradient-enhanced kriging (GEK) is a surrogate...,https://en.wikipedia.org/wiki/Gradient-enhance...,Gradient-enhanced kriging


## Embed each Wikipedia article title with SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Instantiate the sentence-embedding model used for encoding.
# 'all-MiniLM-L6-v2' is a small checkpoint picked for efficiency; a larger one can be swapped in if needed.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


In [None]:
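# Encode Wikipedia titles.
# Left commented out for reference: the next cell loads precomputed embeddings
# from 'wiki_title_embeddings.npy' instead of re-encoding every title.
#wiki_titles = wikidata['title'].tolist()
#wiki_title_embeddings = model.encode(wiki_titles, convert_to_tensor=True)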

In [None]:
# Title embeddings are cached on disk so the notebook does not have to
# re-encode every Wikipedia title on each run. If the cache file is missing
# (e.g. on a fresh checkout), the embeddings are computed once with `model`
# and saved, so Restart & Run All works without a pre-existing .npy file.
wiki_embeddings_file = 'wiki_title_embeddings.npy'
if os.path.exists(wiki_embeddings_file):
    wiki_title_embeddings = np.load(wiki_embeddings_file)
else:
    wiki_titles = wikidata['title'].tolist()
    # convert_to_numpy keeps the result the same type np.load returns above.
    wiki_title_embeddings = model.encode(wiki_titles, convert_to_numpy=True)
    np.save(wiki_embeddings_file, wiki_title_embeddings)

In [25]:
# Display the title embeddings as the cell output.
# NOTE(review): the saved output of this cell is a torch `tensor(...)`, whereas
# the cell that defines this variable uses np.load, which returns a NumPy
# array - the output presumably comes from an earlier run that used
# model.encode(..., convert_to_tensor=True). Re-run to refresh it.
wiki_title_embeddings

tensor([[-0.0174,  0.0044, -0.0922,  ..., -0.0219,  0.0729, -0.0224],
        [-0.1009,  0.0784, -0.0453,  ..., -0.1075,  0.0469,  0.0721],
        [-0.1002, -0.0064, -0.0115,  ..., -0.1496,  0.0612,  0.0261],
        ...,
        [-0.0387,  0.0541,  0.0008,  ...,  0.0195, -0.0138, -0.0427],
        [-0.0919, -0.1079,  0.0452,  ..., -0.0430, -0.0366,  0.0140],
        [-0.0628,  0.0022, -0.0006,  ..., -0.0114, -0.0395, -0.0106]])