## This notebook streamlines the RAG (retrieval-augmented generation) pipeline

In [1]:
# LangChain's core runnables for orchestrating tasks in workflows
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
# LangChain's core components for building custom prompts, handling messages, and parsing outputs
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser

# Typing Imports
from typing import Tuple, List

# Integrating LangChain with Neo4j, which can be useful for tasks like combining graph databases and vector stores for advanced AI workflows.
# For example:
# We can use Neo4jGraph to retrieve structured graph data from Neo4j
# We can store and query document embeddings using Neo4jVector
# We can leverage LLMGraphTransformer to help the LLM reason about relationships within the graph
# We can use remove_lucene_chars to ensure that queries passed into Neo4j are well-formatted and don’t cause issues with search.
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Document Loaders and Text Splitters
# from langchain.document_loaders import WikipediaLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

# LangChain components that interface with OpenAI models
# ChatOpenAI handles interactive conversations with a language model
# OpenAIEmbeddings transform text into vectors, stores and compares the semantic meaning of user inputs or documents in a vector store like Neo4jVector.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Neo4j & Graph Visualization
# To establish a connection with a Neo4j database and handling the graph database by running Cypher queries, interacting with nodes and relationships
from neo4j import GraphDatabase
# To visually represent the graph data retrieved from Neo4j
from yfiles_jupyter_graphs import GraphWidget

# FAISS (Facebook AI Similarity Search) stores text embeddings and then retrieves similar documents based on a query
from langchain.vectorstores import FAISS

# Chains for QA by combining a retrieval mechanism (like FAISS) with a language model
from langchain.chains import RetrievalQA

# Miscellaneous
import os
import warnings
import textwrap

# Colab imports if running in Google Colab.
# Only a missing `google.colab` package is treated as "not in Colab"; any other
# failure is allowed to surface instead of being silently swallowed.
try:
  import google.colab
  from google.colab import output
except ImportError:
  pass
else:
  # Custom widget manager is needed for third-party widgets to render in Colab
  output.enable_custom_widget_manager()

warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
from torch.nn import CosineSimilarity
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


## Data loading

In [2]:
# Load the course catalogue from a CSV file located relative to the working directory.
# NOTE: `file_path` is a notebook-level name that the next cell rebinds.
file_path = 'Combined_course_data.csv'
course = pd.read_csv(file_path)
# Bare last expression: the notebook renders the frame (Title, Description, Subject columns).
course

Unnamed: 0,Title,Description,Subject
0,Introduction to Business Analytics,This course provides students with an introduc...,Computer Science
1,Business Analytics Immersion Programme,This course aims to equip students with a firs...,Computer Science
2,Econometrics Modeling for Business Analytics,This course provides the foundations to econom...,Computer Science
3,Data Management and Visualisation,This course aims to provide students with prac...,Computer Science
4,Feature Engineering for Machine Learning,This course covers topics that are important f...,Computer Science
...,...,...,...
1911,Introduction to Hyperledger Sovereign Identity...,"To the surprise of absolutely no one, trust is...",Computer Science
1912,A System View of Communications: From Signals ...,Have you ever wondered how information is tran...,Computer Science
1913,Scripting and Programming Foundations,Computer programs are abundant in many people'...,Computer Science
1914,Using GPUs to Scale and Speed-up Deep Learning,Training acomplex deep learning model with a v...,Data Science


In [3]:
# Load the Wikipedia article table from a CSV file located relative to the working directory.
# NOTE: this rebinds `file_path`, which the previous cell pointed at the course CSV.
file_path = 'wikidata.csv'
wikidata = pd.read_csv(file_path)
# Bare last expression: the notebook renders the frame (text, url, title columns).
wikidata

Unnamed: 0,text,url,title
0,"Becurtovirus is a genus of viruses, in the fam...",https://en.wikipedia.org/wiki/Becurtovirus,Becurtovirus
1,Cyprinivirus is a genus of viruses in the orde...,https://en.wikipedia.org/wiki/Cyprinivirus,Cyprinivirus
2,"Glossinavirus is a genus of viruses, in the fa...",https://en.wikipedia.org/wiki/Glossinavirus,Glossinavirus
3,"Ichtadenovirus is a genus of viruses, in the f...",https://en.wikipedia.org/wiki/Ichtadenovirus,Ichtadenovirus
4,"Lambdatorquevirus is a genus of viruses, in th...",https://en.wikipedia.org/wiki/Lambdatorquevirus,Lambdatorquevirus
...,...,...,...
131044,A non-blanching rash (NBR) is a skin rash that...,https://en.wikipedia.org/wiki/Non-blanching%20...,Non-blanching rash
131045,"In organic chemistry, the term cyanomethyl (cy...",https://en.wikipedia.org/wiki/Cyanomethyl,Cyanomethyl
131046,Remaiten is malware which infects Linux on emb...,https://en.wikipedia.org/wiki/Remaiten,Remaiten
131047,Gradient-enhanced kriging (GEK) is a surrogate...,https://en.wikipedia.org/wiki/Gradient-enhance...,Gradient-enhanced kriging


In [4]:
def _rows_to_content(frame):
    """Collapse each row of `frame` into a single "col: value | col: value" string."""
    column_names = frame.columns

    def _join_row(row):
        return ' | '.join([f"{name}: {row[name]}" for name in column_names])

    return frame.apply(_join_row, axis=1)


# Both source tables are reduced to a one-column ("content") frame of row strings
course_transformed = pd.DataFrame({"content": _rows_to_content(course)})
wikidata_transformed = pd.DataFrame({"content": _rows_to_content(wikidata)})

# Show a preview of each transformed table
print("Transformed Course Data:")
print(course_transformed.head())

print("\nTransformed Wikidata:")
print(wikidata_transformed.head())

Transformed Course Data:
                                             content
0  Title: Introduction to Business Analytics | De...
1  Title: Business Analytics Immersion Programme ...
2  Title: Econometrics Modeling for Business Anal...
3  Title: Data Management and Visualisation | Des...
4  Title: Feature Engineering for Machine Learnin...

Transformed Wikidata:
                                             content
0  text: Becurtovirus is a genus of viruses, in t...
1  text: Cyprinivirus is a genus of viruses in th...
2  text: Glossinavirus is a genus of viruses, in ...
3  text: Ichtadenovirus is a genus of viruses, in...
4  text: Lambdatorquevirus is a genus of viruses,...


In [5]:
# Load the title embeddings saved earlier as a NumPy .npy file.
# NOTE(review): presumably row i of this array belongs to row i of `wikidata` — confirm,
# since the first filter uses FAISS row ids to index the wikidata frame positionally.
wiki_embeddings_file = 'wiki_title_embeddings.npy'
wiki_title_embeddings = np.load(wiki_embeddings_file)
# Bare last expression: the notebook renders the array.
wiki_title_embeddings

array([[-0.01740786,  0.00442912, -0.09215238, ..., -0.02191604,
         0.07291625, -0.02235293],
       [-0.10091388,  0.0783674 , -0.04533364, ..., -0.1075331 ,
         0.04686709,  0.07207245],
       [-0.10018466, -0.00640676, -0.0114509 , ..., -0.14957273,
         0.06115797,  0.02614287],
       ...,
       [-0.03868212,  0.05411112,  0.00084907, ...,  0.01953804,
        -0.01381   , -0.04266216],
       [-0.09186076, -0.1078757 ,  0.04518463, ..., -0.042975  ,
        -0.03663828,  0.01403402],
       [-0.06280275,  0.0021886 , -0.00058878, ..., -0.0114022 ,
        -0.0395432 , -0.0105731 ]], dtype=float32)

## Functions

### First filter

In [6]:
# Build the FAISS index once at cell level so the filter function below can reuse it on every call.
# The index width is taken from the second axis of the loaded embedding array.
dimension = wiki_title_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(wiki_title_embeddings)

def wiki_title_filter_with_course_info(num_candidates, course_info, wikidata_transformed, model = SentenceTransformer('all-MiniLM-L6-v2')):
    """
    Filters the top relevant Wikipedia titles based on the course information.

    Parameters:
    - num_candidates (int): The number of top candidates to retrieve.
    - course_info (str): The course information text.
    - wikidata_transformed (DataFrame): The transformed DataFrame containing Wikipedia data.
      Rows are selected positionally with the ids returned by the module-level `faiss_index`.
    - model: Object whose `encode(text)` returns the query embedding. The default is
      constructed once, when this `def` statement is executed, and reused afterwards.

    Returns:
    - DataFrame: A DataFrame containing the top relevant Wikipedia entries, re-indexed from 0.
      It may hold fewer than `num_candidates` rows when the index has fewer vectors.
    """
    # Step 1: Encode the course info to create an embedding
    course_embedding = model.encode(course_info)

    # Step 2: Search for the most relevant Wikipedia titles using FAISS.
    # FAISS expects a 2-D float32 batch, so the single embedding is wrapped and cast.
    query_batch = np.asarray([course_embedding], dtype=np.float32)
    _, top_k_indices = faiss_index.search(query_batch, num_candidates)

    # FAISS pads missing results with -1; drop those so `iloc` does not
    # silently return the last row of the frame for them.
    hit_ids = top_k_indices[0]
    hit_ids = hit_ids[hit_ids >= 0]

    # Step 3: Filter the top Wikipedia entries
    top_wikidata = wikidata_transformed.iloc[hit_ids].reset_index(drop=True)

    return top_wikidata


### Second filter

In [7]:
import pandas as pd
from keybert import KeyBERT
import torch
from transformers import AutoTokenizer, AutoModel
from torch.nn import CosineSimilarity

# Define your utility functions
def encode_text(text, tokenizer, model):
    """Encode `text` with `model` and return the hidden state at sequence position 0.

    Parameters:
    - text (str): Text to encode; the tokenizer truncates it to 64 tokens.
    - tokenizer: Callable returning a batch object that supports `.to(device)` and `**` unpacking.
    - model: A torch module whose output exposes `last_hidden_state`.

    Returns:
    - Tensor: `last_hidden_state[:, 0, :]` (the original code labels this the CLS token
      embedding), computed without gradient tracking.
    """
    # Follow the model's own device instead of a notebook-level `device` variable,
    # so the function also works before the setup cell has been run.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64).to(target_device)

    # Inference only: skip building the autograd graph for every encoded text
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state[:, 0, :]  # CLS token embedding
    return embeddings

def extract_keywords(content, model):
    """Return the top keyphrases of `content` joined into one space-separated string.

    `model` must expose `extract_keywords(...)`; it is asked for the 5 best 3-word
    phrases out of 20 candidates, with English stop words removed and `use_maxsum`
    enabled. Element `[0]` of each returned item is taken as the phrase text.
    """
    scored_phrases = model.extract_keywords(
        content,
        keyphrase_ngram_range=(3, 3),
        stop_words='english',
        use_maxsum=True,
        nr_candidates=20,
        top_n=5,
    )
    phrases = [entry[0] for entry in scored_phrases]
    return " ".join(phrases)


def refine_user_query_1(query, kw):
    """Append the extracted keywords `kw` to the user `query`.

    The connecting phrase is separated from both parts by a space so the query
    and the keywords do not run into each other.
    """
    return query + " which has following keywords: " + kw

In [8]:
def get_top_candidates(course_info, user_query, top_500_wikidata, tokenizer, query_model, document_model, kw_model, top_n=50):
    """
    Ranks wiki documents based on similarity to the user query and returns the top candidates.

    Parameters:
    - course_info (str): Course text from which keywords are extracted to enrich the query.
    - user_query (str): The user's query.
    - top_500_wikidata (DataFrame): DataFrame with a 'content' column holding the pre-filtered wiki data.
    - tokenizer (Tokenizer): Tokenizer for encoding text.
    - query_model (Model): Model for encoding query text.
    - document_model (Model): Model for encoding document text.
    - kw_model: Keyword extractor passed to `extract_keywords`.
    - top_n (int): Number of top candidates to return.

    Returns:
    - List[Dict]: Dictionaries with keys "Content" and "Similarity Score", sorted by
      score in descending order and truncated to `top_n`.
    """
    # Enrich the user query with keywords from the course text, then encode it once
    merged_keywords = extract_keywords(course_info, kw_model)
    refined_query = refine_user_query_1(user_query, merged_keywords)
    query_embedding = encode_text(refined_query, tokenizer, query_model)

    scored_candidates = []

    # Only the 'content' column is needed, so iterate over it directly
    for content in top_500_wikidata['content']:
        doc_embedding = encode_text(content, tokenizer, document_model)

        # Cosine similarity along the feature axis; computed with the functional API so
        # this function does not rely on a `cosine_sim` object created in another cell.
        similarity_score = torch.nn.functional.cosine_similarity(
            query_embedding, doc_embedding, dim=1
        ).item()

        scored_candidates.append({
            "Content": content,
            "Similarity Score": similarity_score
        })

    # Stable sort, highest score first; ties keep their input order
    scored_candidates.sort(key=lambda candidate: candidate["Similarity Score"], reverse=True)

    return scored_candidates[:top_n]


### Third Filter

In [9]:
from rerankers import Reranker
import pandas as pd

# Initialize the cross-encoder ranker; rerank_with_cross_encoder below reads this module-level name.
# NOTE(review): the same assignment is repeated in the "RAG Functions" cell, which rebinds `ranker`.
ranker = Reranker('cross-encoder')

def rerank_with_cross_encoder(user_query, top_candidates_df, top_n=5):
    """
    Reranks the top articles based on the user query using a cross-encoder model.

    Parameters:
    - user_query (str): The user's query.
    - top_candidates_df (DataFrame): DataFrame with a "Content" column holding the candidate articles.
    - top_n (int): Number of top articles to return after reranking.

    Returns:
    - List[str]: A list containing the content of the top re-ranked articles.
      Empty when there are no candidates (the ranker is not invoked in that case).
    """
    # Prepare the documents and their IDs for the ranking function
    docs = top_candidates_df["Content"].tolist()
    if not docs:
        # Nothing to rank: avoid calling the model with an empty batch
        return []
    doc_ids = list(range(len(docs)))

    # Use the rank method to get scores and ranks for each document
    results = ranker.rank(query=user_query, docs=docs, doc_ids=doc_ids)

    # Order by the rank assigned by the ranker and keep the text of the first top_n
    ranked = sorted(results.results, key=lambda result: result.rank)
    top_articles = [result.document.text for result in ranked[:top_n]]

    return top_articles


Loading default cross-encoder model for language en
Default Model: mixedbread-ai/mxbai-rerank-base-v1
Loading TransformerRanker model mixedbread-ai/mxbai-rerank-base-v1 (this message can be suppressed by setting verbose=0)
No device set
Using device cpu
No dtype set
Using dtype torch.float32
Loaded model mixedbread-ai/mxbai-rerank-base-v1
Using device cpu.
Using dtype torch.float32.


### Chunking

In [None]:
from langchain.schema import Document

def load_text_file(filename):
    """Read `filename` as UTF-8 text and return its entire contents as one string."""
    with open(filename, mode="r", encoding="utf-8") as handle:
        return handle.read()

import nltk
import re
from typing import List
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Download NLTK data files (only needs to be done once); used by nltk.sent_tokenize below.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

# Initialize the Sentence Transformer model for semantic similarity.
# `model` is a module-level name that enhanced_split_documents reads when it compares chunks.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a suitable model

def enhanced_split_documents(final_document: str, chunk_size=512, min_chunk_size=256, max_chunk_size=768):
    """
    Splits the final_document into sentence-aligned chunks and adds a short word
    overlap between neighbouring chunks that are semantically dissimilar.

    Parameters:
    - final_document (str): The full text to split.
    - chunk_size (int): Target length of each chunk (in words). Accepted for interface
      compatibility; the current algorithm only uses the min/max bounds.
    - min_chunk_size (int): A chunk shorter than this keeps absorbing sentences even
      if that pushes it past max_chunk_size.
    - max_chunk_size (int): Word budget of a chunk once it has reached min_chunk_size.

    Returns:
    - List[Document]: Document objects whose metadata holds 'start_sentence' (inclusive),
      'end_sentence' (exclusive), 'chunk_length' (words, excluding any overlap) and
      'overlap_added' (bool).
    """
    # Tokenize the document into sentences
    sentences = nltk.sent_tokenize(final_document)

    chunks = []
    current_chunk = []
    current_length = 0

    for i, sentence in enumerate(sentences):
        # Count the number of words in the sentence
        sentence_length = len(re.findall(r'\w+', sentence))

        fits_budget = current_length + sentence_length <= max_chunk_size
        if fits_budget or current_length < min_chunk_size:
            # Either the sentence fits, or the chunk is still too small to close
            current_chunk.append(sentence)
            current_length += sentence_length
            continue

        # Close the current chunk (sentence i is not part of it).
        # The emptiness guard avoids emitting a blank chunk when min_chunk_size <= 0
        # and the very first sentence already exceeds max_chunk_size.
        if current_chunk:
            chunks.append(Document(
                page_content=' '.join(current_chunk),
                metadata={
                    'start_sentence': i - len(current_chunk),
                    'end_sentence': i,
                    'chunk_length': current_length,
                },
            ))

        # Start a new chunk with the current sentence
        current_chunk = [sentence]
        current_length = sentence_length

    # Add the last chunk
    if current_chunk:
        chunks.append(Document(
            page_content=' '.join(current_chunk),
            metadata={
                'start_sentence': len(sentences) - len(current_chunk),
                'end_sentence': len(sentences),
                'chunk_length': current_length,
            },
        ))

    # Semantic overlap. Snapshot the un-modified chunk texts first so that the
    # overlap prepended to chunk k never leaks into the comparison for chunk k+1,
    # and encode every chunk once instead of twice.
    original_texts = [chunk.page_content for chunk in chunks]
    chunk_embeddings = model.encode(original_texts) if len(chunks) > 1 else None

    for idx, chunk in enumerate(chunks):
        overlap_added = False
        if idx > 0:
            similarity = cosine_similarity(
                [chunk_embeddings[idx - 1]], [chunk_embeddings[idx]]
            )[0][0]

            if similarity < 0.7:
                # Low similarity: carry over the last 20 words of the previous chunk
                overlap_words = original_texts[idx - 1].split()[-20:]
                chunk.page_content = ' '.join(overlap_words) + ' ' + original_texts[idx]
                overlap_added = True
        chunk.metadata['overlap_added'] = overlap_added

    return chunks



In [11]:
def save_text_to_file(text, filename="final_document.txt"):
    """Write `text` to `filename` as UTF-8 (replacing any existing file) and print a confirmation."""
    with open(filename, mode="w", encoding="utf-8") as handle:
        handle.write(text)
    print(f"Document saved as {filename}")



In [12]:
def construct_document(course_info, top_articles):
    """Assemble one text document from the course description and the related articles.

    The course text comes first under "Course Overview:"; each article follows as a
    numbered "Related Article" section, with a linking sentence between consecutive
    articles (none after the last one).
    """
    parts = [f"Course Overview:\n{course_info}\n\n"]
    for position, article in enumerate(top_articles, start=1):
        parts.append(
            f"Related Article {position}:\n"
            f"Title: Article {position}\n"
            f"Content: {article}\n"
        )
        # Bridge text is only inserted when another article follows
        if position < len(top_articles):
            parts.append("\n\nIn addition, the following article provides insights:\n\n")
    return "".join(parts)



### RAG Functions

In [None]:
# Run the encoder models on the GPU when one is visible, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the three names below look like local checkpoint directories — confirm they exist
query_model = AutoModel.from_pretrained("query_model_scidocs").to(device)
document_model = AutoModel.from_pretrained("document_model_scidocs").to(device)
tokenizer = AutoTokenizer.from_pretrained("tokenizer_scidocs")
cosine_sim = CosineSimilarity(dim=1)
kw_model = KeyBERT()
# NOTE(review): `ranker` is also created in the "Third Filter" cell; this rebinds it
ranker = Reranker('cross-encoder')

# Credentials: a non-empty value already present in the environment is kept, so
# running this cell no longer wipes exported keys; otherwise the placeholder applies.
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY") or ""  # Apply your own key
os.environ["NEO4J_URI"] = os.environ.get("NEO4J_URI") or ''  # Apply your own URI
os.environ["NEO4J_USERNAME"] = os.environ.get("NEO4J_USERNAME") or "neo4j"  # by default or use your own
os.environ["NEO4J_PASSWORD"] = os.environ.get("NEO4J_PASSWORD") or ''  # Apply your own password

graph = Neo4jGraph(url=os.environ["NEO4J_URI"], username=os.environ["NEO4J_USERNAME"], password=os.environ["NEO4J_PASSWORD"])

llm=ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview") # gpt-4-0125-preview occasionally has issues but in theory you would want to use the most capable model to construct the graph
llm_transformer = LLMGraphTransformer(llm=llm)

embeddings = OpenAIEmbeddings()

class Entities(BaseModel):
    """Schema for the entity-extraction step: the names of the entities found in a text."""

    # Required list of entity names (no default value); this model is handed to
    # `llm.with_structured_output` to shape the LLM's reply.
    names: List[str] = Field(
        ...,
        description=(
            "All the course knowledge, teaching material, deliverable, expectation, "
            "level and assessment entities appear in the text"
        ),
    )

# Each tuple represents a message with a specific role and content; together they define how
# the messages are structured and formatted when interacting with the LLM.
# NOTE: the name `prompt` is rebound further down in this cell for the answer template;
# `entity_chain` is composed below, before that rebinding, so it keeps this extraction prompt.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are tasked with extracting specific entities from the text. Focus on course knowledge, teaching material, deliverable, expectation, level and assessment entities",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

# Pipe the prompt into the language model configured for structured output, so the reply
# is shaped as an `Entities` object.
entity_chain = prompt | llm.with_structured_output(Entities)

import re

def remove_lucene_chars(input: str) -> str:
    """
    Strip every character that is not an ASCII letter, a digit or whitespace.

    NOTE: this definition replaces the `remove_lucene_chars` imported from
    langchain_community at the top of the notebook.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', input)

def generate_full_text_query(input: str) -> str:
    """
    Build a fuzzy full-text search query from free text.

    The input is cleaned with `remove_lucene_chars` and split on whitespace; every
    word gets the `~2` suffix (tolerating about two changed characters) and the
    words are combined with AND. This helps map entities from user questions to
    database values while allowing some misspellings. Returns "" when no word is
    left after cleaning.
    """
    cleaned_words = remove_lucene_chars(input).split()
    fuzzy_terms = [f"{word}~2" for word in cleaned_words if word]
    return " AND ".join(fuzzy_terms)

def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question.

    Entities are extracted with `entity_chain`; for each one the full-text index
    'entity' is searched (top two nodes) and the relationships of the matched nodes
    are returned as "source - TYPE -> target" lines, excluding 'MENTIONS'
    relationships and capped at 50 lines per entity.

    Returns:
    - str: Newline-separated relationship lines for all entities, stripped of
      leading/trailing whitespace ("" when nothing was found).
    """
    result = ""
    entities = entity_chain.invoke({"question": question})

    for entity in entities.names:
        full_text_query = generate_full_text_query(entity)
        if not full_text_query:
            # Nothing searchable is left after cleaning; do not send an empty query to the index
            continue

        # Both directions are collected inside a CALL subquery that imports `node`.
        # Each branch of a UNION is an independent query, so without the explicit
        # `WITH node` the incoming-relationship branch would not be tied to the
        # matched node and would return relationships from anywhere in the graph.
        response = graph.query(
            """
            CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
            YIELD node, score
            CALL {
                WITH node
                MATCH (node)-[r]->(neighbor)
                WHERE type(r) <> 'MENTIONS'
                RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
                UNION ALL
                WITH node
                MATCH (neighbor)-[r]->(node)
                WHERE type(r) <> 'MENTIONS'
                RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
            }
            RETURN output
            LIMIT 50
            """,
            {"query": full_text_query},
        )

        # Append results
        result += "\n".join([el['output'] for el in response]) + "\n"

    return result.strip()

# Combine the structured (graph) and unstructured (vector search) results into one context string for the LLM
def retriever(question: str):
    """Return the context for `question`: graph relationships followed by matching passages.

    Prints the search query, then reads the module-level `vector_index` for the
    similarity search, so that index must exist before this is called.
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question)
    matches = vector_index.similarity_search(question)
    passages = "#Document ".join(match.page_content for match in matches)
    return (
        "Structured data:\n"
        f"{graph_context}\n"
        "Unstructured data:\n"
        f"{passages}\n"
        "    "
    )

# Prompt that condenses a chat history plus a follow-up question into one standalone question.
# It has two template variables: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# Turns (human, ai) turn pairs into the message objects used in the condense prompt
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Expand each (human, ai) pair into a HumanMessage followed by an AIMessage, in order."""
    messages = []
    for user_turn, assistant_turn in chat_history:
        messages.extend(
            (HumanMessage(content=user_turn), AIMessage(content=assistant_turn))
        )
    return messages

# Chooses the text used for retrieval: a condensed standalone question when the input
# carries chat history, otherwise the raw question. The input is expected to be a dict
# (it is read with `.get("chat_history")` and `["question"]`).
_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        # A separate ChatOpenAI instance without an explicit model name (not the `llm` object)
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

# Answer-generation template; it has two variables: {context} and {question}.
template = """You are an AI tutor assisting a student with a study plan for their course.

Context:
{context}

First, generate a structured study plan based on the information in the context. Be detailed and clear.

Then, provide an explanation about what information in the provided document did you use.

Question: {question}

Response:
Study Plan:
"""

# Create the prompt using the revised template.
# NOTE: this rebinds `prompt`, which earlier in this cell held the entity-extraction prompt;
# `entity_chain` was composed before this line and is therefore unaffected.
prompt = ChatPromptTemplate.from_template(template)



# LLM generation: two operations run in parallel on the input dict — retrieve the
# context and extract the question — and their results fill the answer prompt.
# The question is pulled out of the input dict explicitly; passing the input through
# unchanged would render the whole dict (e.g. "{'question': '...'}") into the prompt.
final_chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)


Loading default cross-encoder model for language en
Default Model: mixedbread-ai/mxbai-rerank-base-v1
Loading TransformerRanker model mixedbread-ai/mxbai-rerank-base-v1 (this message can be suppressed by setting verbose=0)
No device set
Using device cpu
No dtype set
Using dtype torch.float32
Loaded model mixedbread-ai/mxbai-rerank-base-v1
Using device cpu.
Using dtype torch.float32.


## RAG Pipeline

In [None]:


# User query for the study plan
user_query = "Can you help me make a study plan for this course?"

# Initialize a list to store rows of data for the DataFrame
results_data = []

# Loop-invariant objects are built once instead of on every iteration
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=20)

# To start from an empty database, delete all nodes and relationships
clear_db_query = """
    MATCH (n)
    DETACH DELETE n
    """

# structured_retriever queries a full-text index named 'entity'; create it if it is missing
# so the pipeline also works against a fresh database.
# NOTE(review): assumes baseEntityLabel=True labels entity nodes `__Entity__` and stores
# their name in `id` — confirm against the installed langchain_community version.
graph.query("CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

for i in range(2):
    # Randomly select a course document
    course_row = course_transformed.sample(n=1).iloc[0]
    course_document = course_row['content']

    # Filter the top 500 Wikidata entries related to the course document
    top_500_wikidata = wiki_title_filter_with_course_info(500, course_document, wikidata_transformed)

    # Get the top 50 candidates based on similarity scores
    top_50_candidates = get_top_candidates(
        course_document,
        user_query,
        top_500_wikidata,
        tokenizer,
        query_model,
        document_model,
        kw_model,
        50
    )

    # Sort and select the top 50 candidates
    top_50_candidates_df = pd.DataFrame(top_50_candidates).sort_values(by="Similarity Score", ascending=False).head(50)

    # Rerank the top 50 candidates using the cross-encoder model
    top_5_articles = rerank_with_cross_encoder(user_query, top_50_candidates_df, top_n=5)

    final_document = construct_document(course_document, top_5_articles)
    # Usage example with your existing variables
    #documents = enhanced_split_documents(final_document)

    raw_documents = [Document(page_content=final_document)]

    # Split the Document object into smaller chunks
    documents = text_splitter.split_documents(raw_documents)

    # NOTE(review): `vectorstore` and `qa_chain` are not used again in this cell; building
    # them embeds every chunk through the OpenAI API — consider removing if nothing else needs them.
    vectorstore = FAISS.from_documents(documents, embeddings)

    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever())

    graph_documents = llm_transformer.convert_to_graph_documents(documents)

    # Execute the query to clear the database before loading this course's graph
    graph.query(clear_db_query)

    graph.add_graph_documents(
        graph_documents,
        # Ensures that each entity in graph_documents is labeled with its base entity type
        baseEntityLabel=True,
        # Indicate that the source information (like the original document or context) should be included in the graph nodes or edges.
        include_source=True
    )

    # `retriever` reads this module-level name, so it must be (re)built before the chain is invoked
    vector_index = Neo4jVector.from_existing_graph(
        # Uses a model from OpenAI that converts text into vector embeddings which are used for vector-based search
        OpenAIEmbeddings(),
        # Search for similar words using a hybrid approach, combining both keyword-based and vector-based searches.
        search_type="hybrid",
        # Only nodes with the Document label will be indexed
        node_label="Document",
        # Within the node, we will return the 'text' property
        text_node_properties=["text"],
        embedding_node_property="embedding"
    )

    response = final_chain.invoke({"question": user_query})
    wrapped_response = textwrap.fill(response, width=80)

    print(wrapped_response)

    row = {
        "Course Information": course_document,
        "Response": wrapped_response
    }

    # Append the row data to the results list
    results_data.append(row)


        
    


In [28]:
# Convert results to a DataFrame; each row holds the "Course Information" and "Response"
# keys appended by the pipeline loop.
# NOTE(review): the saved output below this cell shows a "Document" column instead of
# "Response", so it appears to come from an earlier version of the loop — re-run to refresh.
results_df = pd.DataFrame(results_data)

# Display the DataFrame to check the results
results_df.head(2)

Unnamed: 0,Course Information,Document
0,Title: Thematic Systems Project I | Descriptio...,### Study Plan for Thematic Systems Project I ...
1,Title: Implementing Hangman Game Using Basics ...,### Study Plan for Implementing Hangman Game U...
