In [1]:
# LangChain's core runnables for orchestrating tasks in workflows
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
# LangChain's core components for building custom prompts, handling messages, and parsing outputs
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser

# Typing Imports
from typing import Tuple, List

# Integrating LangChain with Neo4j, which can be useful for tasks like combining graph databases and vector stores for advanced AI workflows.
# For example:
# We can use Neo4jGraph to retrieve structured graph data from Neo4j
# We can store and query document embeddings using Neo4jVector
# We can leverage LLMGraphTransformer to help the LLM reason about relationships within the graph
# We can use remove_lucene_chars to ensure that queries passed into Neo4j are well-formatted and don’t cause issues with search.
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Document Loaders and Text Splitters
# from langchain.document_loaders import WikipediaLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter

# LangChain components that interface with OpenAI models
# ChatOpenAI handles interactive conversations with a language model
# OpenAIEmbeddings transform text into vectors, stores and compares the semantic meaning of user inputs or documents in a vector store like Neo4jVector.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Neo4j & Graph Visualization
# To establish a connection with a Neo4j database and handling the graph database by running Cypher queries, interacting with nodes and relationships
from neo4j import GraphDatabase
# To visually represent the graph data retrieved from Neo4j
from yfiles_jupyter_graphs import GraphWidget

# FAISS (Facebook AI Similarity Search) stores text embeddings and then retrieves similar documents based on a query
from langchain.vectorstores import FAISS

# Chains for QA by combining a retrieval mechanism (like FAISS) with a language model
from langchain.chains import RetrievalQA

# Miscellaneous
import os
import warnings
import textwrap

#colab imports if running in Google colab
try:
  import google.colab
  from google.colab import output
  output.enable_custom_widget_manager()
except:
  pass

warnings.filterwarnings("ignore")



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


### The purpose of this notebook is to get an result for the specific course Data-Driven Marketing. It is for us to understand the model behaviour and adjust it accordingly

## Initialization

In [None]:
os.environ["OPENAI_API_KEY"] = "" # Apply your own key
os.environ["NEO4J_URI"] = '' # Apply your own URI
os.environ["NEO4J_USERNAME"] = "neo4j" # by default or use your own
os.environ["NEO4J_PASSWORD"] = '' # Apply your own password

# Create a connection to the Neo4j database
# graph = Neo4jGraph()
graph = Neo4jGraph(url=os.environ["NEO4J_URI"], username=os.environ["NEO4J_USERNAME"], password=os.environ["NEO4J_PASSWORD"]) # Explicitly pass the connection details to Neo4jGraph

In [3]:
llm=ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview") # gpt-4-0125-preview occasionally has issues but in theory you would want to use the most capable model to construct the graph
llm_transformer = LLMGraphTransformer(llm=llm)

### An example on how the LLM will respond to a prompt which it has no knowledge on.

#### Here we reuse the query from User_query_based_filter.ipynb

In [4]:
print('Example of LLM without RAG process: \n')
response = llm("Can you help me make a study plan for the course Data-driven marketing?").content
wrapped_response = textwrap.fill(response, width=80)
print(wrapped_response)

Example of LLM without RAG process: 

Creating a study plan for a course like Data-driven Marketing is a great way to
ensure you cover all necessary material, understand the concepts thoroughly, and
apply them effectively. Here's a step-by-step guide to help you create a
comprehensive study plan:  ### 1. Understand the Course Structure - **Review the
Syllabus:** Start by thoroughly reviewing the course syllabus. Note down the key
topics, assignments, project work, and exams. - **Identify Key Modules:** Break
down the syllabus into main modules or sections. This could be based on weeks,
topics, or types of marketing strategies discussed.  ### 2. Set Clear Objectives
- **Learning Goals:** Define what you want to achieve by the end of the course.
This could range from understanding specific marketing models to being able to
analyze and interpret data effectively. - **Skill Development:** Identify the
skills you aim to develop, such as analytical thinking, data visualization, or
strategic 

## Data Loading

In [5]:
def load_text_file(filename):
    # Load text data from a .txt file
    with open(filename, "r", encoding="utf-8") as file:
        text = file.read()
    return text



In [7]:
from langchain.schema import Document

filename = "final_document.txt"
text_data = load_text_file(filename)

raw_documents = [Document(page_content=text_data)]

# Initialize the TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=20)

# Split the Document object into smaller chunks
documents = text_splitter.split_documents(raw_documents)

# Display the split documents
for i, doc in enumerate(documents, 1):
    print(f"Document Chunk {i}:\n{doc.page_content}\n")

Document Chunk 1:
Course Overview:
Title: Data-Driven Marketing | Description: In today’s environment, marketing or business analysts require tools and techniques to both quantify the strategic value of marketing initiatives, and to maximize marketing campaign performance. This course aims to teach students concepts, methods and tools to demonstrate the return on investment (ROI) of marketing activities and to leverage on data and marketing analytics to make better and more informed marketing decisions. Course topics covered include marketing performance management, marketing metrics, data management, market response and diffusion models, market and customer segmentation models, analytic marketing and value driven segmentation, digital media marketing analytics, etc. Students will have access to | Subject: Computer Science

Related Article 1:
Title: Article 1
Content: text: User behavior analytics (UBA) is a cybersecurity process about detection of insider threats, targeted attacks, an

## RAG Structure without graph

In [8]:
embeddings = OpenAIEmbeddings()

vectorstore = FAISS.from_documents(documents, embeddings)

In [10]:
# Print basic information
print(f"Type of vectorstore: {type(vectorstore)}")
print(f"Number of documents: {len(vectorstore.index_to_docstore_id)}")

# Print information about the underlying FAISS index
faiss_index = vectorstore.index
print(f"\nFAISS Index type: {type(faiss_index)}")
print(f"FAISS Index dimension: {faiss_index.d}")
print(f"Total number of vectors: {faiss_index.ntotal}")


# Print some example document IDs
print("\nExample document IDs:")
for i, doc_id in list(vectorstore.index_to_docstore_id.items())[:len(vectorstore.index_to_docstore_id)]:
    print(f"Index {i}: Document ID {doc_id}")


print('The last two vector embeddings stored in vectorstore:\n')
vectors = vectorstore.index.reconstruct_n(len(vectorstore.index_to_docstore_id)-2, 2)
print(vectors)

Type of vectorstore: <class 'langchain_community.vectorstores.faiss.FAISS'>
Number of documents: 10

FAISS Index type: <class 'faiss.swigfaiss_avx2.IndexFlatL2'>
FAISS Index dimension: 1536
Total number of vectors: 10

Example document IDs:
Index 0: Document ID ab929b79-2688-4ffa-9515-de0dbbca7863
Index 1: Document ID a6cba1f6-c64e-4577-9595-34142793a182
Index 2: Document ID 81872651-e924-4567-b0b8-b66861de37a1
Index 3: Document ID 1b1cb805-1a6e-4791-9754-40faf96ed201
Index 4: Document ID fea3289c-eb8a-4ec3-be29-8416f0acad41
Index 5: Document ID 2dd30020-91fd-4c30-a182-f09e348a9112
Index 6: Document ID ad1dd5b9-7caf-453b-82c5-bd490ee2aa6c
Index 7: Document ID 25e9b27e-e543-4544-9c74-87e82c9d834e
Index 8: Document ID 8e90d7c1-6bd3-4818-a1af-5a675749405e
Index 9: Document ID 6665f9c3-3319-486d-a867-734f21f07e2b
The last two vector embeddings stored in vectorstore:

[[ 0.00092509  0.0529552  -0.0011348  ... -0.01423323 -0.0100052
  -0.04031847]
 [ 0.00058459  0.01741397  0.01649816 ... -0

In [11]:
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever())

In [13]:
print('Demonstration of RAG response:\n')
#question_step3a = 'Who teaches R in the NUS course about business analytics, what else is taught and how are students graded?'
question_step3a = 'Can you help me make a study plan for the course?'

response = qa_chain.run(question_step3a)
wrapped_response = textwrap.fill(response, width=80)
print(wrapped_response)

Demonstration of RAG response:

Creating a study plan for the "Data-Driven Marketing" course involves breaking
down the course content into manageable sections, allocating time for each
topic, and incorporating review sessions. Here's a suggested study plan based on
the course overview and related topics. This plan assumes a 12-week course
duration, which can be adjusted based on your actual course timeline.  ### Week
1-2: Introduction to Data-Driven Marketing - **Objective:** Understand the
basics of data-driven marketing, its importance, and how it differs from
traditional marketing approaches. - **Activities:**   - Read course materials on
the strategic value of marketing initiatives.   - Watch introductory videos on
data-driven marketing.   - Participate in forum discussions about the role of
data in marketing decisions.  ### Week 3-4: Marketing Performance Management &
Metrics - **Objective:** Learn how to measure and manage marketing performance
using various metrics. - **Activit

### We can see that the response is refined based on the documents, it is more related to the documents now. However, we notice that the wiki data is not fully utilized, thus we intend to explore more.

## RAG with Graph

In [14]:
graph_documents = llm_transformer.convert_to_graph_documents(documents)

In [15]:
print(f"count of documents:{len(graph_documents)}")
print(f"count of nodes in the first document chunk:{len(graph_documents[3].nodes)}")
print(f"count of relationships in the first document chunk:{len(graph_documents[3].relationships)}")

count of documents:10
count of nodes in the first document chunk:10
count of relationships in the first document chunk:3


In [16]:
print("As shown below, each of the document is split into nodes and relationships:\n")

# Iterate through each item in graph_documents
for item in graph_documents[3].nodes:
    # Print details of the Node
    print(f"Node ID: {item.id}")
    print(f"Node Type: {item.type}")
    print(f"Node Properties: {item.properties}")
    print("-" * 50)  # Separator for clarity

for item in graph_documents[3].relationships:
    # Print details of the relationships
    print(f"Relationship from: {item.source.id} (Type: {item.source.type})")
    print(f"  to: {item.target.id} (Type: {item.target.type})")
    print(f"Relationship Type: {item.type}")
    print(f"Relationship Properties: {item.properties}")
    print("-" * 50)  # Separator for clarity

As shown below, each of the document is split into nodes and relationships:

Node ID: Status Epilepticus
Node Type: Condition
Node Properties: {}
--------------------------------------------------
Node ID: Tonic-Clonic Seizures
Node Type: Condition
Node Properties: {}
--------------------------------------------------
Node ID: Engel Classification System
Node Type: System
Node Properties: {}
--------------------------------------------------
Node ID: International League Against Epilepsy
Node Type: Organization
Node Properties: {}
--------------------------------------------------
Node ID: Ilae Rating Scale
Node Type: System
Node Properties: {}
--------------------------------------------------
Node ID: Unsupervised Learning
Node Type: Concept
Node Properties: {}
--------------------------------------------------
Node ID: Medical Terminology
Node Type: Concept
Node Properties: {}
--------------------------------------------------
Node ID: Neurological Disorders
Node Type: Concept
Node 

In [17]:
check_query = "MATCH (n) RETURN count(n) AS node_count"
result = graph.query(check_query)
for record in result:
    print(record["node_count"])  # Should print 0 if the database is empty

131


In [18]:
# To create a new database, you can use Cypher query to delete all nodes and relationships
clear_db_query = """
MATCH (n)
DETACH DELETE n
"""

# Execute the query to clear the database
graph.query(clear_db_query)

[]

In [19]:
graph.add_graph_documents(
    graph_documents,
    # Ensures that each entity in graph_documents is labeled with its base entity type
    baseEntityLabel=True,
    # Indicate that the source information (like the original document or context) should be included in the graph nodes or edges.
    include_source=True
)

In [20]:
default_cypher = "MATCH (s)-[r]->(t) WHERE toLower(s.id) CONTAINS 'data' OR toLower(t.id) CONTAINS 'data' RETURN s, r, t"
# You can try other query
# default_cypher = "MATCH (s)-[r:IDENTIFY]->(t) RETURN s,r,t LIMIT 50"

# Function to display graph structure
def showGraph(cypher: str = default_cypher):
    # Create a neo4j session to run queries
    driver = GraphDatabase.driver(
        uri = os.environ["NEO4J_URI"],
        auth = (os.environ["NEO4J_USERNAME"],
                os.environ["NEO4J_PASSWORD"]))
    session = driver.session()
    widget = GraphWidget(graph = session.run(cypher).graph())
    widget.node_label_mapping = 'id'
    return widget

showGraph()

GraphWidget(layout=Layout(height='500px', width='100%'))

In [21]:
showGraph("MATCH p=(d:Document)-[]->() RETURN p LIMIT 25 UNION MATCH p=()-[]->(d:Document) RETURN p;")

GraphWidget(layout=Layout(height='800px', width='100%'))

## Hybrid retriever

In [22]:
vector_index = Neo4jVector.from_existing_graph(
    # Uses a model from OpenAI that converts text into vector embeddings which are used for vector-based search
    OpenAIEmbeddings(),
    # Search for similar words using a hybrid approach, combining both keyword-based and vector-based searches.
    search_type="hybrid",
    # Only nodes with the Document label will be indexed
    node_label="Document",
    # Within the node, we will return the 'text' property
    text_node_properties=["text"],
    embedding_node_property="embedding"
)

In [23]:
print('Example of the output of similarity search:\n')
# By default the the method will return the top 4 most similar results.
# To tune this, we can add in a new parameter, k = number of results, in the similarity_search function.

def display_matching_strings(results, query_string):
  """Displays page_content only if it contains the query_string from the top 4 search results."""

  for doc in results[:4]:
      if query_string in doc.page_content:
          print("\n this is matching result: " + doc.page_content)
      else:
          print("\n there is no exactly matching strings in the result" + doc.page_content)

# The similarity_search method is used to retrieve documents or nodes based on their vector similarity to a given query.
results = vector_index.similarity_search('Justification', k=4)
# Please note that as we search node labeled as "Document",
# the retrieved results could be very tedious as they are the text relevant to the query_string)
display_matching_strings(results, 'Justification')

Example of the output of similarity search:






 there is no exactly matching strings in the result
text: Course Overview:
Title: Data-Driven Marketing | Description: In today’s environment, marketing or business analysts require tools and techniques to both quantify the strategic value of marketing initiatives, and to maximize marketing campaign performance. This course aims to teach students concepts, methods and tools to demonstrate the return on investment (ROI) of marketing activities and to leverage on data and marketing analytics to make better and more informed marketing decisions. Course topics covered include marketing performance management, marketing metrics, data management, market response and diffusion models, market and customer segmentation models, analytic marketing and value driven segmentation, digital media marketing analytics, etc. Students will have access to | Subject: Computer Science

Related Article 1:
Title: Article 1
Content: text: User behavior analytics (UBA) is a cybersecurity process about detection

## Structure retriever

In [45]:
class Entities(BaseModel):
    """Identifying information about entities."""

    # This line structures the output of the LLM to give a List of names.
    names: List[str] = Field(
        ...,
        description="All the course knowledge, teaching material, deliverable, expectation, level and assessment entities "
        "appear in the text",
    )

# Each tuple represents a message with a specific role and content that helps define how different messages should be strucutured
# and formatted when interacting with the llm.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are tasked with extracting specific entities from the text. Focus on course knowledge, teaching material, deliverable, expectation, level and assessment entities",
        ),
        (
            "human",
            "Use the given format to extract information from the following "
            "input: {question}",
        ),
    ]
)

# Combine the prompt template (prompt) with the language model that specifies that the output should be structured in a particular way, specifically to extract entitites.
entity_chain = prompt | llm.with_structured_output(Entities)

In [46]:
question_step3b2_2 = 'What is data driven marketing and how should i study it'

print('Demonstration that the entity chain can now extract the entities from the users query:\n')
print(entity_chain.invoke({"question": question_step3b2_2}).names)

Demonstration that the entity chain can now extract the entities from the users query:

['data driven marketing', 'study']


In [26]:
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS FOR (e:__Entity__) ON EACH [e.id]")

[]

In [27]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspelings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]
    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"
    return full_text_query.strip()

In [28]:
import re

def remove_lucene_chars(input: str) -> str:
    """
    Remove special characters that are not allowed in Lucene queries.
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', input)

def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    This function constructs a query string suitable for a full-text search.
    It processes the input string by splitting it into words and appending a
    similarity threshold (~2 changed characters) to each word, then combines
    them using the AND operator. Useful for mapping entities from user questions
    to database values, and allows for some misspellings.
    """
    full_text_query = ""
    words = [el for el in remove_lucene_chars(input).split() if el]

    if not words:
        return ""

    for word in words[:-1]:
        full_text_query += f" {word}~2 AND"
    full_text_query += f" {words[-1]}~2"

    return full_text_query.strip()

In [29]:
def structured_retriever(question: str) -> str:
    """
    Collects the neighborhood of entities mentioned
    in the question.
    """
    result = ""
    entities = entity_chain.invoke({"question": question})

    for entity in entities.names:
        # This Neo4j Cypher query performs a full-text search on nodes that have the required label, retrieving the top two matches
        # based on the search term provided. After this, the query then looks for relationships that point to or from this entity,
        # excluding relationships of type 'MENTIONS'.
        response = graph.query(
            """
            CALL db.index.fulltext.queryNodes('entity', $query, {limit: 2})
            YIELD node, score
            WITH node
            MATCH (node)-[r]->(neighbor)
            WHERE type(r) <> 'MENTIONS'
            RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
            UNION ALL
            MATCH (neighbor)-[r]->(node)
            WHERE type(r) <> 'MENTIONS'
            RETURN neighbor.id + ' - ' + type(r) + ' -> ' + node.id AS output
            LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )

        # Append results
        result += "\n".join([el['output'] for el in response]) + "\n"

    return result.strip()


In [None]:
question_step3b2_2 = 'what are the key knowledge behind data driven marketing?'
print('Example of the output of a structured retriever: \n')
print(structured_retriever(question_step3b2_2))

Example of the output of a structured retriever: 

Data-Driven Marketing - SUBJECT -> Computer Science
Status Epilepticus - RELATED_TO -> Tonic-Clonic Seizures
Engel Classification System - EVALUATED_BY -> International League Against Epilepsy
Ilae Rating Scale - DEVELOPED_BY -> International League Against Epilepsy
Unsupervised Learning - INCLUDES -> Neural Networks
Unsupervised Learning - INCLUDES -> Probabilistic Methods
Unsupervised Learning - INCLUDES -> Clustering
Unsupervised Learning - INCLUDES -> Anomaly Detection
Unsupervised Learning - INCLUDES -> Latent Variable Models
Unsupervised Learning - EXCLUDES -> Backpropagation
Unsupervised Learning - EMPLOYS -> Hopfield Learning Rule
Unsupervised Learning - EMPLOYS -> Boltzmann Learning Rule
Unsupervised Learning - EMPLOYS -> Contrastive Divergence
Unsupervised Learning - EMPLOYS -> Wake Sleep
Unsupervised Learning - EMPLOYS -> Variational Inference
Unsupervised Learning - EMPLOYS -> Maximum Likelihood
Unsupervised Learning - EMPL

In [31]:
# Define a function to combine both structured and unstructred data defined above into a prompt to be fed to the LLM
def retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""Structured data:
{structured_data}
Unstructured data:
{"#Document ". join(unstructured_data)}
    """
    return final_data

In [48]:
question_step3b2_3 = 'what are the key knowledge behind data driven marketing?'
print('Example of the output of final retriever: \n')
print(retriever(question_step3b2_3))

Example of the output of final retriever: 

Search query: what are the key knowledge behind data driven marketing?




Structured data:
Status Epilepticus - RELATED_TO -> Tonic-Clonic Seizures
Engel Classification System - EVALUATED_BY -> International League Against Epilepsy
Ilae Rating Scale - DEVELOPED_BY -> International League Against Epilepsy
Unsupervised Learning - INCLUDES -> Neural Networks
Unsupervised Learning - INCLUDES -> Probabilistic Methods
Unsupervised Learning - INCLUDES -> Clustering
Unsupervised Learning - INCLUDES -> Anomaly Detection
Unsupervised Learning - INCLUDES -> Latent Variable Models
Unsupervised Learning - EXCLUDES -> Backpropagation
Unsupervised Learning - EMPLOYS -> Hopfield Learning Rule
Unsupervised Learning - EMPLOYS -> Boltzmann Learning Rule
Unsupervised Learning - EMPLOYS -> Contrastive Divergence
Unsupervised Learning - EMPLOYS -> Wake Sleep
Unsupervised Learning - EMPLOYS -> Variational Inference
Unsupervised Learning - EMPLOYS -> Maximum Likelihood
Unsupervised Learning - EMPLOYS -> Maximum A Posteriori
Unsupervised Learning - EMPLOYS -> Gibbs Sampling
Unsuperv

## RAG Chain

In [33]:
# Condense a chat history and follow-up question into a standalone question
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

# Formats chat history to incorporate into a query for the LLM
def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

_search_query = RunnableBranch(
    # If input includes chat_history, we condense it with the follow-up question
    (
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),  # Condense follow-up question and chat into a standalone_question
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Else, we have no chat history, so just pass through the question
    RunnableLambda(lambda x : x["question"]),
)

In [50]:
demo_question = 'Can you help me make a study plan for the course?'

### Unstructure retriever

In [51]:
def just_unstructured_retriever(question: str):
    print(f"Search query: {question}")
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""
Unstructured data:
{"#Document ". join(unstructured_data)}
    """
    return final_data

# Prompt Augumentation: it instructs the model to answer a question using only the context provided.
template = """Answer the question based only on the following context:
{context}
Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# LLM Generation by running two operations in parallel: retrieve context and passthrough quesiton
unstructured_chain = (
    RunnableParallel(
        {
            "context": _search_query | just_unstructured_retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [52]:
print('Example of the retrieval output fed to the LLM: \n')
print(just_unstructured_retriever(demo_question))

Example of the retrieval output fed to the LLM: 

Search query: Can you help me make a study plan for the course?





Unstructured data:

text:  STDP). Hebbian Learning has been hypothesized to underlie a range of cognitive functions, such as pattern recognition and experiential learning.

Among neural network models, the self-organizing map (SOM) and adaptive resonance theory (ART) are commonly used in unsupervised learning algorithms. The SOM is a topographic organization in which nearby locations in the map represent inputs with similar properties. The ART model allows the number of clusters to vary with problem size and lets the user control the degree of similarity between members of the same clusters by means of a user-defined constant called the vigilance parameter. ART networks are used for many pattern recognition tasks, such as automatic target recognition and seismic signal processing.

Probabilistic methods 
Two of the main methods used in unsupervised learning are principal component and cluster analysis. Cluster analysis is used in unsupervised learning to group, or segment, datasets wi

In [53]:
print('Unstructured retrieval model response: \n')
response = unstructured_chain.invoke({"question": demo_question})
wrapped_response = textwrap.fill(response, width=80)
print(wrapped_response)

Unstructured retrieval model response: 

Search query: Can you help me make a study plan for the course?




Certainly! Here's a concise study plan for your Data-Driven Marketing course:
1. **Week 1-2: Foundations of Data-Driven Marketing**    - Focus on
understanding the basic concepts and the strategic value of marketing
initiatives.    - Read about marketing performance management and marketing
metrics.  2. **Week 3-4: Data Management and Analysis**    - Dive into data
management techniques, learning how to organize and interpret marketing data.
- Practice with real datasets if possible, focusing on market response and
diffusion models.  3. **Week 5-6: Market and Customer Segmentation**    - Study
market and customer segmentation models.    - Apply these concepts to case
studies or real-world marketing scenarios.  4. **Week 7-8: Analytic Marketing
and Value-Driven Segmentation**    - Explore advanced topics in analytic
marketing.    - Work on projects that require you to segment markets based on
value-driven parameters.  5. **Week 9-10: Digital Media Marketing Analytics**
- Learn about the

### Structured RAG

In [54]:
def just_structured_retriever(question: str):
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
   # unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    final_data = f"""Structured data:
{structured_data}
    """
    return final_data

# Prompt Augumentation: it instructs the model to answer a question using only the context provided.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# LLM Generation by running two operations in parallel: retrieve context and passthrough quesiton
structured_chain = (
    RunnableParallel(
        {
            "context": _search_query | just_structured_retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [55]:
print('Example of the retrieval output fed to the LLM: \n')
print(just_structured_retriever(demo_question))

Example of the retrieval output fed to the LLM: 

Search query: Can you help me make a study plan for the course?
Structured data:
Status Epilepticus - RELATED_TO -> Tonic-Clonic Seizures
Engel Classification System - EVALUATED_BY -> International League Against Epilepsy
Ilae Rating Scale - DEVELOPED_BY -> International League Against Epilepsy
Unsupervised Learning - INCLUDES -> Neural Networks
Unsupervised Learning - INCLUDES -> Probabilistic Methods
Unsupervised Learning - INCLUDES -> Clustering
Unsupervised Learning - INCLUDES -> Anomaly Detection
Unsupervised Learning - INCLUDES -> Latent Variable Models
Unsupervised Learning - EXCLUDES -> Backpropagation
Unsupervised Learning - EMPLOYS -> Hopfield Learning Rule
Unsupervised Learning - EMPLOYS -> Boltzmann Learning Rule
Unsupervised Learning - EMPLOYS -> Contrastive Divergence
Unsupervised Learning - EMPLOYS -> Wake Sleep
Unsupervised Learning - EMPLOYS -> Variational Inference
Unsupervised Learning - EMPLOYS -> Maximum Likelihood


In [56]:
print('Structured retrieval model response: \n')
response = structured_chain.invoke({"question": demo_question})
wrapped_response = textwrap.fill(response, width=80)
print(wrapped_response)

Structured retrieval model response: 

Search query: Can you help me make a study plan for the course?
Certainly! Here's a concise study plan for your course based on the provided
context:  1. **Introduction to Unsupervised Learning**:    - Understand the
basics of unsupervised learning, including its definition and how it contrasts
with supervised learning.    - Explore the types of problems unsupervised
learning aims to solve, such as clustering, anomaly detection, and density
estimation.  2. **Key Concepts and Algorithms**:    - Dive into the various
algorithms employed in unsupervised learning, including the Hopfield Learning
Rule, Boltzmann Learning Rule, Contrastive Divergence, and others like
Variational Inference and Gibbs Sampling.    - Study the applications and
limitations of each algorithm.  3. **Neural Networks and Probabilistic
Methods**:    - Learn about the role of neural networks and probabilistic
methods within unsupervised learning.    - Understand the exclusion of
b

### Hybrid Retriever

In [57]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# LLM Generation by running two operations in parallel: retrieve context and passthrough quesiton
final_chain = (
    RunnableParallel(
        {
            "context": _search_query | retriever,
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [58]:
print('Example of the retrieval output fed to the LLM: \n')
print(retriever(demo_question))

Example of the retrieval output fed to the LLM: 

Search query: Can you help me make a study plan for the course?




Structured data:
Status Epilepticus - RELATED_TO -> Tonic-Clonic Seizures
Engel Classification System - EVALUATED_BY -> International League Against Epilepsy
Ilae Rating Scale - DEVELOPED_BY -> International League Against Epilepsy
Unsupervised Learning - INCLUDES -> Neural Networks
Unsupervised Learning - INCLUDES -> Probabilistic Methods
Unsupervised Learning - INCLUDES -> Clustering
Unsupervised Learning - INCLUDES -> Anomaly Detection
Unsupervised Learning - INCLUDES -> Latent Variable Models
Unsupervised Learning - EXCLUDES -> Backpropagation
Unsupervised Learning - EMPLOYS -> Hopfield Learning Rule
Unsupervised Learning - EMPLOYS -> Boltzmann Learning Rule
Unsupervised Learning - EMPLOYS -> Contrastive Divergence
Unsupervised Learning - EMPLOYS -> Wake Sleep
Unsupervised Learning - EMPLOYS -> Variational Inference
Unsupervised Learning - EMPLOYS -> Maximum Likelihood
Unsupervised Learning - EMPLOYS -> Maximum A Posteriori
Unsupervised Learning - EMPLOYS -> Gibbs Sampling
Unsuperv

In [59]:
print('Hybrid retrieval model response: \n')
response = final_chain.invoke({"question": demo_question})
wrapped_response = textwrap.fill(response, width=80)
print(wrapped_response)

Hybrid retrieval model response: 

Search query: Can you help me make a study plan for the course?




Certainly! Here's a concise study plan for your Data-Driven Marketing course:
1. **Week 1-2: Introduction to Data-Driven Marketing**    - Understand the
basics of data-driven marketing.    - Explore marketing performance management
and marketing metrics.  2. **Week 3-4: Data Management and Analysis**    - Dive
into data management techniques.    - Learn about market response and diffusion
models.  3. **Week 5-6: Market and Customer Segmentation**    - Study market and
customer segmentation models.    - Practice identifying segments using
unsupervised learning methods like clustering.  4. **Week 7-8: Analytic
Marketing and Segmentation**    - Explore advanced analytic marketing
strategies.    - Understand value-driven segmentation and its application.  5.
**Week 9-10: Digital Media Marketing Analytics**    - Learn about digital media
marketing analytics.    - Study the role of unsupervised learning in analyzing
digital marketing data.  6. **Week 11-12: Leveraging Big Data for Marketing*

In [61]:
question_step3b3_2 = "Can you explain more about the connection between data-driven marketing and unsupervised learning, what kind of unsupervised learning is used for data-driven marketing"
previous_qn = demo_question
previous_res = wrapped_response
final_chain.invoke(
    {
        "question": question_step3b3_2,
        "chat_history": [(previous_qn, previous_res)],
    }
)

Search query: What kind of unsupervised learning is used for data-driven marketing and can you explain more about the connection between data-driven marketing and unsupervised learning?




'In the context of data-driven marketing, unsupervised learning is primarily utilized to uncover hidden patterns and insights from marketing data without pre-existing labels. This approach is crucial for segmenting markets and customers, identifying distinct groups within the data based on shared attributes or behaviors. Techniques such as clustering are commonly employed in this domain to group consumers with similar characteristics or purchasing behaviors, enabling marketers to tailor their strategies more effectively. Additionally, unsupervised learning methods like anomaly detection can help in identifying outliers or unusual patterns in the data, which could signify emerging trends, fraud, or untapped market segments. By leveraging these unsupervised learning techniques, data-driven marketing can achieve more personalized marketing campaigns, efficient targeting, and improved customer understanding, all of which contribute to maximizing the ROI of marketing activities.'