# Wikipedia-based Q&A using conversational RAG

In [1]:
!pip install lxml



In [2]:
import os

from dotenv import load_dotenv

# Load settings via python-dotenv before reading anything from the environment.
load_dotenv()

# OpenAI API key taken from the environment; None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Wikipedia article used as the knowledge source for the rest of the notebook.
url = "https://en.wikipedia.org/wiki/Uppsala_University"

# (HTML tag, label) pairs for heading levels h1 through h4.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the page fetched from `url` on the headers listed above.
# For a local file, call html_splitter.split_text_from_file(<path_to_file>) instead.
html_header_splits = html_splitter.split_text_from_url(url)

# Optional second pass (currently disabled): break the header sections into
# fixed-size chunks. With chunk_size = 500 this gave len(document_splits) = 159
# for the Uppsala University page.
# chunk_size = 5000
# chunk_overlap = 30
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )
# document_splits = text_splitter.split_documents(html_header_splits)

In [4]:
#print(html_header_splits[2:5])

In [5]:
# print(document_splits[20:25])  # requires the commented-out RecursiveCharacterTextSplitter step above

In [6]:
# len(document_splits)  # requires the commented-out RecursiveCharacterTextSplitter step above

In [7]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used to build the FAISS vector store in the next cell.
# No arguments are passed, so the library defaults apply.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings()

In [8]:
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the header-based splits of the Wikipedia page,
# using the OpenAI embedding model created above. The store is later turned into
# a retriever via vector.as_retriever().
vector = FAISS.from_documents(html_header_splits, embeddings)

In [9]:
from langchain_openai import ChatOpenAI

# Chat model shared by every chain in this notebook: the plain baseline call,
# search-query generation, and final answer generation.
llm = ChatOpenAI(model="gpt-4")

In [10]:
# Baseline: ask the model directly, without the retriever or any chat history,
# for comparison with the retrieval-backed answers further down.
llm.invoke("How many nations are there at Uppsala University?")

AIMessage(content='There are 13 student nations at Uppsala University.', response_metadata={'finish_reason': 'stop', 'logprobs': None})

In [11]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Expose the FAISS store as a retriever.
retriever = vector.as_retriever()

# First we need a prompt that can be passed to the LLM to generate a search
# query from the conversation history followed by the latest user input.
prompt_search_query = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        (
            "user",
            "Given the above conversation, generate a search query to look up "
            "in order to get information relevant to the conversation",
        ),
    ]
)

# This chain takes the most recent input and the conversation history and uses
# the LLM to generate a search query for the retriever.
retriever_chain = create_history_aware_retriever(llm, retriever, prompt_search_query)

In [12]:
# Try the retriever chain on its own, starting from an empty conversation.
retriever_chain.invoke(
    dict(chat_history=[], input="How many nations are there at Uppsala University?")
)

[Document(page_content='Up until June 2010, students at Uppsala University were obliged to become members of one of the nations, corporations of students traditionally according to the province of origin (not strictly upheld now, for practical reasons). The system of dividing students into nations according to origin can ultimately be traced back to the nations at the medieval University of Paris and other early medieval universities, but the Uppsala nations appear only about 1630–1640, most likely under influence of the Landsmannschaften which existed at some of the German universities visited by Swedish students. In Sweden, nations exist only in Uppsala and Lund. The nations were originally seen as subversive organisations promoting less virtuous aspects of student life, but in 1663 the consistory made membership in a nation legal, each nation being placed under the inspectorship of a professor.  \nThe current thirteen nations all have a history stretching back to the early-to-mid 17

In [13]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Answering prompt: a system message carrying the retrieved documents in
# {context}, followed by the prior conversation and the newest user input.
# The instruction text is sent to the model verbatim, so the spelling mistake
# "quesitions" has been corrected to "questions".
prompt_answer = ChatPromptTemplate.from_messages([
    ("system", "Answer the user's questions based on the below context:\n\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
])

# Chain that combines the LLM with the answering prompt.
document_chain = create_stuff_documents_chain(llm, prompt_answer)

# Full pipeline: the history-aware retriever chain feeds documents to the
# document chain. The result is a mapping whose "answer" entry holds the reply
# (see how it is read in the cells below).
retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)

In [14]:
from langchain_core.messages import AIMessage, HumanMessage

# Start a fresh conversation and ask the first question.
chat_history = []
human_input = "How many nations are there at Uppsala University?"

# The chain returns a mapping; the generated reply is stored under "answer".
ai_msg = retrieval_chain.invoke(dict(chat_history=chat_history, input=human_input))
print(ai_msg["answer"])

# Record this turn so that follow-up questions can build on it.
chat_history += [
    HumanMessage(content=human_input),
    AIMessage(content=ai_msg["answer"]),
]

There are thirteen traditional nations at Uppsala University. However, since the 1960s there was a fourteenth nation, the Skånelandens nation, which was made redundant in 2010.


In [15]:
# Display the conversation recorded so far (one human/AI message pair per turn).
chat_history

[HumanMessage(content='How many nations are there at Uppsala University?'),
 AIMessage(content='There are thirteen traditional nations at Uppsala University. However, since the 1960s there was a fourteenth nation, the Skånelandens nation, which was made redundant in 2010.')]

In [16]:
# Try the retriever chain again, this time with the recorded conversation,
# using a follow-up question that does not name the university.
retriever_chain.invoke(
    dict(chat_history=chat_history, input="Is it mandatory to join a nation?")
)

[Document(page_content='Up until June 2010, students at Uppsala University were obliged to become members of one of the nations, corporations of students traditionally according to the province of origin (not strictly upheld now, for practical reasons). The system of dividing students into nations according to origin can ultimately be traced back to the nations at the medieval University of Paris and other early medieval universities, but the Uppsala nations appear only about 1630–1640, most likely under influence of the Landsmannschaften which existed at some of the German universities visited by Swedish students. In Sweden, nations exist only in Uppsala and Lund. The nations were originally seen as subversive organisations promoting less virtuous aspects of student life, but in 1663 the consistory made membership in a nation legal, each nation being placed under the inspectorship of a professor.  \nThe current thirteen nations all have a history stretching back to the early-to-mid 17

In [17]:
# Second turn: a follow-up question that relies on the earlier exchange.
human_input = "Is it mandatory to join a nation?"

ai_msg = retrieval_chain.invoke(dict(chat_history=chat_history, input=human_input))
print(ai_msg["answer"])

# Append this turn to the running history.
chat_history += [
    HumanMessage(content=human_input),
    AIMessage(content=ai_msg["answer"]),
]

No, it is not mandatory to join a nation. Up until June 2010, students at Uppsala University were obliged to join one of the nations. However, this requirement was abolished in 2010.


In [18]:
# Display the conversation again; it now also contains the second turn.
chat_history

[HumanMessage(content='How many nations are there at Uppsala University?'),
 AIMessage(content='There are thirteen traditional nations at Uppsala University. However, since the 1960s there was a fourteenth nation, the Skånelandens nation, which was made redundant in 2010.'),
 HumanMessage(content='Is it mandatory to join a nation?'),
 AIMessage(content='No, it is not mandatory to join a nation. Up until June 2010, students at Uppsala University were obliged to join one of the nations. However, this requirement was abolished in 2010.')]

In [19]:
# Third turn: another follow-up that depends on the preceding answers.
human_input = "When did the compulsory membership end?"

ai_msg = retrieval_chain.invoke(dict(chat_history=chat_history, input=human_input))
print(ai_msg["answer"])

# Append this turn to the running history.
chat_history += [
    HumanMessage(content=human_input),
    AIMessage(content=ai_msg["answer"]),
]

The compulsory membership in a student union ended on 1 July 2010.
