# Wikipedia-based Q&A using conversational RAG

In [1]:
!pip install lxml



In [2]:
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Wikipedia page that serves as the knowledge source
url = "https://en.wikipedia.org/wiki/Uppsala_University"

# Heading levels to split on, <h1> through <h4>; each entry pairs the HTML
# tag with the metadata key under which the heading text is stored.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Download the page and split it along the headings configured above
html_header_splits = html_splitter.split_text_from_url(url)

# Optional second pass (currently disabled): re-split the header sections
# into size-bounded chunks.
# With chunk_size = 500, len(document_splits) was 159 for the Uppsala University page.
# chunk_size = 5000
# chunk_overlap = 30
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )
# document_splits = text_splitter.split_documents(html_header_splits)

In [3]:
# Sanity check: look at one of the header-based splits
print(html_header_splits[3])

page_content="As with most medieval universities, Uppsala University initially grew out of an ecclesiastical centre.[10] The archbishop of Uppsala had been one of the most important sees in Sweden proper since Christianity first spread to this region in the ninth century. Uppsala had also long been a hub for regional trade and had contained settlements dating back into the deep Middle Ages. As was also the case with most medieval universities, Uppsala had initially been chartered through a papal bull. Uppsala's bull, which granted the university its corporate rights, was issued by Pope Sixtus IV in 1477 and established several provisions. Among the most important of these was that the university was officially given the same freedoms and privileges as the University of Bologna. This included the right to establish the four traditional faculties of theology, law (Canon Law and Roman law), medicine, and philosophy, and to award the bachelor's, master's, licentiate, and doctoral degrees. 

In [4]:
# Number of header-based splits produced for the page
len(html_header_splits)

41

In [5]:
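# Disabled: document_splits only exists when the optional RecursiveCharacterTextSplitter pass in cell 2 is enabled.
# print(document_splits[20:25])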

In [6]:
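# Disabled: document_splits only exists when the optional RecursiveCharacterTextSplitter pass in cell 2 is enabled.
# len(document_splits)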

In [7]:
from langchain_community.embeddings import OllamaEmbeddings

# Embedding model served through Ollama; used below to vectorise the document splits.
# NOTE(review): "llama2" is the same model name used for text generation; a dedicated
# embedding model may give better retrieval — TODO evaluate.
embeddings = OllamaEmbeddings(model="llama2")

In [8]:
from langchain_community.vectorstores import FAISS

# Build the FAISS vector store from the header-based splits, embedded with `embeddings`
vector = FAISS.from_documents(html_header_splits, embeddings)

In [9]:
from langchain_community.llms import Ollama

# LLM served through Ollama; used both to generate search queries and to answer questions
llm = Ollama(model="llama2")

In [10]:
# Baseline: ask the bare LLM, with no retrieved context, for later comparison with the RAG answer
llm.invoke("How many nations are there at Uppsala University?")

"\nUppsala University is a multi-national university, with students and staff from all over the world. While the exact number of nations represented at the university can vary depending on the academic year, I can provide you with some general information on the diversity of the student body at Uppsala University.\n\nAccording to the university's website, international students make up around 20% of the student population. The largest groups of international students come from:\n\n1. Sweden: The majority of students at Uppsala University are Swedish citizens.\n2. Europe: Students from other European countries, such as Norway, Denmark, Finland, and Germany, make up a significant proportion of the student body.\n3. Asia: Students from countries such as China, Japan, South Korea, and India also attend Uppsala University.\n4. Africa: Students from countries such as Nigeria, Ghana, and Kenya are represented at the university.\n5. Other regions: Uppsala University also attracts students from

In [11]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder 


# Plain similarity retriever on top of the FAISS vector store
retriever = vector.as_retriever()

# Prompt that asks the LLM to condense the conversation so far, plus the
# newest user input, into a search query for the retriever.
prompt_search_query = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        (
            "user",
            "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation",
        ),
    ]
)

# Chain that takes the most recent input and the conversation history, uses
# the LLM to generate a search query, and runs that query through the retriever.
retriever_chain = create_history_aware_retriever(
    llm=llm, retriever=retriever, prompt=prompt_search_query
)

In [12]:
# Smoke-test the retriever chain on a first question, with no previous turns
retriever_chain.invoke(
    dict(chat_history=[], input="How many nations are there at Uppsala University?")
)

[Document(page_content="Uppsala University also hosts the Forum for South Asia Studies, a collaborative academic effort by its six faculties: Theology, Law, History and Philosophy, Social Sciences, Languages, and Educational Sciences. The Forum aims to facilitate and promote research and education related to the South Asian countries: India, Pakistan, Sri Lanka, Nepal, Bangladesh, Maldives and Afghanistan, on the national and international level, with Ferdinando Sardella, Faculty of Theology, serving as the Forum's director.[17]", metadata={'Header 1': 'Uppsala University', 'Header 2': 'Administration and organisation[edit]', 'Header 3': 'Faculties[edit]', 'Header 4': 'Other[edit]'}),
 Document(page_content='Through the division of faculties and the addition of a previously independent school of Pharmacy as a new faculty, the traditional four-faculty organization of European universities has evolved into the present nine faculties. The disciplinary domains and their faculties are as de

In [13]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Prompt for the answering step: the retrieved documents are inserted into
# {context}, followed by the conversation so far and the latest user input.
# (Fixes the typo "quesitions" in the system message sent to the LLM.)
prompt_answer = ChatPromptTemplate.from_messages([
    ("system", "Answer the user's questions based on the below context:\n\n{context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("user", "{input}"),
])

# Chain that passes the retrieved documents to the LLM through the prompt above
document_chain = create_stuff_documents_chain(llm, prompt_answer)

# Full conversational RAG chain: history-aware retrieval feeding the answering
# chain. The generated text is read from the "answer" key of its result.
retrieval_chain = create_retrieval_chain(retriever_chain, document_chain)

In [14]:
from langchain_core.messages import AIMessage, HumanMessage

# First turn: start from an empty conversation
human_input = "How many nations are there at Uppsala University?"
chat_history = []

ai_msg = retrieval_chain.invoke(dict(chat_history=chat_history, input=human_input))
print(ai_msg["answer"])

# Record the question and the answer in place so later turns can pass them back as history
chat_history += [
    HumanMessage(content=human_input),
    AIMessage(content=ai_msg["answer"]),
]

Based on the information provided in the text, there are 13 nations at Uppsala University. These include:

1. Stockholms nation
2. Uplands nation
3. Gästrike-Hälsinge nation
4. Östgöta nation
5. Västgöta nation
6. Södermanlands-Nerikes nation
7. Västmanlands-Dala nation
8. Smålands nation
9. Göteborgs nation
10. Kalmar nation
11. Värmlands nation
12. Norrlands nation
13. Gotlands nation


In [15]:
# Inspect the conversation history accumulated so far
chat_history

[HumanMessage(content='How many nations are there at Uppsala University?'),
 AIMessage(content='Based on the information provided in the text, there are 13 nations at Uppsala University. These include:\n\n1. Stockholms nation\n2. Uplands nation\n3. Gästrike-Hälsinge nation\n4. Östgöta nation\n5. Västgöta nation\n6. Södermanlands-Nerikes nation\n7. Västmanlands-Dala nation\n8. Smålands nation\n9. Göteborgs nation\n10. Kalmar nation\n11. Värmlands nation\n12. Norrlands nation\n13. Gotlands nation')]

In [16]:
# Smoke-test the retriever chain on a follow-up question, passing the accumulated history
retriever_chain.invoke(
    dict(chat_history=chat_history, input="Is it mandatory to join a nation?")
)

[Document(page_content="Uppsala University also hosts the Forum for South Asia Studies, a collaborative academic effort by its six faculties: Theology, Law, History and Philosophy, Social Sciences, Languages, and Educational Sciences. The Forum aims to facilitate and promote research and education related to the South Asian countries: India, Pakistan, Sri Lanka, Nepal, Bangladesh, Maldives and Afghanistan, on the national and international level, with Ferdinando Sardella, Faculty of Theology, serving as the Forum's director.[17]", metadata={'Header 1': 'Uppsala University', 'Header 2': 'Administration and organisation[edit]', 'Header 3': 'Faculties[edit]', 'Header 4': 'Other[edit]'}),
 Document(page_content='Through the division of faculties and the addition of a previously independent school of Pharmacy as a new faculty, the traditional four-faculty organization of European universities has evolved into the present nine faculties. The disciplinary domains and their faculties are as de

In [17]:
# Second turn: a follow-up that relies on the previous question for context
human_input = "Is it mandatory to join a nation?"

ai_msg = retrieval_chain.invoke(dict(chat_history=chat_history, input=human_input))
print(ai_msg["answer"])

# Append this turn to the shared history (in place)
chat_history += [
    HumanMessage(content=human_input),
    AIMessage(content=ai_msg["answer"]),
]

AI: According to the text, joining a nation is not mandatory for students at Uppsala University. It states that "The traditional four-faculty organization of European universities has evolved into the present nine faculties" and "The disciplinary domains and their faculties are as depicted below," without mentioning any requirement to join a nation. Therefore, it is not mandatory to join a nation at Uppsala University.


In [18]:
# Inspect the conversation history after the second turn
chat_history

[HumanMessage(content='How many nations are there at Uppsala University?'),
 AIMessage(content='Based on the information provided in the text, there are 13 nations at Uppsala University. These include:\n\n1. Stockholms nation\n2. Uplands nation\n3. Gästrike-Hälsinge nation\n4. Östgöta nation\n5. Västgöta nation\n6. Södermanlands-Nerikes nation\n7. Västmanlands-Dala nation\n8. Smålands nation\n9. Göteborgs nation\n10. Kalmar nation\n11. Värmlands nation\n12. Norrlands nation\n13. Gotlands nation'),
 HumanMessage(content='Is it mandatory to join a nation?'),
 AIMessage(content='AI: According to the text, joining a nation is not mandatory for students at Uppsala University. It states that "The traditional four-faculty organization of European universities has evolved into the present nine faculties" and "The disciplinary domains and their faculties are as depicted below," without mentioning any requirement to join a nation. Therefore, it is not mandatory to join a nation at Uppsala Univer

In [19]:
# Third turn: another follow-up question in the same conversation
human_input = "When did the compulsory membership end?"

ai_msg = retrieval_chain.invoke(dict(chat_history=chat_history, input=human_input))
print(ai_msg["answer"])

# Append this turn to the shared history (in place)
chat_history += [
    HumanMessage(content=human_input),
    AIMessage(content=ai_msg["answer"]),
]

The information provided in the text does not mention when the compulsory membership of nations ended at Uppsala University. In fact, the text does not provide any information on the topic of compulsory membership of nations at all. Therefore, I cannot answer this question based on the information provided.
