In [41]:
# imports

import os
import glob
import gradio as gr
from google.colab import userdata
from IPython.display import Markdown, display

In [16]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
MODEL = 'gpt-4o-mini'
db_name = 'vector_db'

In [13]:
openapikey = userdata.get('OPENAI_API_KEY')

os.environ["OPENAI_API_KEY"] = openapikey

In [21]:
pdf_path = "/content/Constitution.pdf"

# Load the PDF
loader = PyPDFLoader(pdf_path)
documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

print(f"Total number of chunks: {len(chunks)}")
print(f"Document metadata sample: {documents[0].metadata}")
print(f"First chunk content: {chunks[0].page_content[:500]}")

Total number of chunks: 225
Document metadata sample: {'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2023-03-10T15:40:40+05:00', 'title': '', 'author': 'Naveed Anjum', 'moddate': '2023-03-10T15:40:40+05:00', 'source': '/content/Constitution.pdf', 'total_pages': 225, 'page': 0, 'page_label': '1'}
First chunk content: THE 
CONSTITUTION 
OF THE 
ISLAMIC REPUBLIC 
OF 
PAKISTAN 
 
 
 
 
 
 
[As modified upto the  31st May, 2018] 
 
 
 
 
 
 
NATIONAL ASSEMBLY OF PAKISTAN


In [25]:
embeddings = OpenAIEmbeddings()

if os.path.exists(db_name):
  Chroma.delete_collection(db_name)

vectorstore = Chroma.from_documents(documents= chunks, embedding= embeddings, collection_name=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 225 documents


In [27]:
collection = vectorstore._collection
count = collection.count()

sample_embeddings = collection.get(limit=1, include=['embeddings'])['embeddings'][0]
dimensions = len(sample_embeddings)

print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

There are 225 vectors with 1,536 dimensions in the vector store


## Visualizing the Vector Store

Let's take a minute to look at the documents and their embedding vectors to see what's going on.

In [30]:
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']
doc_types = ["constitution" for _ in metadatas]
colors = ["blue" for _ in doc_types]

In [31]:
# We humans find it easier to visalize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [32]:
# Let's try 3D!

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## Time to use LangChain to bring it all together

In [33]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the GPT 3.5 LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [42]:
# Let's try a simple question

query = "Describe the constitution of Pakistan"
result = conversation_chain.invoke({"question": query})
display(Markdown(result["answer"]))

The key features of the Constitution of Pakistan include:

1. **Federal Structure**: Pakistan is defined as a Federal Republic, comprising provinces and territories, with a system that allows for the admission of new states and areas.

2. **Islam as the State Religion**: The Constitution declares Islam as the state religion and emphasizes the importance of Islamic principles in governance.

3. **Fundamental Rights**: It guarantees fundamental rights to the citizens, including equality before law, freedom of speech, freedom of religion, and protection against discrimination.

4. **Independence of Judiciary**: The Constitution ensures the independence of the judiciary to uphold justice and the rule of law.

5. **Objectives Resolution**: The principles set out in the Objectives Resolution are included as substantive provisions, guiding the governance of the state.

6. **Protection of Minorities**: Adequate provisions are made for the protection of the rights and interests of minorities, allowing them to freely practice their religions and develop their cultures.

7. **Democratic Principles**: The Constitution establishes a framework for democracy, emphasizing the role of elected representatives and the importance of social justice.

8. **Sovereignty and Territorial Integrity**: It underscores the sovereignty of Pakistan and the importance of maintaining the integrity of its territories.

9. **Promotion of Local Government**: The Constitution promotes the establishment of local government institutions to ensure effective governance at the grassroots level.

These features reflect the foundational principles upon which the governance and legal framework of Pakistan are built.

In [43]:
# set up a new conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

## Now we will bring this up in Gradio using the Chat interface -

A quick and easy way to prototype a chat with an LLM

In [44]:
def chat(question, history):
    result = conversation_chain.invoke({"question": question})
    return result["answer"]

In [45]:
view = gr.ChatInterface(chat, type="messages").launch(inbrowser=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a9706612b8e4e28b16.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
