In [7]:
import os
import warnings
from dotenv import load_dotenv

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
warnings.filterwarnings("ignore")



In [8]:
import config

api_key = config.LANGCHAIN_API_KEY
project = config.LANGCHAIN_PROJECT
endpoint = config.LANGCHAIN_ENDPOINT
tracing_v2 = config.LANGCHAIN_TRACING_V2


# Document Load

In [9]:
import os

pdfs = [
    os.path.join(root, file)
    for root, dirs, files in os.walk('Data')
    for file in files
    if file.endswith('.pdf')
]

print(pdfs)


['Data/26th Ammedment.pdf', 'Data/till 25th amendment.pdf']


In [10]:
from langchain_community.document_loaders import PyMuPDFLoader

docs = []
for pdf in pdfs:
    loader = PyMuPDFLoader(pdf)
    pages = loader.load()

    docs.extend(pages)

# Document Chunking


In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

chunks = text_splitter.split_documents(docs)

In [12]:
len(docs), len(chunks)


(240, 651)

In [13]:
len(docs[0].page_content), len(chunks[0].page_content)


(1657, 969)

In [14]:
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4o-mini")

len(encoding.encode(docs[0].page_content)), len(encoding.encode(chunks[0].page_content))

(418, 246)

# Document Vector Embedding


In [15]:
from langchain_ollama import OllamaEmbeddings

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [16]:
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")

single_vector = embeddings.embed_query("this is some text data")


In [17]:
index = faiss.IndexFlatL2(len(single_vector))
index.ntotal, index.d

(0, 768)

In [18]:
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

In [19]:
ids = vector_store.add_documents(documents=chunks)


In [20]:
vector_store.index_to_docstore_id
len(ids)

651

# Retrivel

In [21]:
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs = {'k': 3, 
                                                                          'fetch_k': 100,
                                                                          'lambda_mult': 1})

# RAG with DEEPSEEK 1.5B on OLLAMA


In [22]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

from langchain_ollama import ChatOllama

In [23]:
#model = ChatOllama(model="deepseek-r1:1.5b", base_url="http://localhost:11434")

#model.invoke("hi")

In [24]:
prompt = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    - If the answer is not within the context, clearly state that you don't know.
    - Refuse to answer questions that are irrelevant to the provided context.
    - Answer in bullet points for clarity.
    - Make sure your answer is relevant to the question and derived solely from the given data that I provided.
    Question: {question}
    Context: {context}
    Answer:
"""


prompt = ChatPromptTemplate.from_template(prompt)

In [25]:
def format_docs(docs):
    return "\n\n".join([doc.page_content for doc in docs])

# print(format_docs(docs))

In [None]:
#question = "what is used to increase mass of the Earth?"
#question = "What is Amendment of Article 175A? ) For appointment cf Judges of the Supreme Court"
#output = rag_chain.invoke(question)
#print(output)

In [None]:
# # store vector database
#db_name = "Pak_CONSTITUTION"
#vector_store.save_local(db_name)

# # load vector database
# new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)
# len(new_vector_store.index_to_docstore_id)

# llama3-1B


In [27]:
model_llama = ChatOllama(model="llama3.2:1b", base_url="http://localhost:11434")

model_llama.invoke("hi")

AIMessage(content='How can I help you today?', additional_kwargs={}, response_metadata={'model': 'llama3.2:1b', 'created_at': '2025-02-10T10:14:24.934577578Z', 'done': True, 'done_reason': 'stop', 'total_duration': 1940421463, 'load_duration': 1747490649, 'prompt_eval_count': 26, 'prompt_eval_duration': 122000000, 'eval_count': 8, 'eval_duration': 69000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-2f4eec5d-1e5a-4c96-8e70-7abb51472ecf-0', usage_metadata={'input_tokens': 26, 'output_tokens': 8, 'total_tokens': 34})

In [29]:
rag_chain = (
    {"context": retriever|format_docs, "question": RunnablePassthrough()}
    | prompt
    | model_llama
    | StrOutputParser()
)

In [30]:
#question = "what is used to increase mass of the Earth?"
question = "What is Amendment of Article 175A? ) For appointment of Judges of the Supreme Court also share when this amendment happened"
output = rag_chain.invoke(question)
print(output)

I can provide the information in the following bullet points:

• Amendment to Article 175A related to appointment of judges for the Supreme Court is not mentioned.
• The context provided focuses on constitutional amendments, High Courts, and the Supreme Court, specifically Articles 202A, 177, and 179.


In [31]:
import faiss
import pickle
import streamlit as st

# Directory to save the model
MODEL_DIR = "trained_model"
os.makedirs(MODEL_DIR, exist_ok=True)

# Save FAISS index
faiss.write_index(index, os.path.join(MODEL_DIR, "faiss_index.bin"))

# Save metadata (document mappings)
with open(os.path.join(MODEL_DIR, "faiss_metadata.pkl"), "wb") as f:
    pickle.dump(vector_store.index_to_docstore_id, f)

st.success("Model trained and saved successfully!")


2025-02-10 10:15:08.496 
  command:

    streamlit run /home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()