# **Installing required dependencies**

In [None]:
!pip install PyMuPDF
!pip install faiss-gpu
!pip install transformers
!pip install pdfplumber
!pip install sentence-transformers
!pip install bitsandbytes
!pip install huggingface_hub
!pip install flask
!pip install flask-cors
!pip install pyngrok



Collecting PyMuPDF
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

# **Importing necessary libraries and Authenticating with Hugging Face Hub**

In [None]:
import os
from getpass import getpass

import faiss
import pdfplumber
import torch
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Authenticate with Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable, falling back to an
# interactive prompt, so the secret is never stored in the notebook itself.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)



# **Extracting text from PDF**

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string made of each page's extracted text, joined without any
        separator. Pages that yield no text contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may hand back None/empty for a page with no extractable text
        # (e.g. a scanned image); `or ''` keeps such a page from raising TypeError.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # Join once instead of growing a string page by page.
    return ''.join(page_texts)

# Location of the PDF to ingest; the value below is a placeholder that must be edited.
pdf_path = " " # Replace with the path to your PDF
# Full text of the document; this is what gets chunked and embedded further down.
document_text = extract_text_from_pdf(pdf_path)
print("Extracted Text (Preview):", document_text[:500])  # Show only the first 500 characters as a sanity check


Extracted Text (Preview): THE CENTRAL MOTOR VEHICLES RULES, 19891
CHAPTER I
PRELIMINARY
1. Short title and commencement.—(1) These rules may be called the Central Motor
Vehicle Rules, 1989.
(2) Save as otherwise provided in sub-rule (3) 2[and sub-rule (2) of rule 103,] these rules
shall come into force on the 1st day of July, 1989.
(3) The provisions of 2[rule 9,] sub-rule (3) of rule 16, sub-rule (4) of rule 96, 3[* * *] sub-
rule (3) of rule 105, rule 113, sub-rules (2), (3), (4) or (5) of rule 115, rules
118,122,124,1


# **Splitting Text into Manageable Chunks**

In [None]:
def split_text_into_chunks(text, chunk_size=512, overlap=0):
    """Split ``text`` into consecutive character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of trailing characters of each chunk that are repeated at the
            start of the next one. Must satisfy ``0 <= overlap < chunk_size``. The
            default of 0 produces disjoint chunks, exactly as before this parameter
            existed.

    Returns:
        A list of chunks in document order. Every chunk has ``chunk_size`` characters
        except possibly the last, which holds the remainder. Empty ``text`` gives ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    pieces = []
    for start in range(0, len(text), step):
        pieces.append(text[start:start + chunk_size])
        # Once a window reaches the end of the text, any further window would only
        # repeat characters already covered by this one.
        if start + chunk_size >= len(text):
            break
    return pieces

# Chunk the whole document with the default chunk size; list positions in `chunks`
# are what the retrieval step later uses to map FAISS results back to text.
chunks = split_text_into_chunks(document_text)

# **Initialising Embedding Model and Generating Embeddings**

In [None]:
# Sentence-embedding model; the same instance is reused later to embed user queries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every chunk in one call. convert_to_tensor=False requests a non-tensor result;
# the indexing cell reads `.shape[1]` from it and hands it directly to FAISS.
embeddings = embedding_model.encode(chunks, convert_to_tensor=False)

# **Creating and Populating the FAISS Index.**

In [None]:
# GPU resource handle required by index_cpu_to_gpu below.
res=faiss.StandardGpuResources()
# Embedding width, taken from the second axis of the embeddings array.
dimension = embeddings.shape[1]
# Flat L2 index, created on the CPU side first.
index_flat = faiss.IndexFlatL2(dimension)
# Copy the index to the GPU (second argument 0 is passed as the device).
gpu_index = faiss.index_cpu_to_gpu(res, 0, index_flat)
# Insert all chunk embeddings; retrieval later uses the returned ids as positions in `chunks`.
gpu_index.add(embeddings)

print(f"FAISS index created with {gpu_index.ntotal} vectors.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index created with 731 vectors.


# **Loading and Quantizing the Model.**


*   We've used 'Llama-3.1-8B-Instruct' model for generating responses.
*   Used BitsAndBytesConfig for quantizing the model.
* Loading in 8-bit consumes **9GB of GPU memory**, and the size of the **loaded model is 8GB.**
* Meanwhile, loading in 16-bit consumes **7GB of GPU memory**, and the size of the **loaded model is 5.8GB (~6GB).**

* So, load the model based on your requirements.



In [None]:
# Quantization configuration.
# BitsAndBytesConfig has no `load_in_16bit` option: transformers reports it as an
# unused kwarg and drops it. The supported switches are `load_in_8bit` and
# `load_in_4bit`, so request 4-bit explicitly (use `load_in_8bit=True` instead if you
# prefer 8-bit weights and have the GPU memory for it).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # matches the torch_dtype used for the non-quantized parts
)

# Define the model name
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load the model with quantization, placing all weights on the first CUDA device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    device_map="cuda:0",
)

# Load the tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)




Unused kwargs: ['load_in_16bit']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

# **Initializing the Pipeline for Text Generation.**

* Adjust the temperature according to how much creativity the answers need: values above 0.5 give more creative answers, while values below 0.5 give more conservative ones.

In [None]:
# Text-generation pipeline wrapping the already-loaded model and tokenizer.
# `llm(prompt)` is what generate_answer calls to produce a response.
llm = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,          # sample instead of greedy decoding, so temperature takes effect
    temperature=0.5,         # tune for more / less creative answers
    repetition_penalty=1.2,  # values above 1 discourage repeated text
    max_new_tokens=300,      # upper bound on the length of the generated continuation
)

Device set to use cuda:0


# **Defining the Prompt Template for the Language model.**

In [None]:
# Prompt sent to the LLM. It is filled via str.format with two fields:
#   {context}  - the retrieved chunks joined into one string
#   {question} - the user's query
# The trailing "Answer:" marker matters: generate_answer splits the generated text on it.
PROMPT_TEMPLATE = """
You are a highly intelligent assistant tasked with answering only the specific question asked, based strictly on the provided context. Follow these rules without exception:

1. Answer the question provided accurately based only on the given context. Do not include or generate any additional questions or commentary.
2. Do not provide any extra details, comments, or explanations beyond what is strictly required to answer the question.
3. If the answer is not found in the context, respond only with: "No answer found."
4. Avoid generating new questions or assumptions. Respond only to the question explicitly stated.
5. Strictly return answer to the given question from context no need to return context and question simply return correct answer in well structured paragraph.
Context:{context}

Question:{question}


Answer:

"""

# **Retrieving relevant chunks and generating the answer.**

In [None]:
# Function to retrieve relevant chunks
def retrieve_relevant_chunks(query, k=5):
    """Return the text chunks whose embeddings are closest to ``query``.

    Args:
        query: The user's question as a string.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` entries of ``chunks``, in the order the index returned them.
        Fewer than ``k`` are returned when the index holds fewer vectors.
    """
    query_embedding = embedding_model.encode([query], convert_to_tensor=False)
    distances, indices = gpu_index.search(query_embedding, k)
    # FAISS pads the result with id -1 when it finds fewer than k neighbours; without
    # this filter, chunks[-1] would silently return the last chunk of the document.
    return [chunks[i] for i in indices[0] if i >= 0]

# Initialize a cache to store previously answered questions and their answers
cache = {}
def generate_answer(query):
    """Answer ``query`` from the indexed document, memoising results per query string.

    Args:
        query: The user's question. It is also the cache key, so only an exact
            repeat of the same string hits the cache.

    Returns:
        The model's answer as a stripped string.
    """
    # Check if the query is already in the cache
    if query in cache:
        print("Answer retrieved from cache:")
        return cache[query]

    # If not in cache, generate a new answer
    print("Processing query for the first time:")
    relevant_chunks = retrieve_relevant_chunks(query)
    context = " ".join(relevant_chunks)

    # Prepare input for the model
    final_prompt = PROMPT_TEMPLATE.format(question=query, context=context)
    response = llm(final_prompt)
    generated_text = response[0]["generated_text"]

    if generated_text.startswith(final_prompt):
        # The output echoes the prompt followed by the continuation. Cutting off the
        # prompt keeps the whole continuation, even when the model itself writes
        # "Answer:" somewhere inside it (splitting on the last marker would drop
        # everything before that point).
        answer = generated_text[len(final_prompt):]
    else:
        # Prompt was not echoed verbatim; fall back to the text after the last marker.
        answer = generated_text.split("Answer:")[-1]
    answer = answer.strip()

    # Store the answer in the cache
    cache[query] = answer
    return answer


# **Testing Answer Generation.**

**Question:** What is the purpose of medical certificate?

**Answer:** The medical certificate serves multiple purposes including being used as part of applications for learner’s license or driver’s licenses, especially when applying for renewals or adding classes of motor vehicles to existing licenses. Additionally, it provides proof of physical fitness for applicants seeking licenses to operate various types of vehicles, depending on their requirements. The document also needs to accompany certain applications for licenses to drive transport vehicles. Its primary function is to ensure that individuals meet minimum health standards before obtaining permits to operate different kinds of vehicles. Furthermore, this certification process helps maintain public safety through verification processes involving signatures, seals, photographs, and declarations about personal health conditions. Overall, the main objective behind issuing medical certificates is ensuring roadworthiness and compliance with regulations regarding vehicular operations.


In [None]:
query="What is the purpose of medical certificate?"
answer = generate_answer(query)
print(answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing query for the first time:
The medical certificate serves multiple purposes including being used as part of applications for learner’s license or driver’s licenses, especially when applying for renewals or adding classes of motor vehicles to existing licenses. Additionally, it provides proof of physical fitness for applicants seeking licenses to operate various types of vehicles, depending on their requirements. The document also needs to accompany certain applications for licenses to drive transport vehicles. Its primary function is to ensure that individuals meet minimum health standards before obtaining permits to operate different kinds of vehicles. Furthermore, this certification process helps maintain public safety through verification processes involving signatures, seals, photographs, and declarations about personal health conditions. Overall, the main objective behind issuing medical certificates is ensuring roadworthiness and compliance with regulations regarding ve

# **Initialize Flask App and Expose with Ngrok.**

In [None]:
from pyngrok import ngrok
from flask import Flask, request, jsonify
from flask_cors import CORS

# Read the ngrok auth token from the NGROK_AUTHTOKEN environment variable, falling
# back to an interactive prompt, so the secret is never stored in the notebook itself.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)



In [None]:
# Initialize the Flask app
app = Flask(__name__)
# Enable CORS for every route (r"/*") and every origin ("*"), so a frontend served
# from any host can call the API. NOTE(review): narrow `origins` before exposing
# this beyond a demo.
CORS(app, resources={r"/*": {"origins": "*"}})
@app.route('/chat', methods=['POST'])
def chat():
    """Answer a chat message posted by the frontend.

    Expects a JSON body of the form ``{"message": "<question>"}``.

    Returns:
        ``{"response": <answer>}`` on success,
        ``{"error": ...}`` with status 400 when the body is not a JSON object with a
        non-empty ``message`` string, and
        ``{"error": ...}`` with status 500 when answer generation raises.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON body,
    # so bad requests get the explicit 400 below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    query = data.get("message", "")
    if not isinstance(query, str) or not query.strip():
        return jsonify({"error": "Request body must be JSON with a non-empty 'message' string."}), 400

    try:
        # generate_answer already consults the cache; the previously referenced
        # generate_answer_with_cache is not defined anywhere in this notebook, so
        # every request ended in a NameError and an HTTP 500.
        answer = generate_answer(query)
        print(f"Generated answer: {answer}")
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    return jsonify({"response": answer})
if __name__ == '__main__':
    # Expose the Flask app using ngrok: open a tunnel to local port 5000 first so the
    # public URL is printed before app.run() starts blocking.
    public_url = ngrok.connect(5000)
    print("Public URL:", public_url)
    # Must be the same port the tunnel above points at.
    app.run(port=5000)

Public URL: NgrokTunnel: "https://f710-34-82-77-188.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
