## Document Ingestion

In [1]:
import os
import warnings
from dotenv import load_dotenv

# Needed when Chroma and FAISS are used in the same environment.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Suppress every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Load variables from a local .env file into the process environment.
load_dotenv()

True

In [2]:
# Confirm LANGCHAIN_PROJECT is set (presumably loaded from .env by load_dotenv() above — confirm);
# a missing variable raises KeyError here.
os.environ['LANGCHAIN_PROJECT']

'chatmypdf'

In [3]:
from docling.document_converter import DocumentConverter
from pathlib import Path
# Location of the PDF to ingest. Set SMART_IOT_PDF_PATH (for example in the .env file that
# load_dotenv() reads in the first cell) to point at a different copy. The default is the
# original hard-coded path, so existing runs behave exactly as before.
source = Path(
    os.environ.get(
        "SMART_IOT_PDF_PATH",
        "/home/tadmnit/AI_Team/Anjit/AI_chat_bot_demo_FAQ/Smart_Business_IoT_Pdf.pdf",
    )
)

In [4]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend

In [5]:
# Build a Docling converter that accepts only PDF input and processes it with the
# standard PDF pipeline on top of the pypdfium2 backend.
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],  # Or other formats you need
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend
        )
    }
)

In [6]:
# Run the conversion on the PDF at `source`; the result's `.document` is exported to Markdown below.
conv_result = converter.convert(source)

In [7]:
from langchain_core.documents import Document as LCDocument
# Fail fast when the conversion produced nothing. Merely printing a message would leave
# `lc_docs` unbound (or stale from an earlier run) and break the cells that follow.
if not (conv_result and conv_result.document):
    raise RuntimeError("Conversion failed or produced no document")

# Wrap the whole converted PDF, exported as Markdown, in a single LangChain Document.
lc_docs = LCDocument(page_content=conv_result.document.export_to_markdown())
print("Successfully created LCDocument")

# `lc_docs` is now ready for splitting and embedding.
print(lc_docs)

Successfully created LCDocument
page_content='## Introduction

The Smart Business IoT application allows the user to view and manage the data of various sensors installed on different sites. The user can monitor data focusing on light intensity, temperature, humidity, and carbon dioxide levels through installed devices. The application connects multiple sites and acts as a single data point. The user can add events, set event rules, generate reports based on the collected data, and manage administration through the application.

## Login

The login page enables the user to access the Smart Business IoT application using the credentials. Once logged in, the user can navigate the application and explore the features.

Perform the following steps to log in to Smart Business IoT:

1. Launch any web browser and enter the application's URL to log in. The login page appears.
2. Enter the Email address and Password in the respective fields.
3. Click Login .

<!-- image -->

<!-- image -->

## 

In [8]:
# Show the Document repr (metadata and page_content) to inspect the converted Markdown.
lc_docs

Document(metadata={}, page_content='## Introduction\n\nThe Smart Business IoT application allows the user to view and manage the data of various sensors installed on different sites. The user can monitor data focusing on light intensity, temperature, humidity, and carbon dioxide levels through installed devices. The application connects multiple sites and acts as a single data point. The user can add events, set event rules, generate reports based on the collected data, and manage administration through the application.\n\n## Login\n\nThe login page enables the user to access the Smart Business IoT application using the credentials. Once logged in, the user can navigate the application and explore the features.\n\nPerform the following steps to log in to Smart Business IoT:\n\n1. Launch any web browser and enter the application\'s URL to log in. The login page appears.\n2. Enter the Email address and Password in the respective fields.\n3. Click Login .\n\n<!-- image -->\n\n<!-- image -

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Split the single Markdown document into overlapping chunks for embedding.
# chunk_size / chunk_overlap are presumably measured in characters (the splitter's
# default length function) — confirm if token-based sizing is intended.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# split_documents takes a list of Documents, so the lone Document is wrapped in one.
chunks = text_splitter.split_documents([lc_docs])

In [10]:
# Inspect the chunk Documents.
# NOTE(review): this displays every chunk; a slice such as chunks[:3] would keep the output short.
chunks

[Document(metadata={}, page_content="## Introduction\n\nThe Smart Business IoT application allows the user to view and manage the data of various sensors installed on different sites. The user can monitor data focusing on light intensity, temperature, humidity, and carbon dioxide levels through installed devices. The application connects multiple sites and acts as a single data point. The user can add events, set event rules, generate reports based on the collected data, and manage administration through the application.\n\n## Login\n\nThe login page enables the user to access the Smart Business IoT application using the credentials. Once logged in, the user can navigate the application and explore the features.\n\nPerform the following steps to log in to Smart Business IoT:\n\n1. Launch any web browser and enter the application's URL to log in. The login page appears.\n2. Enter the Email address and Password in the respective fields.\n3. Click Login .\n\n<!-- image -->\n\n<!-- image -

In [None]:
# Count the number of tokens (disabled: requires the `tiktoken` package, which is not installed in this environment)
# import tiktoken
# encoding = tiktoken.encoding_for_model("gpt-4o-mini")


ModuleNotFoundError: No module named 'tiktoken'

### Document Vector Embedding

In [11]:
from langchain_ollama import OllamaEmbeddings

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [12]:
# Embedding model served by the local Ollama instance.
embeddings = OllamaEmbeddings(model='nomic-embed-text', base_url="http://localhost:11434")

# Embed one throwaway sentence; the vector's length is used below to size the FAISS index.
single_vector = embeddings.embed_query("this is some text data my name is thinkpalm")

In [13]:
# Embedding dimensionality (length of one embedded vector).
len(single_vector)

768

In [20]:
# Create an empty FAISS flat L2 index whose dimension matches the embedding vector length.
index = faiss.IndexFlatL2(len(single_vector))
# Display (number of stored vectors, vector dimension) as a quick sanity check.
index.ntotal, index.d


(0, 768)

In [21]:
# Wrap the empty FAISS index in a LangChain vector store backed by an in-memory docstore.
vector_store = FAISS(
    embedding_function=embeddings,  # embeddings object used by the store
    index=index,  # the FAISS index created above
    docstore=InMemoryDocstore(),  # starts with no documents
    index_to_docstore_id={}  # starts with no index → docstore-id entries
)

In [22]:

# Store the chunk Documents in the vector store; `ids` keeps the identifiers that
# add_documents returns so they can be counted afterwards.
ids = vector_store.add_documents(documents=chunks)

In [23]:
# Number of ids returned by add_documents, i.e. how many chunks were stored.
# Only a cell's last expression is displayed, so this is the cell's single statement.
len(ids)

61

In [27]:
# Persist the vector store to a local folder, then reload it to verify the round trip.
db_name = "smart_iot_vector_store"
vector_store.save_local(db_name)

# NOTE(security): allow_dangerous_deserialization=True opts in to loading pickled data from disk.
# That is acceptable here only because the files were written by this notebook just above;
# never point this at an index obtained from an untrusted source.
new_vector_store = FAISS.load_local(db_name, embeddings=embeddings, allow_dangerous_deserialization=True)
# Entry count of the reloaded store, for comparison with the number of ids added earlier.
len(new_vector_store.index_to_docstore_id)

61

### Retrieval

In [9]:
# Baseline retrieval: plain similarity search against the reloaded store.
question = "where can i see the status of the devices installed?"
docs = new_vector_store.search(query=question, search_type="similarity")
for doc in docs:
    # One write per hit: the chunk text followed by the same four newlines that the
    # separate content/spacer prints used to produce.
    print(doc.page_content, end="\n\n\n\n")

The Devices page displays the list of available devices with the following details:

- · Name: Displays the name of the device.
- · Status/Last Value: Displays the current value from the device.
- · Device Type: Displays the type of the device.
- · Battery: Displays the battery strength of the device.
- · Signal: Displays the signal strength of the device.
- · Location: Displays the site where the device is installed.
- · Last Seen: Displays the date and time when the device is last connected.
- · Action: Displays the actions available for the device. The actions are:
- · Edit

<!-- image -->

- · Delete

<!-- image -->

Click to view the graphical representation of the device history. Hover over the graph to see the device status at a specific date and time.

<!-- image -->

## Device Dashboard

The device dashboard provides detailed information about the device. Click on the device name to view the dashboard.

<!-- image -->

The device dashboard displays the following information:



In [10]:
# Expose the reloaded vector store as a retriever so it can be plugged into a chain.
# Uses MMR (maximal marginal relevance) search: fetch 100 candidates (fetch_k), return 3 (k).
# NOTE(review): per the LangChain docs lambda_mult=1 means minimum diversity, so this
# presumably ranks the same way as plain similarity search — confirm this is intended.
retriever = new_vector_store.as_retriever(search_type="mmr", search_kwargs = {'k': 3, 
                                                                          'fetch_k': 100,
                                                                          'lambda_mult': 1})


In [11]:
# Run the retriever on the same question and dump every returned chunk.
docs = retriever.invoke(question)

for doc in docs:
    # Chunk text followed by four newlines, matching a content print plus a "\n\n" spacer print.
    print(doc.page_content, end="\n\n\n\n")

The Devices page displays the list of available devices with the following details:

- · Name: Displays the name of the device.
- · Status/Last Value: Displays the current value from the device.
- · Device Type: Displays the type of the device.
- · Battery: Displays the battery strength of the device.
- · Signal: Displays the signal strength of the device.
- · Location: Displays the site where the device is installed.
- · Last Seen: Displays the date and time when the device is last connected.
- · Action: Displays the actions available for the device. The actions are:
- · Edit

<!-- image -->

- · Delete

<!-- image -->

Click to view the graphical representation of the device history. Hover over the graph to see the device status at a specific date and time.

<!-- image -->

## Device Dashboard

The device dashboard provides detailed information about the device. Click on the device name to view the dashboard.

<!-- image -->

The device dashboard displays the following information:



### Generation with Llama 3.2 1B on Ollama (alternative cells: Phi-4 on Ollama, Llama 3 70B on Groq)

In [12]:
from langchain import hub #TO PULL RAG PROMPTS
from langchain_core.output_parsers import StrOutputParser # GETTING FINAL OUT AS STRING
from langchain_core.runnables import RunnablePassthrough #parse question and context directly to LLM
from langchain_core.prompts import ChatPromptTemplate #to pass prompt with context (chunk of data)

from langchain_ollama import ChatOllama
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [27]:
# RAG prompt text. `{question}` and `{context}` are placeholders to be filled by the
# ChatPromptTemplate built from this string; everything else is sent to the model as written.
prompt = """
You are a chatbot assistant for Smart Business IoT, designed to provide precise and accurate answers strictly based on the provided context. 
You have access to ingested document related to Smart Business IoT, an application that allows the user to view and manage the data of
various sensors installed on different sites. The user can monitor data focusing on light
intensity, temperature, humidity, and carbon dioxide levels through installed devices.
The application connects multiple sites and acts as a single data point. The user can
add events, set event rules, generate reports based on the collected data, and manage
administration through the application.
### Guidelines:
1. Provide answers **strictly** based on the given context.  
- If the answer is **not available**, respond with: "I'm sorry, I don't have that information."
2. Ensure responses are **clear, concise, and directly relevant** to the question.
3. **Do not** answer questions outside the scope of the provided context.

Question: {question}  
Context: {context}  
Answer:
"""

In [42]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    contents = (entry.page_content for entry in docs)
    return "\n\n".join(contents)

# Chat model option: Llama 3.2 1B served by the local Ollama instance.
# Later cells rebind `model`; the chain uses whichever of these cells was run last.
model = ChatOllama(
    model="llama3.2:1b",
    base_url="http://localhost:11434",
    streaming=True,  # Enable streaming
    callbacks=[StreamingStdOutCallbackHandler()],  # Print tokens as they are generated
)



In [28]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    contents = (entry.page_content for entry in docs)
    return "\n\n".join(contents)

# Chat model option: Phi-4 served by an Ollama instance at a fixed network address.
# NOTE(review): the base_url is a hard-coded private address; consider reading it from
# the environment so the notebook runs on other machines.
model = ChatOllama(
    model="phi4:latest",
    base_url="http://192.168.0.49:2255",
    streaming=True,  # Enable streaming
    callbacks=[StreamingStdOutCallbackHandler()],  # Print tokens as they are generated
)


In [22]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
load_dotenv()
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values, in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    contents = (entry.page_content for entry in docs)
    return "\n\n".join(contents)

# Chat model option: Llama 3 70B through the Groq API. The key is read from the
# GROQ_API_KEY environment variable rather than being written into the notebook.
model = ChatGroq(
            groq_api_key=os.getenv("GROQ_API_KEY"),
            model="llama3-70b-8192",
            streaming=True,
            callbacks=[StreamingStdOutCallbackHandler()],  # Print tokens as they are generated
            # model="qwen-qwq-32b"  (alternative model id, currently disabled)
        )


In [29]:
# Convert the raw prompt text into a ChatPromptTemplate exactly once. After the first run
# `prompt` already holds the template, so the guard keeps this cell safe to re-run instead
# of passing a template object where a string is expected.
if isinstance(prompt, str):
    prompt = ChatPromptTemplate.from_template(prompt)

# RAG pipeline: the incoming question is sent to the retriever (whose hits are joined by
# format_docs into the "context" value) and passed through unchanged as "question"; the
# filled prompt goes to the chat model and the reply is reduced to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)


In [30]:
# Ask the same question used in the retrieval section, this time through the full RAG chain.
question = "where can i see the status of the devices installed?"
# `output` receives the chain's final string (StrOutputParser is the last step of rag_chain).
output = rag_chain.invoke(question)

KeyboardInterrupt: 