Install needed libraries

In [46]:
!pip install langchain-community faiss-cpu
!pip install sentence-transformers
!pip install streamlit
!pip install langchain_huggingface



Import the needed modules

In [47]:
from langchain_huggingface import HuggingFaceEmbeddings
import re
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from uuid import uuid4
from langchain_core.documents import Document
import streamlit as st


In [48]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA


Load the embedding model

In [49]:
# Identifier of the sentence-embedding model handed to HuggingFaceEmbeddings.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Embedding function shared by the notebook: it is passed to the FAISS store
# when the store is created and again when the saved store is reloaded.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

Create the FAISS vector store

In [50]:
# Embed a throwaway query once to discover the vector size the model
# produces; the flat L2 index must be created with that dimensionality.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Empty store: no documents yet, so the docstore and the
# index-position -> docstore-id mapping both start out empty.
empty_docstore = InMemoryDocstore()
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=empty_docstore,
    index_to_docstore_id={},
)

Opens the text file, splits it into sentences, and groups the sentences into chunks of up to 50 words

In [51]:
# Word budget per chunk. A single sentence longer than this still becomes a
# chunk of its own; sentences are never cut in the middle.
MAX_WORDS_PER_CHUNK = 50

with open("blogs/blog 2.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Split on a period that is followed by whitespace or by the end of the text,
# so periods inside tokens such as "3.5" do not end a sentence. The split
# consumes the terminating period, so it is appended again. Fragments that are
# empty or whitespace-only are dropped instead of becoming a bare "." sentence.
sentences = [
    fragment.strip() + '.'
    for fragment in re.split(r'\.(?:\s+|$)', text)
    if fragment.strip()
]

# Greedily pack consecutive sentences into chunks of at most
# MAX_WORDS_PER_CHUNK words.
chunks = []
current_chunk = []
current_word_count = 0
for sentence in sentences:
    word_count = len(sentence.split())
    # Commit the running chunk only if it actually holds something; this
    # avoids emitting an empty chunk when the very first sentence (or any
    # sentence right after a commit) is longer than the budget by itself.
    if current_chunk and current_word_count + word_count > MAX_WORDS_PER_CHUNK:
        chunks.append(' '.join(current_chunk))
        current_chunk = []
        current_word_count = 0
    current_chunk.append(sentence)
    current_word_count += word_count

# Flush whatever is left after the last sentence.
if current_chunk:
    chunks.append(' '.join(current_chunk))

Displays the resulting chunks for inspection

In [52]:
# Dump every chunk under a 1-based numbered header so the split can be
# checked by eye.
for number, piece in enumerate(chunks, start=1):
    print(f"--- Chunk {number} ---\n{piece}\n")

--- Chunk 1 ---
Cars have long been woven into the fabric of everyday life. From their humble beginnings in the 19th century to their high-tech incarnations today, they have revolutionized transportation, reshaped cities, and transformed global economies.

--- Chunk 2 ---
What began as a novelty for the wealthy has evolved into an essential part of daily existence for billions around the world. Cars are no longer just machines; they are symbols of freedom, personal style, and technological progress. The story of the car begins with experimentation.

--- Chunk 3 ---
In the late 1800s, inventors like Karl Benz and Gottlieb Daimler crafted the first practical gasoline-powered vehicles. Karl Benz’s 1886 Patent-Motorwagen is often credited as the birth of the modern automobile.

--- Chunk 4 ---
These early machines were rudimentary, noisy, and difficult to operate, but they laid the foundation for a seismic shift in human mobility. As technology improved, so did public interest and accessib

Converts the list of chunks into LangChain Document objects

In [53]:
# Wrap each text chunk in a Document, keeping the original chunk order.
documents = [Document(page_content=chunk_text) for chunk_text in chunks]


Adds the documents to the vector store

In [54]:
# One freshly generated UUID string per document, used as its id in the store.
# NOTE(review): re-running this cell generates new ids, so it presumably adds
# the same chunks to the store a second time; confirm before re-executing.
uuids = [str(uuid4()) for _ in documents]

# Last expression of the cell, so the returned list of ids is displayed.
vector_store.add_documents(documents=documents, ids=uuids)

['4c6e1f07-1fbe-4f65-b1a1-db2b5e779749',
 'ab746b08-aa2c-41b2-8c05-cd4017c0a16e',
 'bedbb68c-9403-4b09-8ac6-31a17dde0be6',
 'ccd79968-d777-4c0f-b1a8-f3c82f3d4007',
 '3b78b6aa-8c86-443b-884a-06edaa1f682f',
 'f98ee3f5-321c-41ca-bd77-85fd9f8ed23c',
 'f5d0d563-146a-413d-a624-2abf037edd48',
 '1236b15a-6976-4dbf-971e-afec625c7c76',
 '8d407e57-d51e-4624-8e73-78aa94908f42',
 '279f4ec8-40d9-4d1d-a5a8-f1dfb243f58f',
 'fbcc6daf-dc75-4404-b504-0dbfacff4cc6',
 '7364b9fe-58f7-4c54-bf02-14fa4e3e1378',
 '21a69293-2283-4fe6-bfd8-6dd0b656a3da',
 '3dd59609-513d-4eb2-afc0-aa5890b31e3d',
 '4a50eb1e-6bd0-4c3b-9b9b-9c831b8c311e',
 '63b7d695-9a76-4700-a412-10786b59a3ea',
 'e058fbfa-6977-4953-a7ed-56e241aa5c92',
 'cc1015f7-3d75-4b9f-9651-9e9401fb2c12',
 'f446d65a-5033-4756-95d1-2324d5533422',
 '199f462e-da7a-4d95-a647-1d4ceb1764c6',
 '081dca60-b8fd-4002-bd32-9bf7c4d33731',
 '1db19ab7-de26-4d2f-910d-1012a3e32bde',
 'bbf9b6d4-1007-425c-a1ae-2f4319052bd0']

Saves the vector store

In [55]:
# Persist the populated vector store under "faiss_index_dir" so it can be
# reloaded without re-embedding the chunks.
vector_store.save_local("faiss_index_dir")

Loads the vector store

In [56]:
# Reload the store from "faiss_index_dir", pairing it with the same embedding
# function that was used to build it.
# NOTE(security): allow_dangerous_deserialization=True opts in to unpickling
# the stored docstore, and unpickling can execute arbitrary code. That is only
# acceptable here because this notebook wrote "faiss_index_dir" itself; never
# enable it for an index obtained from an untrusted source.
loaded_vector_store = FAISS.load_local(
    "faiss_index_dir",
    embeddings,  allow_dangerous_deserialization=True
)

In [57]:
# Retriever over the reloaded store. "mmr" selects maximal-marginal-relevance
# search, and k=2 asks for two documents per query.
retriever = loaded_vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 2})

In [58]:
# Prompt text with {context} and {question} placeholders.
# NOTE(review): `template` is assigned again in the model-setup cell further
# down (that version has an extra blank line before "Context:"), and the later
# value is the one passed to PromptTemplate, so this assignment is overridden.
template="""You are a helpful assistant. Use the context below to answer the user's question.
Context:
{context}

Question:
{question}

Answer:"""

In [63]:
# Load the causal language model used to generate answers, plus its tokenizer.
model_version = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_version)
model = AutoModelForCausalLM.from_pretrained(model_version)

# Create a Hugging Face pipeline for text generation
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    # `temperature` is only honoured in sampling mode, so enable sampling
    # explicitly instead of relying on the model's default generation config.
    do_sample=True,
    temperature=0.7,
    # GPT-2 has no dedicated padding token; reuse end-of-sequence for padding
    # so generation does not have to fall back to it with a warning.
    pad_token_id=tokenizer.eos_token_id,
)
# Wrap the pipeline for LangChain compatibility
llm = HuggingFacePipeline(pipeline=pipe)
# Define the Prompt Template. The trailing "Answer:" marker is what
# get_answer() later splits on to isolate the model's answer.
template = """You are a helpful assistant. Use the context below to answer the user's question.

Context:
{context}

Question:
{question}

Answer:"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])
# Define the RAG Chain: retrieved documents fill {context}, the user's query
# fills {question}; the retrieved source documents are returned with the answer.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

Device set to use cpu


Function to get answers

In [64]:
def get_answer(question):
    """Run the RAG chain on a question and return only the answer text.

    Args:
        question: The user's question, passed to the chain as its "query".

    Returns:
        The text that follows the first "Answer:" marker in the chain's
        "result" (up to the next marker, if the model emitted another one),
        stripped of surrounding whitespace. If the marker is absent, for
        example because the prompt was not echoed back, the whole stripped
        result is returned instead of raising IndexError.
    """
    # .invoke() is the supported entry point; calling the chain object
    # directly goes through the deprecated Chain.__call__ path.
    result = rag_chain.invoke({"query": question})
    generated_text = result["result"]

    parts = generated_text.split("Answer:")
    if len(parts) < 2:
        # No marker in the output: fall back to the full generated text.
        return generated_text.strip()
    return parts[1].strip()

Streamlit chat interface

In [65]:
# Streamlit front end for the chatbot.
# NOTE(review): Streamlit apps are normally launched with `streamlit run` on a
# script; executed as a notebook cell this will presumably not render a page.
st.set_page_config(page_title="Knowledge Chatbot")

st.title("📚 Your Personal Knowledge Chatbot")

# Session state survives Streamlit's script reruns; plain variables do not.
if "history" not in st.session_state:
    st.session_state.history = []
if "last_query" not in st.session_state:
    # Most recently answered question, used to detect reruns of the same input.
    st.session_state.last_query = None


query = st.text_input("Ask something from your content:")
question = query.strip()
# Streamlit re-executes the script on every rerun, with the text box still
# holding the previous question. Answer only when the question differs from
# the last one answered, so a rerun neither re-runs the model nor appends a
# duplicate entry to the history.
if question and question != st.session_state.last_query:
    response = get_answer(query)
    st.session_state.history.append((query, response))
    st.session_state.last_query = question

# Show the conversation newest-first.
for q, r in reversed(st.session_state.history):
    st.markdown(f"**You:** {q}")
    st.markdown(f"**Bot:** {r}")

