# 💰 TaxBot AI – FAISS Vectorstore Generator

This Kaggle notebook generates a FAISS vectorstore using LangChain and HuggingFace embeddings from uploaded PDF files (Indian Income Tax documents).

In [2]:
# ✅ Install required libraries with safe upgrade
!pip install --upgrade langchain langchain-core langchain-community faiss-cpu sentence-transformers

Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-core
  Downloading langchain_core-0.3.68-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith>=0.1.17 (from langchain)
  Downloading langsmith-0.4.4-py3-none-any.whl.metadata (15 kB)
Collecting packaging<25,>=23.2 (from langchain-core)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-

In [3]:
# ✅ Vectorstore Generation Script
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
import os

pdf_dir = "/kaggle/input/incometax"
all_docs = []

# 🔄 Load all PDFs
for file in os.listdir(pdf_dir):
    if file.endswith(".pdf"):
        print(f"🔍 Loading {file} ...")
        loader = PyPDFLoader(os.path.join(pdf_dir, file))
        all_docs.extend(loader.load())

# ✂️ Split into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(all_docs)
print(f"✅ Total chunks created: {len(chunks)}")

# 🧠 Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 🗂️ FAISS vectorstore creation
vectorstore = FAISS.from_documents(chunks, embedding_model)
vectorstore.save_local("vectorstore/income_tax_faiss")
print("✅ FAISS vectorstore saved at vectorstore/income_tax_faiss")

🔍 Loading income-tax-act-1961-as-amended-by-finance-act-2025 (1).pdf ...
🔍 Loading a1961-43.pdf ...
🔍 Loading Income Tax Rules1962(All Amendmnets of 2024) (1).pdf ...
✅ Total chunks created: 15108


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
2025-07-07 14:50:41.001861: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751899841.288966      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751899841.371557      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ FAISS vectorstore saved at vectorstore/income_tax_faiss


### 📦 Tip: Zip the FAISS folder to download and use in your Vercel project

In [4]:
import shutil
shutil.make_archive("income_tax_faiss", 'zip', "vectorstore/income_tax_faiss")
print("📦 Zipped vectorstore ready to download!")

📦 Zipped vectorstore ready to download!
