In [1]:
import os
from langchain.document_loaders import TextLoader, PyPDFLoader, CSVLoader
import pandas as pd
import pathlib
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from dotenv import load_dotenv


In [1]:
def create_vector_db(file_path, db_name="local_vector_db"):
    """
    Create a vector database from a text file
    """
    # Load the document
    loader = TextLoader(file_path)
    print("Loaded document")
    # Determine file type and use appropriate loader
    file_extension = pathlib.Path(file_path).suffix.lower()
    
    if file_extension == '.pdf':
        loader = PyPDFLoader(file_path)
        documents = loader.load()
    elif file_extension == '.txt':
        loader = TextLoader(file_path)
        documents = loader.load()
    elif file_extension in ['.xlsx', '.csv']:
        # For Excel files, convert to CSV first
        if file_extension == '.xlsx':
            df = pd.read_excel(file_path)
            # Create temporary CSV file
            temp_csv = 'temp.csv'
            df.to_csv(temp_csv, index=False)
            file_path = temp_csv
        
        loader = CSVLoader(
            file_path,
            csv_args={
                'delimiter': ',',
                'quotechar': '"',
                'fieldnames': None
            }
        )
        documents = loader.load()
        print("Loader.loaded")
        # Clean up temporary CSV if created
        if file_extension == '.xlsx' and os.path.exists('temp.csv'):
            os.remove('temp.csv')
            print("removed temp csv. ")
    
    documents = loader.load()
    
    # Split text into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    chunks = text_splitter.split_documents(documents)
    print("chunked")
    
    # Create embeddings (using HuggingFace embeddings for local processing)
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    print("Embedded")
    
    # Create and save the vector store
    vector_store = FAISS.from_documents(chunks, embeddings)
    vector_store.save_local(db_name)
    print("saved vector store")
    
    return vector_store

In [3]:

def load_vector_db(db_name="local_vector_db"):
    """
    Load an existing vector database
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    print("Loaded vectordb.")
    vector_store = FAISS.load_local(db_name, embeddings)
    return vector_store

In [4]:
def query_document(query, vector_store):
    """
    Query the vector store and get response from LLM
    """
    # Load environment variables
    load_dotenv()
    openai_api_key = os.getenv('OPENAI_API_KEY')
    # Initialize OpenAI LLM
    print("sending to llm")
    llm = OpenAI(temperature=0, api_key=openai_api_key)
    
    # Create a retrieval chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3})
    )
    
    print("receiving from llm")
    # Get response
    response = qa_chain.invoke(query)

    # response = qa_chain.run(query)
    return response

In [17]:
pwd

'y:\\WorkLab\\Projects\\chat-llm-db'

In [5]:
def main():
    print("Supported file formats: .pdf, .txt, .xlsx, .csv")
    file_path = input("Enter the path to your document: ")


    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' not found!")
        return
    try:
        print("Loading vector db")
        vector_store = load_vector_db()
        print("Loaded existing vector database")
    except Exception as e:
        print("fCreating new vector database... {e}")
        vector_store = create_vector_db(file_path)
        print("Vector database created successfully!")

    while True:
        query = input("\nEnter your question (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break

        response = query_document(query, vector_store)
        print("\nQuery>", response['query'])
        print("\n\t Response> ", response['result'])

if __name__ == "__main__":
    main()

Supported file formats: .pdf, .txt, .xlsx, .csv
Loading vector db


  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


Loaded vectordb.
fCreating new vector database... {e}
Loaded document
chunked
Embedded
saved vector store
Vector database created successfully!
sending to llm


  llm = OpenAI(temperature=0, api_key=openai_api_key)


receiving from llm

Query> what is this document about?

	 Response>   This document is about a project to develop a conversational AI assistant that can answer user questions based on a provided knowledge base.
sending to llm
receiving from llm

Query> tell me about it

	 Response>   The project is focused on creating a working prototype using pre-trained LLMs like GPT-4o or LLaMA. It will have a document upload and management feature, as well as a similarity-based search function to retrieve relevant content. The AI will provide a basic web chat interface for simple queries and will display a fallback message when the knowledge base lacks information. The project will be coded in Python with Flask as the web framework and SQLite for data storage. It will only support text inputs and English as the language for responses.
sending to llm
receiving from llm

Query> what are the requirements for it?

	 Response>   The requirements for the system include performance, reliability, usabilit