In [None]:
import os
import logging
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import json
from pathlib import Path
import streamlit as st
from langchain_openai import ChatOpenAI

In [None]:
# Root logger at DEBUG so messages from this notebook and its libraries are shown
logging.basicConfig(level=logging.DEBUG)

# Pull settings from a local .env file into the process environment
load_dotenv()

# The OpenAI API key is required by every later cell; fail fast if it is
# absent or empty rather than failing later inside an API call
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("API key is missing. Please add it to the .env file.")

In [None]:
# Chat model used to phrase the final answers (gpt-4o, temperature 0.3)
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.3,
    openai_api_key=api_key,
)

In [None]:
# Load one file into LangChain documents, choosing the loader from its extension
def get_loader(file_path):
    """Load ``file_path`` and return the documents it contains.

    Despite its name, this function returns the *loaded documents*, not the
    loader object.

    Supported extensions (matched case-insensitively):
      * ``.pdf``          -> ``PyPDFLoader``
      * ``.docx``         -> ``Docx2txtLoader``
      * ``.txt`` / ``.md`` -> ``TextLoader``
      * ``.json``         -> ``TextLoader`` (the raw JSON text is indexed)

    Args:
        file_path: Path of the file to load (``str`` or ``os.PathLike``).

    Returns:
        The list produced by the selected loader's ``load()`` method.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    path = os.fspath(file_path)
    lowered = path.lower()  # so "REPORT.PDF" is treated like "report.pdf"

    if lowered.endswith(".pdf"):
        loader = PyPDFLoader(path)
    elif lowered.endswith(".docx"):
        loader = Docx2txtLoader(path)
    elif lowered.endswith((".txt", ".md")):
        loader = TextLoader(path)
    elif lowered.endswith(".json"):
        # The previous code called the ``json`` module itself, which raises
        # TypeError. JSON is text, so read it with TextLoader instead; JSON
        # interchange is UTF-8, so that encoding is requested explicitly.
        loader = TextLoader(path, encoding="utf-8")
    else:
        raise ValueError(f"Document type {file_path} is not supported.")

    documents = loader.load()
    print(f"Loaded {len(documents)} documents from {file_path}")  # Log the number of documents loaded
    return documents

In [None]:
def process_documents(file_paths, chunk_size=7000, chunk_overlap=3000):
    """Load, split and embed documents into a Chroma vector store.

    Args:
        file_paths: Iterable of file paths; each is loaded with ``get_loader``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to the previously hard-coded 7000.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            the previously hard-coded 3000.

    Returns:
        The vector store returned by ``Chroma.from_documents`` for the chunks.

    Raises:
        ValueError: If no documents were loaded, if splitting produced no
            chunks, or if ``get_loader`` rejects one of the files.
    """
    docs = []
    for file_path in file_paths:
        docs.extend(get_loader(file_path))  # Load documents based on file type

    if not docs:
        raise ValueError("No valid documents loaded. Please check the files in the docs folder.")

    print(f"Loaded {len(docs)} documents in total.")  # Log total number of documents loaded

    # Split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    document_chunks = text_splitter.split_documents(docs)
    print(f"Split documents into {len(document_chunks)} chunks.")  # Log number of document chunks

    if not document_chunks:
        raise ValueError("No valid document chunks created. Please check your documents.")

    # Build the embeddings client only once there is something to embed
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)

    # Create the vector database (Chroma)
    return Chroma.from_documents(document_chunks, embedding=embeddings)

In [None]:
# List every regular file directly inside the documents folder
def get_all_document_files(doc_folder="docs"):
    """Return the paths of all regular files directly inside ``doc_folder``.

    Sub-directories are skipped (the listing is not recursive). The result is
    sorted so that the ingestion order is the same on every run; the order
    returned by ``os.listdir`` is arbitrary.

    Args:
        doc_folder: Folder to scan. Defaults to ``"docs"``.

    Returns:
        A sorted list of ``os.path.join(doc_folder, name)`` strings.

    Raises:
        FileNotFoundError: Propagated from ``os.listdir`` when ``doc_folder``
            does not exist.
    """
    candidates = (os.path.join(doc_folder, name) for name in os.listdir(doc_folder))
    return sorted(path for path in candidates if os.path.isfile(path))

In [None]:
# Retrieve relevant chunks from the vector database and let the LLM answer from them
def get_answer_from_query(vector_db, query, k=6):
    """Answer ``query`` using the chunks most similar to it in ``vector_db``.

    Args:
        vector_db: Vector store exposing ``as_retriever`` (e.g. the Chroma
            store returned by ``process_documents``).
        query: The user's question.
        k: Number of chunks to retrieve. Defaults to 6.

    Returns:
        The text content of the model's reply.
    """
    # Retrieve the k most relevant chunks for the query
    retriever = vector_db.as_retriever(search_kwargs={"k": k})
    relevant_documents = retriever.invoke(query)

    # Send only the chunk text to the model; interpolating the list itself
    # would embed the Python repr of every Document object in the prompt.
    context = "\n\n".join(doc.page_content for doc in relevant_documents)

    prompt = f"""
    You are an assistant for ISAT University. The user will ask questions related to the university.
    Your task is to answer their exact question using the information provided by the following documents.

    The documents provided are:
    {context}

    User's Question: {query}

    Based on the documents above, provide a clear and concise answer to the user's question.
    """

    # ``invoke`` returns a message object; its ``content`` holds the reply text
    response = llm.invoke(prompt)
    return response.content

In [None]:
def chat():
    """Render the chat box, answer the current question and show the history.

    The conversation is kept in ``st.session_state.messages`` as a list of
    ``{"role": ..., "content": ...}`` dicts. The vector store built from the
    ``docs`` folder is kept in ``st.session_state.vector_db`` so the documents
    are embedded once per session instead of once per question; files added
    to the folder afterwards are picked up in a new session.
    """
    # Make sure the history exists before anything appends to or reads it
    if "messages" not in st.session_state:
        st.session_state.messages = []

    user_input = st.text_input("You: ", key="input")
    if user_input:
        # Add the user input to the chat history
        st.session_state.messages.append({"role": "user", "content": user_input})

        # Everything that can fail (listing the folder, loading, embedding,
        # querying) runs inside the try so the user sees an error message
        # in the chat instead of an unhandled exception.
        try:
            if "vector_db" not in st.session_state:
                doc_files = get_all_document_files("docs")
                st.session_state.vector_db = process_documents(doc_files)
            answer = get_answer_from_query(st.session_state.vector_db, user_input)
            st.session_state.messages.append({"role": "assistant", "content": answer})
        except Exception as e:
            # Keep the traceback in the logs; show only the message to the user
            logging.exception("Failed to answer the user's question")
            st.session_state.messages.append({"role": "assistant", "content": f"Error: {e}"})

    # Display the chat history
    for message in st.session_state.messages:
        speaker = "You" if message["role"] == "user" else "Assistant"
        st.markdown(f"**{speaker}:** {message['content']}")