In [5]:

import os
import glob
from dotenv import load_dotenv
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# imports for langchain, plotly and Chroma
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
#import matplotlib.pyplot as plt
#from sklearn.manifold import TSNE
#import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

In [7]:
# Name passed as persist_directory when the Chroma vector store is built
db_name = "vector_db"

# Chat model: price is a factor for our company, so we use a low cost model
MODEL = "gpt-4o-mini"

In [8]:
# Load environment variables from a file called .env
load_dotenv()

# Fallback written into the environment only when no key is configured.
# It is a placeholder, not a working credential.
PLACEHOLDER_API_KEY = 'your-key-if-not-using-env'
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', PLACEHOLDER_API_KEY)

# Report a missing key here instead of letting it surface as an
# authentication failure in a later cell.
if os.environ['OPENAI_API_KEY'] in ('', PLACEHOLDER_API_KEY):
    print("WARNING: OPENAI_API_KEY is not set - add it to your .env file before running the cells below.")

In [None]:
def add_metadata(doc, doc_type):
    """Tag a document with its type and hand the same object back.

    Args:
        doc: object exposing a dict-like ``metadata`` attribute; it is
            modified in place.
        doc_type: value stored under the ``"doc_type"`` metadata key.

    Returns:
        The ``doc`` that was passed in (not a copy).
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

def doc_type_from_path(pdf_path):
    """Return the document type for a PDF: its file name without the extension.

    Only the final extension is removed, whatever its letter case, so text
    such as ".pdf" appearing earlier in the file name is left intact.
    """
    return os.path.splitext(os.path.basename(pdf_path))[0]


# Find every PDF below the current working directory; sort the paths so the
# loading order (and therefore the chunk order) is the same on every run.
pdf_files = sorted(glob.glob("**/*.pdf", recursive=True))
print(f"Found PDF files: {pdf_files}")

documents = []
for pdf_file in pdf_files:
    print(f"Loading: {pdf_file}")

    # The doc_type is derived from the file name only
    doc_type = doc_type_from_path(pdf_file)

    try:
        # Load PDF directly
        loader = PyPDFLoader(pdf_file)
        pdf_docs = loader.load()

        # Add metadata to each page
        docs_with_metadata = [add_metadata(doc, doc_type) for doc in pdf_docs]
        documents.extend(docs_with_metadata)
        print(f"  ✓ Loaded {len(pdf_docs)} pages")
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest
        print(f"  ✗ Error loading {pdf_file}: {e}")

print(f"\nTotal documents loaded: {len(documents)}")

if documents:
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)
    print(f"Total number of chunks: {len(chunks)}")
    print(f"Document types found: {set(doc.metadata['doc_type'] for doc in documents)}")
else:
    # Reset chunks so a re-run in a live kernel cannot silently reuse the
    # chunks left over from an earlier, successful execution of this cell.
    chunks = []
    print("No documents were loaded!")
    # Debug info
    print("\nDebug info:")
    print(f"Current directory: {os.getcwd()}")

Found PDF files: ['knowledgebase\\KB\\KB_Alat_dan_Pembuatan_Songket.pdf', 'knowledgebase\\KB\\KB_Jenis_Songket.pdf', 'knowledgebase\\KB\\KB_Perawatan_Songket.pdf', 'knowledgebase\\KB\\KB_Sejarah_Songket.pdf']
Loading: knowledgebase\KB\KB_Alat_dan_Pembuatan_Songket.pdf
  ✓ Loaded 4 pages
Loading: knowledgebase\KB\KB_Jenis_Songket.pdf
  ✓ Loaded 5 pages
Loading: knowledgebase\KB\KB_Perawatan_Songket.pdf
  ✓ Loaded 1 pages
Loading: knowledgebase\KB\KB_Sejarah_Songket.pdf
  ✓ Loaded 2 pages

Total documents loaded: 12
Total number of chunks: 12
Document types found: {'KB_Perawatan_Songket', 'KB_Alat_dan_Pembuatan_Songket', 'KB_Sejarah_Songket', 'KB_Jenis_Songket'}


In [10]:
# Embedding model used both to clear the old collection and to build the new one
embeddings = OpenAIEmbeddings()

# If a persisted store already exists on disk, drop its collection first
if os.path.exists(db_name):
    Chroma(
        persist_directory=db_name,
        embedding_function=embeddings,
    ).delete_collection()

# Embed the chunks and persist them under db_name
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
stored_total = vectorstore._collection.count()
print(f"Vectorstore created with {stored_total} documents")

Vectorstore created with 12 documents


In [11]:
# Look inside the store: number of vectors and the length of one embedding
collection = vectorstore._collection
count = collection.count()

# Pull back a single record, embeddings included, and measure the first one
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)

summary_line = f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store"
print(summary_line)

There are 12 vectors with 1,536 dimensions in the vector store


In [None]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# conversation memory; the chain reads and writes it under 'chat_history'
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Upper bound on how many chunks are retrieved per question. The request is
# capped at the number of vectors actually stored: asking for more than exist
# cannot return anything extra and, depending on the Chroma version, may
# trigger a warning or an error - TODO confirm for the installed version.
MAX_RETRIEVED_CHUNKS = 25
retriever_k = max(1, min(MAX_RETRIEVED_CHUNKS, vectorstore._collection.count()))
retriever = vectorstore.as_retriever(search_kwargs={"k": retriever_k})

# wire the LLM, the retriever and the memory into one conversational RAG chain
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [15]:
def chat(question, history):
    """Answer one user question through the module-level ``conversation_chain``.

    Args:
        question: the user's message; sent to the chain under the
            ``"question"`` key.
        history: accepted as the second positional argument but not used by
            this function.

    Returns:
        The value stored under ``"answer"`` in the chain's result.
    """
    payload = {"question": question}
    response = conversation_chain.invoke(payload)
    answer = response["answer"]
    return answer

In [16]:
# Build the chat UI around chat(), then start it and open it in the browser.
# Note: view holds whatever launch() returns, not the ChatInterface itself.
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
yang cukup memadai dan kalau perlu disertai dengan alat penerangan lain 
(listrik). Bahkan pada masa kesultanan menenun dilakukan pada sebuah ruang 
khusus pada rumah limas. 
Motif baru biasanya dirancang oleh orang -orang ahli seni, tangan kreatif 
membentuk motif-motif baru. Namun hingga saat ini motif yang ada masih belum 
banyak perkembangan, artinya motif sekarang dasarnya masih motif lama tetapi 
sudah ditambah dengan bentuk lain sebagai hiasan, sehingga  terlihat seperti 
motif baru juga. Motif baru yang dirancang biasanya oleh pemotif dibuat diatas 
kertas milimeter, agar mudah pengrajin mengikuti seper