In [1]:
# Import required packages
import os
import shutil
import pandas as pd
from dotenv import load_dotenv

# Chunking documents
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Building vector db using FAISS
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from huggingface_hub import login
from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace
from langchain.vectorstores import FAISS

# Prepare the Data

In [4]:
# Filesystem locations, all anchored at the directory the notebook is run from
ABSOLUTE_PATH = os.path.abspath(os.getcwd())

# Input data directory
DATA_PATH = os.path.join(ABSOLUTE_PATH, "data")

# Parent directory of the saved vector stores, one sub-directory per embedding provider
VECTOR_DB_DIR = os.path.join(ABSOLUTE_PATH, "vector_dbs")
DB_PATH = os.path.join(VECTOR_DB_DIR, "openai")
DB_HF_PATH = os.path.join(VECTOR_DB_DIR, "hugging_face")

In [28]:
# Load the FAQ spreadsheet; the code below reads its 'Question' and 'Answer' columns
file_path = os.path.join(DATA_PATH, 'FAQ_Nawa.xlsx')
faq_df = pd.read_excel(file_path, index_col=None)

# Skip rows where the question or the answer cell is empty: pandas reads empty
# cells as NaN, and str(NaN) would put the literal text "nan" into the document
# that gets embedded.
faq_valid = faq_df.dropna(subset=['Question', 'Answer'])

# One text per FAQ entry: the question followed by its answer, separated by a space
raw_texts = [
    f"{question} {answer}"
    for question, answer in zip(faq_valid['Question'], faq_valid['Answer'])
]

# Convert raw text to Document objects, tagging each with a positional source id
all_documents = [
    Document(page_content=text, metadata={"source": f"doc_{i}"})
    for i, text in enumerate(raw_texts)
]
all_documents

[Document(metadata={'source': 'doc_0'}, page_content='Apa itu nawatech? Nawatech, perusahaan pengembangan perangkat lunak yang siap membantu mengembangkan bisnis Anda dengan solusi teknologi.'),
 Document(metadata={'source': 'doc_1'}, page_content='Siapa CEO Nawatech? CEO atau direktur dari perusahaan nawatech adalah arfan arlanda.'),
 Document(metadata={'source': 'doc_2'}, page_content='Dimana saya bisa mengkontak Nawatech? Untuk mengkontak nawatech anda bisa klik link dibawah ini https://www.nawatech.co/contact-us'),
 Document(metadata={'source': 'doc_3'}, page_content='Layanan apa saja yang diberikan nawatech? Layanan Terkelola, Layanan konsulasi, Inovasi, Integrasi sistem '),
 Document(metadata={'source': 'doc_4'}, page_content='Siapa Nawatech ? Kami adalah kumpulan talenta yang bersemangat yang mengkhususkan diri dalam menciptakan solusi inovatif dan berdampak bagi bisnis.'),
 Document(metadata={'source': 'doc_5'}, page_content='Layanan Kami Layanan kami dirancang untuk membantu b

# Building Vector Database using OpenAI

In [29]:
# Load the OpenAI API key; values from the .env file override existing environment variables
load_dotenv(override=True)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail fast with an actionable message instead of falling back to a
    # placeholder string that can never authenticate.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or environment "
        "before running this cell."
    )

# Not used in this cell; kept in case other cells reference it
db_name = DB_PATH

# Create FAISS vector store using OpenAI API.
# The key is passed explicitly so the value validated above is the one actually used.
openai_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectorstore = FAISS.from_documents(
    documents=all_documents,
    embedding=openai_embeddings
)

# Persist the index to disk under DB_PATH
vectorstore.save_local(DB_PATH)

# Report the size of the vectorstore (vector count and embedding dimensionality)
total_vectors = vectorstore.index.ntotal
dimensions = vectorstore.index.d
print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in this FAISS vector store")

There are 11 vectors with 1,536 dimensions in this FAISS vector store


# Building Vector Database using Hugging Face

In [30]:
# Read the Hugging Face token from the process environment and log in.
# NOTE: this cell does not call load_dotenv itself; a token stored in .env is
# only visible here if the .env file was already loaded earlier in the session.
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    # Fail fast with an actionable message instead of attempting a login with
    # a placeholder string that can never authenticate.
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set. Add it to your .env file or environment "
        "before running this cell."
    )
login(HF_TOKEN)

# Not used in this cell; kept in case other cells reference it
db_name = DB_HF_PATH

# Create FAISS vector store using Hugging Face model
hf_embeddings = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")
hf_vectorstore = FAISS.from_documents(
    documents=all_documents,
    embedding=hf_embeddings
)

# Save the vector to local storage under DB_HF_PATH
hf_vectorstore.save_local(DB_HF_PATH)

# Report the size of the vectorstore (vector count and embedding dimensionality)
total_vectors = hf_vectorstore.index.ntotal
dimensions = hf_vectorstore.index.d
print(f"There are {total_vectors} vectors with {dimensions:,} dimensions in this HF FAISS vector store")

There are 11 vectors with 384 dimensions in this HF FAISS vector store
