In [2]:
import os
import openai
import sys
# Extend the import search path with the directory two levels above the
# working directory (presumably so project-local modules resolve — confirm).
sys.path.append('../..')
import shutil

from dotenv import load_dotenv, find_dotenv
# Read the local .env file located by find_dotenv(); the return value is
# deliberately discarded. The next cell expects OPENAI_API_KEY in os.environ.
_ = load_dotenv(find_dotenv()) # read local .env file

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [3]:
# Hand the API key from the environment to the openai module.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

In [4]:
# Load the source PDFs and gather everything the loaders return into one
# flat list, in the same order as the paths below.
pdf_paths = (
    "input data/pdfwithtable_watersector1.pdf",
    "input data/pdfwithtable_watersector2.pdf",
    "input data/pdfwithtable_energysector1.pdf",
)
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

docs = []
for loader in loaders:
    docs += loader.load()

In [5]:
# Configure the splitter used to cut the loaded documents into chunks.
# A smaller chunk_size makes each chunk more precise but drops surrounding
# context; chunk_overlap is the overlap setting passed between neighbouring
# chunks.
splitter_settings = {
    "chunk_size": 1500,
    "chunk_overlap": 200,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [6]:
# Split the loaded documents into chunks, then display how many chunks
# were produced as a quick sanity check.
splits = text_splitter.split_documents(docs)
len(splits)

78

In [7]:
# Show a single chunk (index 3) to eyeball what the splitter produced.
splits[3]

Document(page_content='See important disclosure at the end of this document www.vcsc.com.vn  | VCSC<GO>  February 14, 2023 | 2 \n \nIndustry Update  \nBinh Duong Province set its 2023 GRDP target slightly higher than that of \n2022  \n 2022 GRDP increased 8.3% YoY  as all three main sectors of the economy \nperformed strongly. Specifically, the agricultural sector rose 3.1% YoY, the industrial-construction \nsector rose 8.2% YoY and the services sector rose 9.5% YoY.  \nProvincial leaders set 2023 targets  for GRDP to increase 8.5%-8.7% YoY and the Index of \nIndustrial Production (IIP) to rise 8.9% YoY. Core tasks in 2023 include progressing with a modern \nprovincial road transport infrastructure network and facilitating interregional connections. The focus \nis on the Ring Road No. 3 project, the HCMC  Thu Dau Mot - Chon Thanh  Binh Phuoc \nExpressway, the Bau Bang  Cai Mep (Vung Tau) railway and the Song Than Intersection. \nAn all-time high amount of high public investment in 2023

In [None]:
# Embedding object handed to Chroma below, built with default settings.
# NOTE(review): presumably relies on the OpenAI key configured earlier — confirm.
embedding = OpenAIEmbeddings()

In [12]:
# Directory (relative to the working directory) passed to Chroma as the
# location of the persisted vector store.
persist_directory = 'docs/chroma/financialgpt'

In [11]:
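# Optional reset: uncomment the line below to delete previously persisted
# Chroma data before rebuilding. Note that it removes everything under
# ./docs/chroma, not only the 'docs/chroma/financialgpt' store.
# shutil.rmtree('./docs/chroma', ignore_errors=True)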

In [13]:
# Build the Chroma vector store, or reopen it if it was already persisted.
#
# Chroma.from_documents embeds every chunk and adds it to whatever is already
# stored under persist_directory. Re-running this cell against an existing
# store therefore duplicates entries and repeats the embedding API calls.
# Guard against that by reusing a non-empty persist_directory; delete that
# directory to force a rebuild from `splits`.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # A store is already on disk: reopen it with the same embedding function
    # so queries are embedded consistently with the stored vectors.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    # First run (or the directory was removed): embed and store all chunks.
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_directory,
    )

In [14]:
vectordb.persist() 

# Save vectordb (write the vector store to disk).