In [2]:
import openai
import os
import sys
sys.path.append("../doc")

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

In [8]:
pdf_folder_path = "docs/gnm_reports"

In [9]:
os.listdir(pdf_folder_path)

['global_market_analysis_may23.pdf',
 'global_market_analysis_apr23.pdf',
 'global_market_analysis_mar23.pdf']

In [14]:
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]

In [15]:
loaders

[<langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x11fc6de20>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x11fc6dc70>,
 <langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x11fc6ddc0>]

In [16]:
index = VectorstoreIndexCreator().from_loaders(loaders)
index

[nltk_data] Downloading package punkt to /Users/yeo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yeo/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x11fc600a0>)

In [36]:
x = loaders[0].load()

In [42]:
x[0].page_content[2500:3500]

'ayments (CDR)\n\n4.3 Voluntary Prepayment Rates (CRR) .....................................................................13 13 4.3 Voluntary Prepayment Rates (CRR)\n\n5 5\n\n14 Single-Family MBS Pass-Through Issuance .............................................................14 Single-Family MBS Pass-Through Issuance\n\n5.1 Gross Issuance of Agency MBS ........................................................................... 5.1 Gross Issuance of Agency MBS\n\n14\n\n...............................................................................\n\n5.2 Net Issuance of Agency MBS\n\n16\n\n...............................................................................\n\n5.3 Monthly Issuance Breakdown\n\n18\n\n..............................................................\n\n5.4 Percent Refi at Issuance – Single-Family\n\n19\n\n6\n\n20\n\n...................................................................\n\nAgency Single-Family MBS Outstanding\n\n..................................

In [20]:
index.query("What is the key difference in refinancing between Mar 2023 and May 2023?")

" In March 2023, Freddie Mac's refinance percentage rose to 13.8%, Fannie Mae's refinance percentage rose to 17.0%, Ginnie Mae's refinance percentage rose to 15.0%, FHA's refinance percentage rose to 14.3%, and VA's refinance percentage rose to 16.9%. In May 2023, Freddie Mac's refinance percentage dropped to 12.8%, Fannie Mae's refinance percentage dropped to 14.0%, Ginnie Mae's refinance percentage rose to 16.1%, FHA's refinance percentage rose to 15.6%, and VA's refinance percentage rose to 17.6%."

In [21]:
index.query_with_sources ("What is the key difference in refinancing between Mar 2023 and May 2023?")

{'question': 'What is the key difference in refinancing between Mar 2023 and May 2023?',
 'answer': ' The key difference in refinancing between March 2023 and May 2023 is that the average 30-year fixed rate mortgage rate increased by 7 bps and the average 15-year fixed rate mortgage rate increased by 11 bps.\n',
 'sources': 'docs/gnm_reports/global_market_analysis_mar23.pdf, docs/gnm_reports/global_market_analysis_apr23.pdf, docs/gnm_reports/global_market_analysis_may23.pdf'}

In [32]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

In [33]:
index_custom = VectorstoreIndexCreator(
    vectorstore_cls=Chroma,
    embedding=OpenAIEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
).from_loaders(loaders)

Created a chunk of size 1092, which is longer than the specified 1000
Created a chunk of size 1096, which is longer than the specified 1000
Created a chunk of size 1104, which is longer than the specified 1000


In [34]:
index.query_with_sources ("What is the key difference in refinancing between Mar 2023 and May 2023?")

{'question': 'What is the key difference in refinancing between Mar 2023 and May 2023?',
 'answer': ' The key difference in refinancing between March 2023 and May 2023 is that the average 30-year fixed rate mortgage rate increased by 7 bps and the average 15-year fixed rate mortgage rate increased by 11 bps.\n',
 'sources': 'docs/gnm_reports/global_market_analysis_mar23.pdf, docs/gnm_reports/global_market_analysis_apr23.pdf, docs/gnm_reports/global_market_analysis_may23.pdf'}

In [45]:
index.query_with_sources ("What is the key benefit of investing in MBS?")

{'question': 'What is the key benefit of investing in MBS?',
 'answer': ' The key benefit of investing in MBS is that they allow investors with different investment horizons, risk-reward preferences, and asset-liability management requirements to purchase MBS tailored to their needs.\n',
 'sources': 'https://www.ginniemae.gov/about_us/what_we_do/pages/programs_products.aspx'}