In [23]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, pipeline

#### Load Docs (pdf)

In [2]:
doc_loader = DirectoryLoader('./FOMC_docs_2023', glob='**/*.pdf',     # Let's load only pdf files in every subdirectory
    show_progress=True
)
docs = doc_loader.load()

  0%|                                                    | 0/16 [00:00<?, ?it/s]This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpo

In [3]:
len(docs)

16

In [4]:
docs[:1]

[Document(page_content='FEDERAL RESERVE press release\n\nFor release at 2:00 p.m. EDT                                             July 26, 2023  \n\nRecent indicators suggest that economic activity has been expanding at a moderate pace.\n\nJob gains have been robust in recent months, and the unemployment rate has remained low.\n\nInflation remains elevated.\n\nThe U.S. banking system is sound and resilient. Tighter credit conditions for households and businesses are likely to weigh on economic activity, hiring, and inflation. The extent of these effects remains uncertain. The Committee remains highly attentive to inflation risks.\n\nThe Committee seeks to achieve maximum employment and inflation at the rate of 2 percent over the longer run. In support of these goals, the Committee decided to raise the target range for the federal funds rate to 5-1/4 to 5-1/2 percent. The Committee will continue to assess additional information and its implications for monetary policy. In determining th

#### Split docs

In [5]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=300
)
splitted_docs_list = splitter.split_documents(docs)
#splitted_docs_list = splitter.create_documents(splitted_str_list)

In [6]:
splitted_docs_list[0]

Document(page_content='FEDERAL RESERVE press release\n\nFor release at 2:00 p.m. EDT                                             July 26, 2023  \n\nRecent indicators suggest that economic activity has been expanding at a moderate pace.\n\nJob gains have been robust in recent months, and the unemployment rate has remained low.\n\nInflation remains elevated.\n\nThe U.S. banking system is sound and resilient. Tighter credit conditions for households and businesses are likely to weigh on economic activity, hiring, and inflation. The extent of these effects remains uncertain. The Committee remains highly attentive to inflation risks.', metadata={'source': 'FOMC_docs/monetary20230726a1.pdf'})

#### Define the embeddings

In [7]:
# Define the path to the pre-trained model 
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)



In [8]:
documents = splitted_docs_list

texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]

In [9]:
texts[0]

'FEDERAL RESERVE press release\n\nFor release at 2:00 p.m. EDT                                             July 26, 2023  \n\nRecent indicators suggest that economic activity has been expanding at a moderate pace.\n\nJob gains have been robust in recent months, and the unemployment rate has remained low.\n\nInflation remains elevated.\n\nThe U.S. banking system is sound and resilient. Tighter credit conditions for households and businesses are likely to weigh on economic activity, hiring, and inflation. The extent of these effects remains uncertain. The Committee remains highly attentive to inflation risks.'

#### Vector database

In [20]:
#db = FAISS.from_texts(texts, embeddings, metadatas=metadatas)
db = FAISS.from_documents(splitted_docs_list, embeddings)

In [21]:
question = "What is the target inflation objective in December 2023?"
searchDocs = db.similarity_search(question)
#print(searchDocs[0].page_content)
print(searchDocs)

[Document(page_content='recession toward the end of the year. However, the staff continued to expect that real GDP growth in 2024 and 2025 would run below their estimate of potential output growth, leading to a small increase in the unemployment rate relative to its current level.\n\nThe staff continued to project that total and core PCE price inflation would move lower in coming years. Much of the step-down in core inflation was expected to occur over the second half of 2023, with forward-looking indi- cators pointing to a slowing in the rate of increase of housing services prices and with core nonhousing ser- vices prices and core goods prices expected to decelerate over the remainder of 2023. Inflation was anticipated to ease further over 2024 as demand–supply imbalances continued to resolve; by 2025, total PCE price inflation was expected to be 2.2 percent, and core inflation was expected to be 2.3 percent.', metadata={'source': 'FOMC_docs/fomcminutes20230726.pdf'}), Document(page_

#### LLM pipeline

In [32]:
# Specify the model name
#model_name = "Intel/dynamic_tinybert"

# Load the tokenizer associated with the specified model
#tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)
#tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True)

# Define a question-answering pipeline using the model and tokenizer
question_answerer = pipeline("question-answering")

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    #model_kwargs={"temperature": 0.7, "max_length": 512},
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

#### Retriever

In [33]:
# Create a question-answering instance (qa) using the RetrievalQA class.
# It's configured with a language model (llm), a chain type "refine," the retriever we created, and an option to not return source documents.
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff",
    retriever=db.as_retriever(), 
    return_source_documents=False)

In [34]:
query = "How was core pce inflation's trend in 2023?"
retrieval_qa.run(query)


ValueError: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

markedly this year and then to track core inflation over the following two years. In 2024 and 2025, both total and core PCE price inflation were expected to be near 2 percent.

recession toward the end of the year. However, the staff continued to expect that real GDP growth in 2024 and 2025 would run below their estimate of potential output growth, leading to a small increase in the unemployment rate relative to its current level.

The staff continued to project that total and core PCE price inflation would move lower in coming years. Much of the step-down in core inflation was expected to occur over the second half of 2023, with forward-looking indi- cators pointing to a slowing in the rate of increase of housing services prices and with core nonhousing ser- vices prices and core goods prices expected to decelerate over the remainder of 2023. Inflation was anticipated to ease further over 2024 as demand–supply imbalances continued to resolve; by 2025, total PCE price inflation was expected to be 2.2 percent, and core inflation was expected to be 2.3 percent.

core inflation was forecast to slow through next year but remain moderately above 2 percent. With expected declines in consumer energy prices and a substantial moderation in food price inflation, total inflation was projected to run below core inflation this year and next. In 2025, both total and core PCE price inflation were expected to be at about 2 percent.

Total PCE price inflation was expected to be close to 3.0 percent by the end of this year, and core PCE infla- tion was expected to be around 3.5 percent. Inflation was projected to move lower in coming years as demand and supply in product and labor markets moved into better alignment; in 2026, total and core PCE price in- flation rates were expected to be close to 2 percent.

Question: How was core pce inflation's trend in 2023?
Helpful Answer: argument needs to be of type (SquadExample, dict)

##### Ref: https://medium.com/@s.rashwand/how-to-build-a-chatbot-smarter-than-chatgpt-quickly-using-langchain-and-weaviate-f6309cc86e09
https://medium.com/@akriti.upadhyay/implementing-rag-with-langchain-and-hugging-face-28e3ea66c5f7