### Model 1

In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

# Loader
# Build the path with os.path.join instead of the literal "data\Huy_Bui_Resume.pdf":
# "\H" is an invalid escape sequence (newer Python versions warn about it) and a
# backslash separator only resolves on Windows.
loader = PyMuPDFLoader(os.path.join("data", "Huy_Bui_Resume.pdf"))
documents = loader.load()

# Split
# chunk_size / chunk_overlap are measured in characters for this splitter.
# NOTE(review): sentence-transformers models truncate inputs longer than their
# maximum sequence length -- confirm that 5000-character chunks are embedded in full.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
chunks = text_splitter.split_documents(documents)

# Embedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
# False leaves the vectors un-normalized (Euclidean-style distance);
# True normalizes them so that ranking behaves like cosine similarity.
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Vector store: embed every chunk into a FAISS index and save a copy to disk.
vector_store = FAISS.from_documents(chunks, hf)
vector_store.save_local("faiss_index_open")

# Retriever: return the 3 closest chunks for each query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# LLM
# The API key is read from the PPLX_API_KEY environment variable so that no
# credential is stored in the notebook. A missing variable raises KeyError here,
# before any request is made. The key that was previously hardcoded in this cell
# has been exposed and should be revoked.
llm = ChatPerplexity(
    model="sonar",
    pplx_api_key=os.environ["PPLX_API_KEY"],
    temperature=0.2
)

# QA chain: "stuff" puts all retrieved chunks into a single prompt; the source
# documents are returned alongside the answer so they can be inspected below.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

# Questions tried against this chain. Only one is sent per run; previously the
# three were assigned to `query` back to back, so the first two were dead stores.
candidate_queries = [
    "How many times do the vowels 'ae' appear in 'I am a data analyst and engineer'?",
    "What are Huy Bui’s certifications from the resume?",
    "Dont make up the answer if you dont know. Question: What are Huy Bui's top skills?",
]
query = candidate_queries[-1]  # same question the original cell ended up asking
response = qa_chain.invoke(query)

print("Response", response['result'])
print("############################")
# Show every retrieved chunk that was passed to the LLM as context.
for i, doc in enumerate(response['source_documents']):
    print(f"######DOC {i+1}######")
    print(doc)

  hf = HuggingFaceEmbeddings(
  llm = ChatPerplexity(


Response Based on the provided information, Huy Bui's top skills appear to be in **data science and programming**, particularly in:

- **Python programming**: Utilizing libraries like Pandas, Numpy, and Scikit-learn.
- **Data analysis and modeling**: Developing regression models and leveraging machine learning techniques.
- **Cloud computing**: Proficient in AWS services such as S3 and Lambda.
- **Database management**: Skilled in SQL and BigQuery.
- **Web development**: Experienced with React and JavaScript for UI/UX enhancements.
- **Leadership and team management**: Proven ability to lead cross-functional teams in fast-paced environments.

However, if you are referring to a different Huy Bui, such as the professor at Parsons or the barber, their skills would be different and not detailed in the provided context.
############################
######DOC 1######
page_content='Huy Bui
williamhuybui@gmail.com | linkedin.com/in/huy-bui-ds
Experience
Publicis Groupe
Remote
Senior Data Scien

In [2]:
pip install chromadb

^C
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Lan Dao\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


Collecting chromadb
  Downloading chromadb-1.0.10-cp39-abi3-win_amd64.whl (19.0 MB)
     --------------------------------------- 19.0/19.0 MB 16.4 MB/s eta 0:00:00
Collecting uvicorn[standard]>=0.18.3
  Downloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
     ---------------------------------------- 62.5/62.5 KB ? eta 0:00:00
Collecting opentelemetry-instrumentation-fastapi>=0.41b0
  Downloading opentelemetry_instrumentation_fastapi-0.54b1-py3-none-any.whl (12 kB)
Collecting overrides>=7.3.1
  Downloading overrides-7.7.0-py3-none-any.whl (17 kB)
Collecting importlib-resources
  Downloading importlib_resources-6.5.2-py3-none-any.whl (37 kB)
Collecting build>=1.0.3
  Downloading build-1.2.2.post1-py3-none-any.whl (22 kB)
Collecting opentelemetry-api>=1.2.0
  Downloading opentelemetry_api-1.33.1-py3-none-any.whl (65 kB)
     ---------------------------------------- 65.8/65.8 KB ? eta 0:00:00
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0
  Downloading opentelemetry_exporter_otlp

### Model 2

In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.retrievers import MultiQueryRetriever
from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatPerplexity
import os
import chromadb

# 1. Document Loading
loader = PyPDFLoader("finance_data/amazon.pdf")
documents = loader.load()

# 2. Text Splitting
# chunk_size / chunk_overlap are measured in tokens for this splitter.
# NOTE(review): the embedding model truncates inputs longer than its maximum
# sequence length -- confirm that 5000-token chunks are embedded in full.
text_splitter = TokenTextSplitter(chunk_size=5000, chunk_overlap=500)
chunks = text_splitter.split_documents(documents)

# 3. Embeddings
model_name = "BAAI/bge-base-en"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# 4. Vector Store (Chroma, persisted under "chroma_index_finance")
# No explicit vector_store.persist() call: the method is deprecated and current
# chromadb releases write to persist_directory automatically.
# NOTE(review): re-running this cell against an existing persist_directory may
# add the same chunks again -- confirm, or clear the directory between runs.
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=hf,
    persist_directory="chroma_index_finance",
)

# 5. Prompt Template (financial-advisor persona, answers restricted to the context)
# {context} receives the retrieved text and {question} the user's query.
prompt_template = """
You are a professional financial advisor with expertise in corporate finance, investment analysis, and career development in finance-related roles.

Use only the information provided in the context to answer the user's question. Do not make assumptions or fabricate any details.

Respond clearly and professionally, as if advising a client on their financial career or investment decisions.
{context}

Question: {question}
If the answer is not explicitly stated in the context, respond with: "I don't know based on the provided document"."""
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# 6. Retrieval Setup
# Base retriever: the 5 closest chunks from the vector store.
base_retriever = vector_store.as_retriever(search_kwargs={"k": 5})
# NOTE(review): qretriever_llm is not used anywhere in this cell -- presumably
# intended for a MultiQueryRetriever; confirm or remove.
qretriever_llm = Ollama(model="llama3")

# Perplexity LLM
# The API key is read from the PPLX_API_KEY environment variable so that no
# credential is stored in the notebook. A missing variable raises KeyError here.
# The key that was previously hardcoded in this cell has been exposed and should
# be revoked.
perplexity_llm = ChatPerplexity(
    model="sonar",
    pplx_api_key=os.environ["PPLX_API_KEY"],
    temperature=0.2
)

# Contextual compression: each chunk returned by base_retriever is passed
# through an LLM-based extractor before it reaches the QA prompt.
compressor = LLMChainExtractor.from_llm(perplexity_llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)

# 7. QA Chain: "stuff" chain using the custom prompt; the (compressed) source
# documents are returned with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=perplexity_llm,
    chain_type="stuff",
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

# Test queries
# NOTE(review): the second question is about Netflix while only
# finance_data/amazon.pdf is loaded in this cell -- presumably a negative test of
# the "I don't know" fallback; confirm.
queries = [
    "What process does Amazon use to determine whether a tax position is more likely than not to be sustained for its financial reporting?",
    "How does Netflix's revenue recognition policy for monthly membership fees impact its financial statements throughout the year?"
]

for query in queries:
    print(f"\nQuery: {query}")
    try:
        # Use .invoke() rather than calling the chain directly: Chain.__call__
        # is deprecated, and .invoke() matches how the first model is queried.
        response = qa_chain.invoke({"query": query})
        print("Answer:", response['result'])
        for i, doc in enumerate(response['source_documents']):
            print(f"\nSource {i+1}:")
            print(doc.page_content)
    except Exception as e:
        # Report the failure and continue so one bad query does not stop the rest.
        print(f"Error processing query: {e}")

  vector_store.persist()



Query: What process does Amazon use to determine whether a tax position is more likely than not to be sustained for its financial reporting?
Answer: I don't know based on the provided document.

Source 1:
The Company uses significant judgment in (1) determining whether a tax position’s technical merits are more likely than not to be sustained and (2) measuring the amount of tax benefit that qualifies for recognition.  
Management’s evaluation of tax positions is based on interpretations of tax laws and legal rulings, and may be impacted by regulatory changes and judicial and examination activity.

Source 2:
NO_OUTPUT

The provided context does not contain information relevant to Amazon's process for determining whether a tax position is more likely than not to be sustained for its financial reporting. The context primarily discusses Amazon's business operations and segments, while the search results focus on Amazon's tax interview process for sellers and associates, which does not add