In [3]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

#Loader
loader = PyMuPDFLoader("data\Huy_Bui_Resume.pdf")
documents = loader.load()

#Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
chunks = text_splitter.split_documents(documents)

#Embbedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False} #False Euclidean, True cosine similarity
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

#Vector Store
vector_store = FAISS.from_documents(chunks, hf)
vector_store.save_local("faiss_index_open")

#Retriever
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

#LLM
llm = ChatPerplexity(
    model="sonar",
    pplx_api_key = "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",
    temperature=0.2
)

# QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True
)

query = "How many times do the vowels 'ae' appear in 'I am a data analyst and engineer'?"
query = "What are Huy Bui’s certifications from the resume?"
query = "Dont make up the answer if you dont know. Question: What are Huy Bui's top skills?"
response = qa_chain.invoke(query)

print("Response", response['result'])
print("############################")
for i, doc in enumerate(response['source_documents']):
    print(f"######DOC {i+1}######")
    print(doc)

Response Based on the provided context, Huy Bui's top skills appear to be in **data science and programming**. Specifically, his skills include:

- **Programming languages**: Python, SQL, JavaScript, and React.
- **Data science tools**: Pandas, Numpy, Scikit-learn, BigQuery, and LLM.
- **Cloud services**: AWS (S3, Redshift), OpenAI API.
- **CI/CD tools**: TeamCity, GitHub.
- **Mathematics**: Strong background in mathematics, including calculus, differential equations, and discrete mathematics.

However, if you are referring to a different Huy Bui, such as the artist or the educator, their skills would be different and are not detailed in the provided context.
############################
######DOC 1######
page_content='Huy Bui
williamhuybui@gmail.com | linkedin.com/in/huy-bui-ds
Experience
Publicis Groupe
Remote
Senior Data Scientist
June 2022 – Present
– Developed 7 client-agnostic applications using Python and Dash, empowering 100+ analysts to streamline data
analysis and reporting, 

In [4]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
from langchain.retrievers import MultiQueryRetriever
from langchain_community.llms import Ollama
import os

#Loader
loader = PyMuPDFLoader("data\Huy_Bui_Resume.pdf")
documents = loader.load()

#Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
chunks = text_splitter.split_documents(documents)

#Embbedding
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False} #False Euclidean, True cosine similarity
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

#Vector Store
vector_store = FAISS.from_documents(chunks, hf)
vector_store.save_local("faiss_index_open")


#More complex multi-query retriever and answer provider
#Retriever
qretriever_llm = Ollama(model="llama3")
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_store.as_retriever(), 
    llm=qretriever_llm
)


#Perplexity's LLM
perplexity_llm = ChatPerplexity(
    model="sonar-pro",
    pplx_api_key = "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",
    temperature=0.2
)
qa_chain = RetrievalQA.from_chain_type(
    perplexity_llm,
    retriever=retriever,
    chain_type="stuff"
)

query = "How many times do the vowels 'ae' appear in 'I am a data analyst and engineer'?"
query = "What are Huy Bui’s certifications from the resume?"
query = "Dont make up the answer if you dont know. Question: What are Huy Bui's top skills?"
response = qa_chain.invoke(query)

print("Response", response['result'])
print("############################")
for i, doc in enumerate(response['source_documents']):
    print(f"######DOC {i+1}######")
    print(doc)

Response ## Huy Bui's Top Skills

Based on the provided information, Huy Bui's top skills include:

- **Data Science & Analytics**
  - Extensive experience in developing client-agnostic applications for data analysis and reporting using Python and Dash.
  - Expertise in building automated data pipelines with Python, SQL, BigQuery, AWS Lambda, S3, and EventBridge.
  - Proficient in developing regression models and leveraging large language models (LLMs) for extracting insights and optimizing marketing spend.

- **Programming & Technical Skills**
  - Advanced proficiency in Python (including Pandas, Numpy, Scikit-learn), SQL, and JavaScript.
  - Skilled in front-end development with React and enhancing UI/UX by migrating Python codebases to React.
  - Experience with cloud platforms and tools such as AWS (S3, Redshift), BigQuery, and REST APIs.
  - Familiarity with CI/CD tools like TeamCity and GitHub.

- **Machine Learning & Modeling**
  - Developed time series and hybrid physics models

KeyError: 'source_documents'