In [63]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

# Load: read the resume PDF into LangChain documents.
pdf_path = "../sample_data/Huy_Bui_Resume.pdf"
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Split: break the loaded documents into overlapping chunks so that each
# piece is small enough to embed and retrieve on its own. The sizes are
# counted with the splitter's length function (characters by default --
# see the RecursiveCharacterTextSplitter documentation).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = splitter.split_documents(documents)

# Embedding: sentence-transformers model, run on the CPU.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cpu")
# normalize_embeddings=False keeps the raw (un-normalized) vectors. This flag
# does not pick the distance metric by itself: the vector store decides that.
# NOTE(review): with the FAISS store's default L2 distance, setting this to
# True would make the ranking equivalent to cosine similarity -- confirm
# against the LangChain FAISS documentation before changing it.
encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Vector store: embed every chunk with `hf`, index the vectors in FAISS,
# and persist the index to the local "faiss_index_open" directory.
vector_store = FAISS.from_documents(documents=chunks, embedding=hf)
vector_store.save_local("faiss_index_open")

# Retriever: return the 3 best-matching chunks for each query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 3},
)

# LLM: Perplexity chat model ("sonar") with a low temperature.
#
# The API key is a secret and must never be written into the notebook.
# It is read from the PPLX_API_KEY environment variable instead; any key
# that has ever been committed to source control should be revoked and
# replaced with a fresh one.
pplx_api_key = os.environ.get("PPLX_API_KEY")
if not pplx_api_key:
    # Fail early with an actionable message rather than letting the first
    # request to the API fail with an authentication error.
    raise RuntimeError(
        "PPLX_API_KEY is not set. Export your Perplexity API key in the "
        "environment before running this cell."
    )

llm = ChatPerplexity(
    model="sonar",
    pplx_api_key=pplx_api_key,
    temperature=0.2,
)

# QA chain: the "stuff" chain type puts the retrieved chunks into a single
# prompt for the LLM. return_source_documents=True makes the chain return the
# retrieved chunks alongside the answer so they can be inspected.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

# Sample questions that have been tried against the chain. Keeping them in
# one tuple avoids a stack of `query = ...` lines where every assignment but
# the last is silently overwritten.
sample_queries = (
    "How many times do the vowels 'ae' appear in 'I am a data analyst and engineer'?",
    "What are Huy Bui’s certifications from the resume?",
    "What are Huy Bui's top skills?",
)

# Only the selected question is sent to the chain; change the index to try
# another one.
query = sample_queries[-1]
response = qa_chain.invoke(query)


In [67]:
# Show the generated answer, a separator line, and then each retrieved
# source chunk that the chain returned with it.
answer, sources = response["result"], response["source_documents"]
print("Response", answer)
print("############################")
for doc in sources:
    print(doc)

Response Based on the provided context, Huy Bui's top skills appear to be in **data science and programming**. Specifically, his technical skills include:

- **Programming Languages**: Python (with libraries like Pandas, Numpy, Scikit-learn), SQL, JavaScript, and React.
- **Data Platforms**: BigQuery.
- **Cloud Services**: AWS (S3, Redshift).
- **CI/CD Tools**: TeamCity, GitHub.
- **AI and Machine Learning**: OpenAI API, LLM (Large Language Models).

Additionally, he has experience in **teaching and mentoring**, having taught various subjects like Python, Matlab, Calculus, Differential Equations, and Discrete Mathematics, and mentored professionals in data science and software engineering.
############################
page_content='Huy Bui
williamhuybui@gmail.com | linkedin.com/in/huy-bui-ds
Experience
Publicis Groupe
Remote
Senior Data Scientist
June 2022 – Present
– Developed 7 client-agnostic applications using Python and Dash, empowering 100+ analysts to streamline data
analysis an