In [21]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

# Load: read the resume PDF into LangChain documents.
RESUME_PDF = "data/Huy_Bui_Resume.pdf"
loader = PyMuPDFLoader(RESUME_PDF)
documents = loader.load()

# Split: cut the loaded documents into overlapping chunks so neighbouring
# chunks share some context at their boundaries.
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(documents)

# Embedding: HuggingFace sentence-transformers model, run on the CPU.
# Author's note on normalize_embeddings: False -> Euclidean distance,
# True -> cosine similarity.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Vector store: embed every chunk into a FAISS index and persist it locally.
INDEX_DIR = "faiss_index_open"
vector_store = FAISS.from_documents(documents=chunks, embedding=hf)
vector_store.save_local(INDEX_DIR)

# Retriever: ask the index for the TOP_K best-matching chunks per query.
TOP_K = 3
retriever = vector_store.as_retriever(search_kwargs=dict(k=TOP_K))

# LLM: Perplexity chat model used to answer questions over the retrieved chunks.
# The API key is read from the environment rather than written into the
# notebook, so it is not leaked when the notebook is shared or committed.
# NOTE(review): the key that used to be hardcoded here should be revoked.
pplx_api_key = os.environ.get("PPLX_API_KEY")
if not pplx_api_key:
    raise RuntimeError(
        "Set the PPLX_API_KEY environment variable before running this cell."
    )

llm = ChatPerplexity(
    model="sonar",
    pplx_api_key=pplx_api_key,
    temperature=0.2  # low temperature keeps answers close to the retrieved text
)

# QA chain: "stuff" chain type over the retriever and the chat model.
# return_source_documents=True makes the response carry the retrieved chunks
# under 'source_documents' alongside the answer under 'result'.
qa_chain_options = dict(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)
qa_chain = RetrievalQA.from_chain_type(**qa_chain_options)


In [22]:
# Questions tried against the resume. Assigning each one to `query` in turn
# meant only the last ever reached the chain; keeping them in a list makes the
# selection explicit. Change the index below to ask a different question.
candidate_queries = [
    "How many times do the vowels 'ae' appear in 'I am a data analyst and engineer'?",
    "What are Huy Bui’s certifications from the resume?",
    "Dont make up the answer if you dont know. Question: What are Huy Bui's top skills?",
]
query = candidate_queries[-1]

# Run retrieval + generation; the result is a dict read by the next cell.
response = qa_chain.invoke(query)

In [23]:
# Show the generated answer, then each retrieved source document in order.
answer = response['result']
source_documents = response['source_documents']

print("Response", answer)
print("############################")
for i, source_doc in enumerate(source_documents):
    print(f"######DOC {i+1}######")
    print(source_doc)

Response Based on the provided information, Huy Bui's top skills appear to be in **data science and programming**. Specifically, his skills include:

- **Programming Languages**: Python, JavaScript, SQL, and React.
- **Data Science Tools**: Pandas, Numpy, Scikit-learn, BigQuery, and AWS services like S3 and Lambda.
- **Machine Learning**: Developing regression models and LLM-based procedures.
- **Data Pipelines**: Building automated data pipelines and robust data cleaning processes.
- **Leadership**: Leading cross-functional teams in data science initiatives.

However, it's important to note that there are multiple individuals named Huy Bui, and the context provided seems to refer to a specific Huy Bui who is a data scientist. If you are referring to a different Huy Bui, such as the one involved in plant structures or cryo-electron tomography, their skills would be different.
############################
######DOC 1######
page_content='Huy Bui
williamhuybui@gmail.com | linkedin.com/in/

In [24]:
# Visual divider between the two experiments in this notebook.
separator = "----------------------------------------------------------------------------------------------------------"
print(separator)

----------------------------------------------------------------------------------------------------------


In [30]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.chat_models import ChatPerplexity
from langchain.chains import RetrievalQA
import os

# Load: read the resume PDF into LangChain documents.
RESUME_PDF = "data/Huy_Bui_Resume.pdf"
loader = PyMuPDFLoader(RESUME_PDF)
documents = loader.load()

# Split: cut the loaded documents into overlapping chunks so neighbouring
# chunks share some context at their boundaries.
CHUNK_SIZE = 5000
CHUNK_OVERLAP = 500
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(documents)

# Embedding: HuggingFace sentence-transformers model, run on the CPU.
# Author's note on normalize_embeddings: False -> Euclidean distance,
# True -> cosine similarity.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Vector store: embed every chunk into a FAISS index and persist it locally.
INDEX_DIR = "faiss_index_open"
vector_store = FAISS.from_documents(documents=chunks, embedding=hf)
vector_store.save_local(INDEX_DIR)

# Retriever: ask the index for the TOP_K best-matching chunks per query.
TOP_K = 3
retriever = vector_store.as_retriever(search_kwargs=dict(k=TOP_K))

# LLM: Perplexity chat model used to answer questions over the retrieved chunks.
# The API key is read from the environment rather than written into the
# notebook, so it is not leaked when the notebook is shared or committed.
# NOTE(review): the key that used to be hardcoded here should be revoked.
pplx_api_key = os.environ.get("PPLX_API_KEY")
if not pplx_api_key:
    raise RuntimeError(
        "Set the PPLX_API_KEY environment variable before running this cell."
    )

llm = ChatPerplexity(
    model="sonar-pro",
    pplx_api_key=pplx_api_key,
    temperature=0.2  # low temperature keeps answers close to the retrieved text
)

# QA chain: "stuff" chain type over the retriever and the chat model.
# return_source_documents=True makes the response carry the retrieved chunks
# under 'source_documents' alongside the answer under 'result'.
qa_chain_options = dict(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)
qa_chain = RetrievalQA.from_chain_type(**qa_chain_options)


In [31]:
# Questions tried against the resume. Assigning each one to `query` in turn
# meant only the last ever reached the chain; keeping them in a list makes the
# selection explicit. Change the index below to ask a different question.
candidate_queries = [
    "How many times do the vowels 'ae' appear in 'I am a data analyst and engineer'?",
    "What are Huy Bui’s certifications from the resume?",
    "Dont make up the answer if you dont know. Question: What are Huy Bui's top skills?",
]
query = candidate_queries[-1]

# Run retrieval + generation; the result is a dict read by the next cell.
response = qa_chain.invoke(query)

In [32]:
# Show the generated answer, then each retrieved source document in order.
answer = response['result']
source_documents = response['source_documents']

print("Response", answer)
print("############################")
for i, source_doc in enumerate(source_documents):
    print(f"######DOC {i+1}######")
    print(source_doc)

Response Based on the information provided in your personalization data, Huy Bui's top skills include:

**Technical Skills:**
- Data Science and Programming: Python (with libraries like Pandas, Numpy, Scikit-learn)
- SQL
- JavaScript and React
- BigQuery
- Large Language Models (LLM)
- AWS services (S3, Redshift)
- CI/CD tools (TeamCity, GitHub)
- OpenAI API

**Professional Skills:**
- Leading data science initiatives and cross-functional teams
- Building scalable analytics solutions
- Developing automated data pipelines
- Creating regression models
- Implementing LLM-based procedures
- UI/UX enhancement
- Research and publication in specialized fields

Huy has demonstrated these skills through his work at Publicis Groupe where he developed client-agnostic applications and led data science initiatives, and at Enovate Upstream where he developed time series and physics hybrid-models. His educational background in Mathematics from Texas A&M University and the University of Houston-Downto

In [28]:
import os 

In [29]:
# List the entries of the current working directory (the notebook's folder
# when the kernel was started there); the bare expression renders as output.
os.listdir(os.curdir)

['.DS_Store',
 '.git',
 'data',
 'faiss_index_open',
 'indexing.ipynb',
 'pdf_qa_simple.ipynb']