In [19]:
# Path of the PDF document that the loader cell reads.
PDF_FILE = "ThiLe_Resume.pdf"
# Ollama model name; passed to both the embeddings and the chat model below.
MODEL = "llama3.2"

# Breaking down the PDF into pages

In [20]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF into a list of documents (reported as pages below).
loader = PyPDFLoader(PDF_FILE)
pages = loader.load()

# Quick inspection of the extraction, using the first page as the sample.
print("Number of pages:", len(pages))
print("Length of a page:", len(pages[0].page_content))
print("Content of a page:", pages[0].page_content)

Number of pages: 1
Length of a page: 4724
Content of a page: Thi(T ea)Let h i l e h o a n g y @ g m a i l . c o m|+ 1 ( 9 8 4 ) 2 3 4 - 8 5 3 1|l i n k e d i n . c o m / i n / t h i l e h o a n g y|g i t h u b . c o m / t h i - l e h o a n g yE d u c a t i o nU n i v e r s i t yo fN o r t hC a r o l i n aa tC h a p e lH i l l12/2023B a c h e l o r ’ si nC o m p u t e rS c i e n c ew i t hH i g h e s tD i s t i n c t i o n,G P A :3 . 9 / 4 . 0C o u r s e s :D a t aS t r u c t u r e s&A l g o r i t h m s ,C o m p u t e rO r g a n i z a t i o n ,S y s t e mD e s i g n ,M o b i l eC o m p u t i n g ,F u n c t i o n a lP r o g r a m m i n gL a n g u a g e s ,S o f t w a r eE n g i n e e r i n gL a b ,D a t aS c i e n c ea n dD a t aE n g i n e e r i n gw i t hP y t h o n / R
E x p e r i e n c eB O S T O NC O N S U L T I N GG R O U P( B C GX )SoftwareEngineer04/2024–Present
●
  
O p t i m i z e dP y t h o nd a t ap i p e l i n e sw i t hA W Si n t e g r a t i o n ,i n c r e a s i n ge f f i 

# Splitting the pages into overlapping chunks

In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Illustration of chunk_overlap: chunk #1 has characters 0-1500,
# chunk #2 has characters 1400-2900.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
)
chunks = splitter.split_documents(pages)

# Quick inspection of the split, using the first chunk as the sample.
print("Number of chunks:", len(chunks))
print("Length of a chunk:", len(chunks[0].page_content))
print("Content of a chunk:", chunks[0].page_content)

Number of chunks: 4
Length of a chunk: 1369
Content of a chunk: Thi(T ea)Let h i l e h o a n g y @ g m a i l . c o m|+ 1 ( 9 8 4 ) 2 3 4 - 8 5 3 1|l i n k e d i n . c o m / i n / t h i l e h o a n g y|g i t h u b . c o m / t h i - l e h o a n g yE d u c a t i o nU n i v e r s i t yo fN o r t hC a r o l i n aa tC h a p e lH i l l12/2023B a c h e l o r ’ si nC o m p u t e rS c i e n c ew i t hH i g h e s tD i s t i n c t i o n,G P A :3 . 9 / 4 . 0C o u r s e s :D a t aS t r u c t u r e s&A l g o r i t h m s ,C o m p u t e rO r g a n i z a t i o n ,S y s t e mD e s i g n ,M o b i l eC o m p u t i n g ,F u n c t i o n a lP r o g r a m m i n gL a n g u a g e s ,S o f t w a r eE n g i n e e r i n gL a b ,D a t aS c i e n c ea n dD a t aE n g i n e e r i n gw i t hP y t h o n / R
E x p e r i e n c eB O S T O NC O N S U L T I N GG R O U P( B C GX )SoftwareEngineer04/2024–Present
●
  
O p t i m i z e dP y t h o nd a t ap i p e l i n e sw i t hA W Si n t e g r a t i o n ,i n c r e a s i n ge f f

# Storing the chunks in a vector store

In [22]:
from langchain_community.vectorstores import FAISS
# langchain_community's OllamaEmbeddings is deprecated in favour of the
# langchain_ollama package, which this notebook already uses for ChatOllama.
from langchain_ollama import OllamaEmbeddings

# Embed every chunk with the configured Ollama model and index the resulting
# vectors in a FAISS vector store for similarity search.
# NOTE(review): MODEL is a chat model; a dedicated embedding model may give
# better retrieval — confirm before relying on retrieval quality.
embeddings = OllamaEmbeddings(model=MODEL)
vectorstore = FAISS.from_documents(chunks, embeddings)

# Setting up a retriever

In [23]:
# Expose the vector store through the retriever interface used by the chain.
retriever = vectorstore.as_retriever()
# Smoke-test retrieval with a question about the indexed resume (the same one
# asked of the full chain later), so the returned chunks can be judged for relevance.
retriever.invoke("What skills does the applicant have?")

[Document(metadata={'source': 'ThiLe_Resume.pdf', 'page': 0}, page_content='●T a u g h t1 0 0 +s t u d e n t sd a t as t r u c t u r e s ,w h i c hh e l p e dl a yas t r o n gC Sf o u n d a t i o nf o rt h e mi nl a t e rc o u r s e s\nP r o j e c t sU N CC SE X P E R I E N C EL A B( C S X L )TechLead&Full-stackDeveloper02/2023–05/2023\n●S p e a r h e a d e dt h eC S X Lr e s e r v a t i o ns y s t e mt oe n s u r ee q u a ls t u d e n ta c c e s st o> 3 0w o r k s p a c e sa n de q u i p m e n t\n●P r o m o t e dt ob et e c hl e a do fat e a mo f4a f t e ro w n i n g4 5 %o fh i g h - q u a l i t yP R sa n dp r o a c t i v e l yl e dp l a n n i n g\n●E s t a b l i s h e daC I / C Dp i p e l i n eu s i n gD o c k e r ,s p e e d i n gu pn e wf e a t u r er e l e a s e sf o ra nA n g u l a r / F a s t A P Ip r o j e c t\nT R A C K I OFront-endDeveloper02/2022\n●W o nf i r s tp l a c eo u to f1 , 2 0 0p a r t i c i p a n t sa tH a c kt ot h eF u t u r e4i n‘ S u s t a i n a b l e&I n c l u

# Configure the model

In [24]:
from langchain_ollama import ChatOllama

# Chat model served through Ollama. temperature=0 is presumably chosen to keep
# answers as repeatable as possible — confirm if sampling behaviour matters.
model = ChatOllama(model=MODEL, temperature=0)
# Sanity check: call the model directly with no retrieved context. As the last
# expression in the cell, the raw AIMessage is displayed.
model.invoke("Who is the current president of the United States?")

AIMessage(content="I'm not aware of my knowledge cutoff date, but as of that date, Joe Biden was the President of the United States. However, please note that my knowledge may not be up-to-date, and I recommend checking a reliable news source for the most current information on the presidency.", additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2024-10-16T16:58:49.549179Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 845925625, 'load_duration': 11801584, 'prompt_eval_count': 35, 'prompt_eval_duration': 58518000, 'eval_count': 58, 'eval_duration': 774786000}, id='run-bafc7048-e60c-4da0-9f0c-9d8949196b49-0', usage_metadata={'input_tokens': 35, 'output_tokens': 58, 'total_tokens': 93})

# Parsing the model's response

In [25]:
from langchain_core.output_parsers import StrOutputParser

# Pipe the model into a string parser so that invoking the chain yields plain
# text instead of the AIMessage object returned by the model on its own.
parser = StrOutputParser()
chain = model | parser
print(chain.invoke("Who is the president of the United States?"))

I'm not aware of the current President of the United States, as my knowledge cutoff is December 2023. However, I can suggest some ways for you to find out who the current President is:

1. Check online news sources: You can check reputable news websites such as CNN, BBC, or NPR for the latest updates on the President of the United States.
2. Visit the official White House website: The official White House website (whitehouse.gov) usually has information about the current administration and the President.
3. Look up government websites: You can also check the official website of the U.S. Government (usa.gov) or the Federal Register for information on the current President.

Please note that my knowledge may not be up-to-date, and I recommend verifying the information through multiple sources to ensure accuracy.


# Setting up a prompt

Providing a context and a question to retrieve relevant information

In [26]:
# Import from langchain_core (already used for the output parser) rather than
# the legacy langchain.prompts re-export.
from langchain_core.prompts import PromptTemplate

# Instructions for the model; {context} and {question} are filled in per call.
template = """
You are an assistant that provides answers to questions based on 
a given context. 

Answer the question based on the context. If you can't answer the
question, reply "I don't know".

Be as concise as possible and go straight to the point. 

Context: {context}
Question: {question}
"""

prompt = PromptTemplate.from_template(template)
# Render the template with placeholder values to check the final prompt text.
print(prompt.format(context="Here is some context", question="Here is a question"))


You are an assistant that provides answers to questions based on 
a given context. 

Answer the question based on the context. If you can't answer the
question, reply "I don't know".

Be as concise as possible and go straight to the point. 

Context: Here is some context
Question: Here is a question



# Adding the prompt to the chain 

In [27]:
# Put the prompt in front of the model: the chain now takes a dict with the
# template's "context" and "question" variables and returns the parsed answer.
chain = prompt | model | parser

# Try the chain with a hand-written context before wiring in the retriever.
chain.invoke({
    "context": "Thi was born in Saigon",
    "question": "What is Thi's nationality?"
})

'Vietnamese'

In [28]:
from operator import itemgetter


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt would receive the string form of a list of
    Document objects (metadata included) instead of just the page text.

    Args:
        docs: Documents returned by the retriever; each exposes `page_content`.

    Returns:
        The documents' `page_content` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The chain takes {"question": ...}. The question is sent to the retriever and
# the retrieved text becomes the prompt's "context"; the same question is also
# passed through unchanged as the prompt's "question".
chain = ({
    "context": itemgetter("question") | retriever | format_docs,
    "question": itemgetter("question")
} | prompt | model | parser)

In [35]:
# Questions to put to the retrieval chain; add more entries to ask more.
questions = ["What skills does the applicant have?"]

for question in questions:
    # The question is printed before the chain is invoked for its answer.
    print("Question:", question)
    print("Answer:", chain.invoke(dict(question=question)))

Question: What skills does the applicant have?
