In [1]:
# Importing Libraries
import os
import pandas as pd

# OpenAI LLM
from langchain_openai import ChatOpenAI

# Document Loading
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Environment vaiables
from dotenv import load_dotenv

In [2]:
# Load your API key from a .env file
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")

In [3]:
# Loading PDF
pdf_path = 'Resources/Resume - Tin Pham - Dec 2024.pdf'

loader = PyPDFLoader(pdf_path)
pages = loader.load()

Ignoring wrong pointing object 118 0 (offset 0)


In [4]:
# Split the text into management chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, # This ensures each chunk is within the token limit for GPT models
    chunk_overlap = 50
)

chunks = splitter.split_documents(pages)

In [5]:
# Convert chunks into a Pandas Dataframe
df_chunks = pd.DataFrame([{
    'chunk_id': i,
    'content': chunk.page_content.strip()
} for i, chunk in enumerate(chunks)])

df_chunks.head()

Unnamed: 0,chunk_id,content
0,0,"617 Longhorn Cavern Rd., Leander, TX 786..."
1,1,▪ \n \nCollaborated\n \nextensively\n \nwith\n...
2,2,July 2019 – February 2022 Goog...
3,3,processes\n \nwith\n the extended Sales tea...
4,4,accounts\n \nin\n \nSouth\n \nCentral\n \nfor\...


In [6]:
# Saving DF to CSV
df_chunks.to_csv('pdf_chunks.csv', index=False)

In [7]:
# Initialize the Chat Model
llm = ChatOpenAI(openai_api_key = openai_key, temperature = 0)

In [None]:
# Import and Create a Vector Store for Retrieval
# This gives us a searchable knowledge base made from your resume or report.
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Create text list
texts = [chunk.page_content for chunk in chunks]

# Create embeddings and retriever
embedding_model = OpenAIEmbeddings(openai_api_key = openai_key)
vectorstore = FAISS.from_texts(texts, embedding_model)
retriever = vectorstore.as_retriever()

In [9]:
# Set Up Retrieval-Based QA Chain
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    retriever = retriever,
    chain_type = 'stuff'
)

In [13]:
# Ask a Question
response = qa_chain.invoke("Can you give me a timeline of Tin's experience?")
print(response)

{'query': "Can you give me a timeline of Tin's experience?", 'result': "Sure, here is a timeline of Tin Pham's experience based on the provided context:\n\n1. June 2012 – September 2012: Sales Consultant Extraordinaire at WP Engine in Austin, TX.\n2. September 2012 – June 2013: Manager, Customer Experience at WP Engine in Austin, TX.\n3. February 2022 – November 2024: Strategic Account Manager at DoiT International in Austin, TX.\n\nI hope this helps!"}
