In [10]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import os

## Embeddings Creation

In [2]:
# Location of the source PDF. It can be overridden with the PUMS_PDF_PATH
# environment variable so the notebook is not tied to a single machine; the
# default is the original hard-coded location.
pdf_path = os.environ.get(
    "PUMS_PDF_PATH",
    "C:\\Users\\User\\PycharmProjects\\pums\\PUMS_Data_Dictionary_2018-2022.pdf",
)
# Directory where the Chroma index is persisted.
persist_directory = "./pums_vectorstore"

# load() yields one Document per PDF page. load_and_split() would already chunk
# the text with a default splitter, and the explicit splitter below would then
# split those chunks a second time.
loader = PyPDFLoader(pdf_path)
pages = loader.load()

# 📄 Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(pages)

# 🧠 Embed into ChromaDB
# Chroma.from_documents adds to whatever is already stored in persist_directory,
# so re-running this cell would embed (and pay for) every chunk again and leave
# duplicate entries in the index. Build the index only when none exists on disk;
# delete the directory to force a rebuild (e.g. after changing the chunking).
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectorstore = Chroma(
        embedding_function=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )
else:
    vectorstore = Chroma.from_documents(
        docs,
        embedding=OpenAIEmbeddings(),
        persist_directory=persist_directory,
    )

  vectorstore = Chroma.from_documents(docs, embedding=OpenAIEmbeddings(), persist_directory="./pums_vectorstore")


## Load the embeddings

In [9]:
# Re-open the Chroma index stored under ./pums_vectorstore, with
# OpenAIEmbeddings as its embedding function.
vectorstore = Chroma(
    persist_directory="./pums_vectorstore",
    embedding_function=OpenAIEmbeddings(),
)

# Retriever built with the vector store's default search settings.
retriever = vectorstore.as_retriever()

  vectorstore.persist()


In [11]:
# Similarity-search retriever configured with k=20 (documents per query).
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 20},
    search_type="similarity",
)

In [7]:
# RetrievalQA chain: a GPT-4 chat model answers using `retriever`, and the
# source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    return_source_documents=True,
    llm=ChatOpenAI(model="gpt-4"),  # or gpt-3.5-turbo
)

# 🧪 Ask a question
query = "What does Occupation mean?"
result = qa_chain(dict(query=query))

# The answer text lives under the "result" key of the chain output.
print("Answer:", result["result"], sep="\n")

Answer:
In this context, occupation refers to the job or profession that a person has. It is often classified using Standard Occupational Classification (SOC) codes or Occupation Recode (OCC) codes, which categorize different types of jobs. For example, the code "1110XX" or "0010" refers to a manager who is a chief executive or legislator, while "112011" or "0040" refers to a manager in advertising and promotions.


In [8]:
# Ask which data-dictionary fields relate to income.
query = "What are the fields that can indicate income?"
result = qa_chain(dict(query=query))

# The answer text lives under the "result" key of the chain output.
print("Answer:", result["result"], sep="\n")

Answer:
The fields that can indicate income are:

1. HINCP: Household income (past 12 months)
2. OIP: All other income past 12 months
3. PAP: Public assistance income past 12 months
4. INTP: Interest, dividends, and net rental income past 12 months
5. SEMP: Self-employment income past 12 months
6. SSIP: Supplementary Security Income past 12 months
7. SSP: Social Security income past 12 months
8. WAGP: Wages or salary income past 12 months
9. EARN: Total person's earnings 
10. PINCP: Total person's income.


In [12]:
# Ask which fields need a manual adjustment.
query = "What are the fields that I need to manual adjustment?"
result = qa_chain(dict(query=query))

# The answer text lives under the "result" key of the chain output.
print("Answer:", result["result"], sep="\n")

Answer:
Based on the provided context, the fields that require manual adjustment are:

1. SEMP (Self-employment income past 12 months) - Use ADJINC to adjust SEMP to constant dollars.
2. SSIP (Supplementary Security Income past 12 months) - Use ADJINC to adjust SSIP to constant dollars.
3. TAXAMT (Property taxes - yearly real estate taxes) - Use ADJHSG to adjust TAXAMT to constant dollars.
4. MHP (Mobile home costs - yearly amount) - Use ADJHSG to adjust MHP to constant dollars.
