In [1]:
import os
import pandas as pd
from langchain_core.documents import Document
from langchain_text_splitters  import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv
from langchain_community.document_loaders import CSVLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
load_dotenv(override=True)


# 1) Load & sanitize
df = pd.read_csv("joblist_clean_for_rag.csv")
df = df.fillna("").reset_index(drop=False).rename(columns={"index": "doc_id"})
df["doc_id"] = df["doc_id"].astype(str)

def row_to_doc(row):
    content = (
        f"Title: {row['title']}\n"
        f"Company: {row['company']}\n"
        f"Location: {row['location']}\n"
        f"Remote: {row['remote']}\n"
        f"Department: {row['department']}\n"
        f"Description:\n{row['description']}"
    ).strip()
    metadata = {
        "doc_id": row["doc_id"],
        "title": row["title"],
        "company": row["company"],
        "location": row["location"],
        "remote": row["remote"],
        "department": row["department"],
    }
    return Document(page_content=content, metadata=metadata)

base_docs = [row_to_doc(r) for _, r in df.iterrows()]
print(f"Loaded {len(base_docs)} base docs")

# 2) Chunk (so each embed is small)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,     # ~800–1200 chars is usually fine
    chunk_overlap=150,   # small overlap for context continuity
    separators=["\n\n", "\n", " ", ""],
)

chunked_docs = []
for d in base_docs:
    for i, ch in enumerate(splitter.split_text(d.page_content)):
        md = dict(d.metadata)
        md["chunk"] = i
        chunked_docs.append(Document(page_content=ch, metadata=md))

print(f"Prepared {len(chunked_docs)} chunks")

# 3) Embeddings (use the small model to cut cost/size)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
#embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")
# 4) Chroma store
vector_store = Chroma(
    collection_name="jobs_rag",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",
)

# 5) Batch add to avoid per-request token overflow
BATCH_SIZE = 100  # adjust if you still hit limits; smaller => safer
ids = []
for i in range(0, len(chunked_docs), BATCH_SIZE):
    batch = chunked_docs[i:i+BATCH_SIZE]
    ids.extend(vector_store.add_documents(batch))
    # optional: print progress
    # print(f"Indexed {i+len(batch)}/{len(chunked_docs)}")

print("Example IDs:", ids[:3])
print("Done indexing.")


  from .autonotebook import tqdm as notebook_tqdm


Loaded 2103 base docs
Prepared 12684 chunks
Example IDs: ['4188f152-ff0c-432f-b179-5bd032c38916', '96864f78-1b7d-451e-95e2-367b7f092fb2', '26eddbca-3d25-4127-aebd-532aabf6c81a']
Done indexing.


In [3]:
# 6) Retrieval example
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

query = "senior Python backend engineer with experience in low-latency trading"
results = retriever.invoke(query)
for r in results:
    print("-----")
    print(r.metadata.get("doc_id"), "|", r.metadata.get("title"), "|", r.metadata.get("company"), "|", r.metadata.get("location"))
    print(r.page_content[:400], "...")


-----
373 | Staff Backend Engineer (Python), AI Engineering: Agent Foundations  | gitlab | Remote, EMEA
Multiple years of experience building Python Web Services in a multi-service environment ( e.g. FastAPI, Starlette)
Proven ability to architect and lead complex technical projects from concept to production across distributed systems
Strong experience with AI/ML technologies, including integration with large language models and understanding of prompt engineering
Deep expertise in performance opti ...
-----
1667 | Software Engineer - Backend & Scalability | datadog | Paris, France
What You’ll Do: 
Build and maintain reliable backend services that process and store large volumes of data
Contribute to internal platforms, APIs, and tools that power distributed products
Collaborate with experienced engineers to design and implement scalable system components
Ship production code in languages like Go, Python, or Java
Learn and apply distributed system patterns through hands-on d ...
-----

In [6]:
# Index df by doc_id for fast lookup

from langchain.agents import create_agent
agent = create_agent(
        model="google_genai:gemini-2.5-flash-lite",
        system_prompt="""I will give you a job description, please quickly summarize this job in 200 words,reply like this in json format:
         Company:
         Job Title:
         Remote:yes/not
         Decription:
         Requirements:   
         Email:
         """
    )
df_by_id = df.set_index("doc_id", drop=False)

def format_full_row(row):
    return (
        f"Title: {row['title']}\n"
        f"Company: {row['company']}\n"
        f"Location: {row['location']}\n"
        f"Remote: {row['remote']}\n"
        f"Department: {row['department']}\n"
        f"Description:\n{row['description']}"
    ).strip()

def retrieve_full_jobs_by_id(query, k=5):
    chunks = retriever.invoke(query)
    seen = set()
    full = []
    for ch in chunks:
        did = ch.metadata["doc_id"]
        if did in seen:
            continue
        seen.add(did)
        row = df_by_id.loc[did]
        full.append(format_full_row(row))
        if len(full) >= k:
            break
    return full

# Example
full_jobs = retrieve_full_jobs_by_id("oracle xstore pos rollout lead", k=3)
for job in full_jobs:
    result = agent.invoke(
    {"messages": [{"role": "user", "content": job}]})
    job_summary = result['messages'][-1].content
    print(job_summary)


```json
{
  "Company": "valtech",
  "Job Title": "Solution Architect - Xstore",
  "Remote": "not",
  "Description": "Valtech is seeking a Solution Architect with 10+ years of experience to drive the architecture of Oracle Xstore POS and its ecosystem. This role involves defining end-to-end store system architecture, integrating with various platforms (ERP, CRM, e-commerce), and providing technical leadership. The architect will translate business requirements into scalable, secure, and future-ready solutions, conduct Proofs of Concept, and lead localization strategies. They will also support pre-sales activities and mentor project teams, acting as a trusted advisor to clients on retail transformation initiatives. The position emphasizes a growth mindset, collaboration, and a passion for experience innovation.",
  "Requirements": "Bachelor's/Master's degree in Computer Science/Engineering or related field. 10-15 years IT experience with 5+ years as a Solution Architect in retail. Strong