# In [1]:
import os
import json
import pdfplumber
from pymongo import MongoClient
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.embeddings import OpenAIEmbeddings

# Provide the OpenAI API key. A key already exported in the environment takes
# precedence; the placeholder below is only a fallback and must be replaced
# (or the variable exported) before the pipeline can call OpenAI.
os.environ.setdefault("OPENAI_API_KEY", "your_key")

# Initialize MongoDB client
def init_mongo(uri="mongodb://localhost:27017/", db_name="documentDB",
               collection_name="extracted_info"):
    """Return the MongoDB collection used to store extracted profiles.

    Args:
        uri: MongoDB connection string passed to ``MongoClient``.
        db_name: Name of the database to select on the client.
        collection_name: Name of the collection to select in that database.

    Returns:
        The object obtained from ``client[db_name][collection_name]``.
        Calling with no arguments selects ``documentDB.extracted_info`` on
        ``mongodb://localhost:27017/``.
    """
    client = MongoClient(uri)
    return client[db_name][collection_name]

# Load all PDF files from a folder and return list of dicts
def load_pdf_documents(folder_path):
    """Read the text of every PDF file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended into.

    Returns:
        A list with one dict per PDF, ordered by filename. Each dict has:
        ``filename`` (the bare file name), ``content`` (text of all pages
        joined with newlines) and ``raw_text`` (the same text, kept for
        traceability).

    Raises:
        OSError: If ``folder_path`` cannot be listed.
        Any error raised by ``pdfplumber`` while opening or reading a file
        propagates to the caller.
    """
    documents = []
    # os.listdir makes no ordering promise; sort so runs are reproducible.
    for filename in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, filename)
        # Compare the extension case-insensitively so "CV.PDF" is picked up,
        # and skip directories whose name merely ends in ".pdf".
        if not filename.lower().endswith(".pdf") or not os.path.isfile(path):
            continue
        with pdfplumber.open(path) as pdf:
            # A falsy extract_text() result is treated as an empty page.
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)
        documents.append({
            "filename": filename,
            "content": text,
            "raw_text": text,  # For future reference/traceability
        })
    return documents

# Build the LangChain extraction chain and the embeddings client
def init_llm_tools():
    """Create the objects used to extract and embed document text.

    Returns:
        A ``(chain, embeddings)`` tuple: an ``LLMChain`` that fills the
        extraction prompt's ``{text}`` slot and sends it to a
        ``ChatOpenAI`` model created with ``temperature=0``, and an
        ``OpenAIEmbeddings`` instance.
    """
    # The prompt asks for name, city, phone, email and skills as one JSON
    # object; doubled braces are literal braces in the template.
    template_text = """
    Extract the following details from the text:
    - Name
    - City
    - Phone Number
    - Email Address
    - Skills (comma-separated list)

    Text:
    {text}

    Output format (JSON):
    {{"name": "", "city": "", "phone": "", "email": "", "skills": ""}}
    """
    chat_model = ChatOpenAI(temperature=0)
    embedder = OpenAIEmbeddings()
    extraction_chain = LLMChain(
        llm=chat_model,
        prompt=PromptTemplate.from_template(template_text),
    )
    return extraction_chain, embedder

# Helpers for process_document
def _parse_llm_json(raw):
    """Return the JSON object contained in an LLM reply as a dict."""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The model may wrap the JSON in a Markdown code fence or add prose
        # around it; retry on the outermost {...} span before giving up.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(raw[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def _as_embedding_text(value):
    """Render an extracted field as plain text suitable for embedding."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value)
    return str(value)


# Extract structured info using LLM and generate embeddings
def process_document(doc, chain, embeddings):
    """Extract profile fields from one document and attach embeddings.

    Args:
        doc: Dict with ``filename``, ``content`` and ``raw_text`` keys.
        chain: Object whose ``run({"text": ...})`` returns the model's reply
            as a string containing a JSON object.
        embeddings: Object whose ``embed_query(text)`` returns an embedding.

    Returns:
        The parsed JSON object as a dict, extended with ``filename``,
        ``content``, ``raw_text``, ``full_pdf_embedding`` (embedding of the
        full content) and ``city_skill_embedding`` (embedding of
        ``"<city>, <skills>"``). Returns ``None`` if any step fails; the
        failure is printed and not raised.
    """
    try:
        # Step 1: Run LLM to extract metadata
        result = chain.run({"text": doc["content"]})
        extracted = _parse_llm_json(result)

        # Step 2: Add metadata to doc
        extracted.update({
            "filename": doc["filename"],
            "content": doc["content"],
            "raw_text": doc["raw_text"],
        })

        # Step 3: Embeddings
        extracted["full_pdf_embedding"] = embeddings.embed_query(doc["content"])
        # Normalise missing/null values and lists so the embedded text never
        # contains "None" or a Python list repr.
        city_text = _as_embedding_text(extracted.get("city"))
        skills_text = _as_embedding_text(extracted.get("skills"))
        city_skill_string = f"{city_text}, {skills_text}"
        extracted["city_skill_embedding"] = embeddings.embed_query(city_skill_string)

        return extracted
    except Exception as e:
        # Best-effort: one bad document must not stop the whole batch.
        print(f"Failed to process {doc['filename']}: {e}")
        return None

# Insert processed document into MongoDB
def insert_into_mongo(collection, processed_doc):
    """Insert one processed document and print a confirmation line.

    Args:
        collection: Object exposing ``insert_one(document)``.
        processed_doc: Dict to store. A falsy value (``None`` or an empty
            dict) is ignored and nothing is inserted.
    """
    if not processed_doc:
        return
    collection.insert_one(processed_doc)
    # The extracted "name" may be absent or empty; fall back to the filename
    # so a successful insert never ends in a KeyError.
    label = processed_doc.get("name") or processed_doc.get("filename", "<unnamed>")
    print(f"Inserted: {label}")

# End-to-end pipeline: load PDFs, extract fields, store results
def run_pipeline(folder_path):
    """Process every PDF in ``folder_path`` and store the results in MongoDB.

    Opens the target collection, loads the documents, builds the LLM tools,
    then passes each document's processed result to ``insert_into_mongo``.

    Args:
        folder_path: Directory handed to ``load_pdf_documents``.
    """
    target_collection = init_mongo()
    pdf_docs = load_pdf_documents(folder_path)
    extraction_chain, embedder = init_llm_tools()

    for pdf_doc in pdf_docs:
        insert_into_mongo(
            target_collection,
            process_document(pdf_doc, extraction_chain, embedder),
        )
# Run the pipeline only when this file is executed as a script, not on import
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path; point it at the folder
    # that holds the PDFs to ingest before running elsewhere.
    folder = "C:/Users/tksen/Desktop/Agents/short_profile/"
    run_pipeline(folder)