In [170]:
import os
import requests
import fitz
from langchain.document_loaders import UnstructuredPowerPointLoader, Docx2txtLoader
import sys
sys.path.append("../../../")
from models.models import Document

In [171]:
# Base URL of the HTTP endpoint that requests are sent to.
endpoint_url = 'http://localhost:3333'

# The token comes from the environment so it never lives in the notebook.
# NOTE: when BEARER_TOKEN is unset, os.getenv returns None and the header
# below becomes the literal string "Bearer None".
BEARER_TOKEN = os.getenv("BEARER_TOKEN")

# Authorization header attached to every request.
headers = {}
headers["Authorization"] = f"Bearer {BEARER_TOKEN}"

In [172]:
def load_file(file_path: str) -> Document:
    """Extract the text of a local file and wrap it in a ``Document``.

    The loader is picked from the file extension, matched case-insensitively:
    PowerPoint files use ``UnstructuredPowerPointLoader``, Word files use
    ``Docx2txtLoader`` and PDFs are read with PyMuPDF (``fitz``).

    Args:
        file_path (str): Path to a ``.ppt``/``.pptx``, ``.doc``/``.docx`` or
            ``.pdf`` file.

    Raises:
        ValueError: If the extension is not one of the supported types.

    Returns:
        Document: The extracted text with ``metadata={"filename": <basename>}``.
            For PDFs, the text of every page is followed by the marker
            ``--PAGE_SPLITTER--``.
    """
    ## TODO: get filename, source, source_id, document_id, created_at, blob_url, contact_person, summary
    filename = os.path.basename(file_path)
    # Compare against a lower-cased copy so "REPORT.PDF" is accepted too.
    lowered_path = file_path.lower()

    # NOTE(review): legacy ".ppt"/".doc" are routed to the same loaders as the
    # XML-based formats; confirm those loaders actually support them.
    if lowered_path.endswith((".pptx", ".ppt")):
        loader = UnstructuredPowerPointLoader(file_path)
        # Only the first element returned by the loader is used.
        text = loader.load()[0].page_content
    elif lowered_path.endswith((".docx", ".doc")):
        loader = Docx2txtLoader(file_path)
        text = loader.load()[0].page_content
    elif lowered_path.endswith(".pdf"):
        page_splitter = "--PAGE_SPLITTER--"
        # The context manager closes the PDF handle even if extraction fails.
        with fitz.open(file_path) as doc:
            # Join once instead of growing a string page by page.
            text = "".join(
                page.get_text("text", sort=True) + page_splitter for page in doc
            )
    else:
        raise ValueError("Unsupported file type")

    return Document(text=text, metadata={"filename": filename})
    
  

In [173]:
# Load one sample per supported format; the generator keeps the call order
# (PDF, PowerPoint, Word) and leaves no loop variable behind in the kernel.
pdf_doc, ppt_doc, docx_doc = (
    load_file(sample_path)
    for sample_path in (
        "../../../data/pptexamples.pdf",
        "../../../data/pptexamples.pptx",
        "../../../data/docexample.docx",
    )
)

In [175]:
# Send the three loaded documents to the /upsert endpoint in one request.
# The payload previously listed docx_doc twice and never sent ppt_doc.
response = requests.post(
    f"{endpoint_url}/upsert",
    headers=headers,
    json={
        "documents": [pdf_doc.dict(), ppt_doc.dict(), docx_doc.dict()]
    }
)
# Raise on any 4xx/5xx status so a failed upsert is not silently ignored.
response.raise_for_status()