# monitoring the monitoring tool

In [16]:
import os
import time
from io import BytesIO
from pathlib import Path
from urllib.request import urlopen

from docx import Document as DocxDocument
from opensearchpy import OpenSearch
from PyPDF2 import PdfReader
from watchdog.observers import Observer
# from watchdog.events import FileSystemEventHandler
# from db_sqlite import initialize_database, log_event_to_db
# from db_opensearch import generate_embeddings

In [17]:
# Name of the OpenSearch index that the cells below create, fill and query.
INDEX_NAME = "documents_test"

In [18]:
def load_credentials(file_path):
    """Read ``key=value`` pairs from a plain-text credentials file.

    Blank lines and lines starting with ``#`` are skipped. Every other line is
    split on the *first* ``=`` only, so values may themselves contain ``=``
    (common in passwords and tokens). Whitespace around keys and values is
    stripped. Lines without a key or without ``=`` are reported and skipped
    instead of aborting the whole read.

    Args:
        file_path: Path of the credentials file.

    Returns:
        dict mapping each key to its value (both ``str``). The dict is empty
        when the file cannot be read; a message is printed in that case.
    """
    credentials = {}
    try:
        with open(file_path, 'r') as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')
                key = key.strip()
                if not separator or not key:
                    # Report the line number only: the content may be a secret.
                    print(f"Skipping malformed credentials line {line_number}")
                    continue
                credentials[key] = value.strip()
    except (OSError, UnicodeError) as e:
        print(f"An error occurred while reading credentials: {e}")
    return credentials

def configure_opensearch_client(credentials):
    """Build an OpenSearch client from a credentials mapping.

    Args:
        credentials: dict that may contain ``'host'``, ``'port'``,
            ``'username'`` and ``'password'``. Host and port default to
            ``localhost`` and ``9200`` when absent.

    Returns:
        The configured ``OpenSearch`` client.

    Raises:
        ValueError: if the ``'port'`` entry cannot be converted by ``int()``.
    """
    host = credentials.get('host', 'localhost')
    port = int(credentials.get('port', 9200))
    username = credentials.get('username')
    password = credentials.get('password')
    # Send basic-auth only when both parts are present. A (None, None) tuple
    # (e.g. after the credentials file failed to load) is not a usable
    # credential pair, so fall back to an unauthenticated client instead.
    if username is not None and password is not None:
        auth = (username, password)
    else:
        auth = None

    client = OpenSearch(
        hosts=[{'host': host, 'port': port}],
        http_auth=auth,
        # TLS options, currently disabled:
        # use_ssl=True,
        # verify_certs=False,
        # ssl_show_warn=False
    )
    return client

# Read the connection settings from 'credentials.txt' (a relative path) and
# create the client object that the rest of the notebook uses.
credentials = load_credentials('credentials.txt')
opensearch_client = configure_opensearch_client(credentials)

In [19]:
# Index definition: "content" is mapped as text, "file_path" as keyword.
index_body = {
    "mappings": {
        "properties": {
            "content": {"type": "text"},
            "file_path": {"type": "keyword"},
        },
    },
}

In [20]:
# Start from a clean index. Use the shared INDEX_NAME constant rather than a
# repeated literal, and ignore a 404 so this cell also runs when the index
# does not exist yet (e.g. first run against a fresh cluster).
opensearch_client.indices.delete(index=INDEX_NAME, ignore=404)

{'acknowledged': True}

In [21]:
# Create the index with the mapping in index_body, unless an index with this
# name is already present.
if not opensearch_client.indices.exists(index=INDEX_NAME):
    opensearch_client.indices.create(index=INDEX_NAME, body=index_body)

In [22]:
def _read_pdf_text(source):
    """Return the concatenated text of every page of a PDF.

    ``source`` is handed to ``PdfReader`` unchanged (a file path or a binary
    stream). Pages whose ``extract_text()`` result is empty or ``None``
    contribute an empty string.
    """
    reader = PdfReader(source)
    return "".join(page.extract_text() or "" for page in reader.pages)


def extract_text_from_pdf(file_path):
    """Extract the text of a PDF file, with an in-memory fallback.

    The file is first opened by path. If that fails for any reason, the file
    is loaded completely through a ``file://`` URL and parsed from memory.

    Args:
        file_path: Path of the PDF file (relative or absolute).

    Returns:
        The extracted text, or ``""`` when both attempts fail. Failures are
        reported on stdout and never raised.
    """
    try:
        # First attempt: let PdfReader open the path directly.
        return _read_pdf_text(file_path)
    except Exception as e:
        print(f"Direct read failed for {file_path}: {e}")

    try:
        # Second attempt: read the whole file via a file:// URL.
        # Path.as_uri() percent-encodes the path, so names containing
        # characters such as '#', '%', '?' or spaces still resolve to the
        # right file (a hand-built "file://" + path string does not).
        pdf_url = Path(os.path.abspath(file_path)).as_uri()
        # The context manager closes the response even if read() fails.
        with urlopen(pdf_url) as response:
            pdf_bytes_stream = BytesIO(response.read())
        return _read_pdf_text(pdf_bytes_stream)
    except Exception as e:
        print(f"URL stream read failed for {file_path}: {e}")
        return ""


def extract_text_from_word(file_path):
    """Return the paragraph text of a Word document, one paragraph per line.

    Any error raised while opening or reading the document is reported on
    stdout and an empty string is returned instead.
    """
    try:
        paragraph_texts = [
            paragraph.text for paragraph in DocxDocument(file_path).paragraphs
        ]
        return "\n".join(paragraph_texts)
    except Exception as e:
        print(f"Error extracting text from Word document {file_path}: {e}")
        return ""

In [25]:
# Indexing function
def index_document(file_path):
    """Extract the text of one file and store it in the OpenSearch index.

    The extractor is chosen by file extension, compared case-insensitively so
    that e.g. ``REPORT.PDF`` is handled like ``report.pdf``. The document is
    stored in ``INDEX_NAME`` with the file path as its id, so indexing the
    same path again overwrites the earlier entry.

    Args:
        file_path: Path of the file to index; also used as the document id.

    Returns:
        None.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        content = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        content = extract_text_from_word(file_path)
    else:
        print(f"Unsupported file type: {file_path}")
        # NOTE(review): unsupported files are still indexed, with empty
        # content, exactly as before - confirm this is intended.
        content = ""

    # Embeddings are currently disabled:
    # vector = generate_embeddings(content)

    # Index the document in OpenSearch
    document = {
        "content": content,
        # "content_vector": vector,
        "file_path": file_path
    }
    doc_id = file_path
    opensearch_client.index(index=INDEX_NAME, id=doc_id, body=document)
    # print(f"indexed document: {file_path}")

In [24]:
def check_and_index_existing_documents(directory):
    """Index every file under ``directory`` that is not in the index yet.

    The tree is walked recursively. For each file the index is asked whether
    a document with that path as id exists; files that are missing from the
    index are passed to ``index_document``. An error on one file is printed
    and does not stop the walk.

    Args:
        directory: Root folder to scan.

    Returns:
        None.
    """
    if not os.path.isdir(directory):
        # os.walk() yields nothing for a missing directory, which would make
        # a mistyped path look like a successful, empty run.
        print(f"Directory not found: {directory}")
        return

    for root, _, files in os.walk(directory):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                # Ask only whether the document exists; fetching it with
                # get() would also return the stored text content.
                if opensearch_client.exists(index=INDEX_NAME, id=file_path):
                    print(f"Document already indexed: {file_path}")
                else:
                    # log_event_to_db("created", file_path)
                    index_document(file_path)
            except Exception as e:
                print(f"Error checking/indexing document {file_path}: {e}")

In [26]:
# Scan the sample document folder (a relative path) and index every file in
# it that is not in the index yet.
directory = "../documents/GDPRAllerlei_903"
check_and_index_existing_documents(directory)

Unsupported file type: ../documents/GDPRAllerlei_903/medic-update-checklist.xlsx
Unsupported file type: ../documents/GDPRAllerlei_903/EHR_WG_Calendar_for_20190916_WGM_Atlanta_20190813.xlsx
Unsupported file type: ../documents/GDPRAllerlei_903/membershipformupdatedjan2019.xlsx
Unsupported file type: ../documents/GDPRAllerlei_903/2019-20 Action Plan - Customers.xlsx


KeyboardInterrupt: 

## isolate PDF loading issue