# Monitoring the monitoring tool

In [1]:
import os
import time
from opensearchpy import OpenSearch
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
from db_sqlite import initialize_database, log_event_to_db
from db_opensearch import generate_embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Name of the OpenSearch index this notebook creates and writes documents to.
INDEX_NAME = "documents_test"

In [2]:
def load_credentials(file_path):
    """Read ``key=value`` pairs from a plain-text credentials file.

    Blank lines and lines starting with ``#`` are ignored. Only the first
    ``=`` on a line separates key from value, so a value (for example a
    password) may itself contain ``=``. Whitespace around keys and values is
    stripped. Lines without any ``=`` are reported by line number and skipped
    instead of aborting the whole load.

    Args:
        file_path: Path to the credentials file.

    Returns:
        dict mapping each key to its value (both ``str``). If the file cannot
        be read, the error is printed and the pairs parsed so far (possibly
        none) are returned.
    """
    credentials = {}
    try:
        with open(file_path, 'r') as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')
                if not separator:
                    # Report only the position: the line content may be a secret.
                    print(f"Skipping malformed credentials line {line_number} in {file_path}")
                    continue
                credentials[key.strip()] = value.strip()
    except (OSError, UnicodeError) as e:
        print(f"An error occurred while reading credentials: {e}")
    return credentials

def configure_opensearch_client(credentials):
    """Build an OpenSearch client from a credentials mapping.

    Args:
        credentials: Mapping that may contain ``host``, ``port``, ``username``
            and ``password``. ``host`` defaults to ``'localhost'`` and
            ``port`` to ``9200``; ``port`` may be a string and is converted
            with ``int``.

    Returns:
        An ``OpenSearch`` client for the configured host and port. Basic-auth
        credentials are passed only when both ``username`` and ``password``
        are present; otherwise the client is created without authentication.

    Raises:
        ValueError: If ``port`` cannot be converted to an integer.
    """
    host = credentials.get('host', 'localhost')
    port = int(credentials.get('port', 9200))
    username = credentials.get('username')
    password = credentials.get('password')

    # A tuple containing None is not a usable basic-auth pair, so fall back to
    # "no auth" unless both parts were actually configured.
    if username is not None and password is not None:
        auth = (username, password)
    else:
        auth = None

    client = OpenSearch(
        hosts=[{'host': host, 'port': port}],
        http_auth=auth,
        # TLS options are currently disabled; enable for an HTTPS cluster:
        # use_ssl=True,
        # verify_certs=False,
        # ssl_show_warn=False
    )
    return client

# Read connection settings from credentials.txt (path is relative to the
# working directory) and build the module-level client used by later cells.
credentials = load_credentials('credentials.txt')
opensearch_client = configure_opensearch_client(credentials)

In [6]:
# Index definition passed to indices.create: "content" holds the extracted
# document text as a full-text ("text") field, "file_path" is stored as an
# exact-match ("keyword") field.
index_body = {
    "mappings": {
        "properties": {
            "content": {"type": "text"},
            "file_path": {"type": "keyword"},
        }
    }
}

In [10]:
# Drop the index so it can be re-created from scratch. Uses INDEX_NAME rather
# than a repeated literal, and ignore=404 keeps the cell re-runnable when the
# index does not exist yet (e.g. on a fresh cluster).
opensearch_client.indices.delete(index=INDEX_NAME, ignore=404)

{'acknowledged': True}

In [11]:
# Create the index with the mapping in index_body unless it is already there.
index_is_missing = not opensearch_client.indices.exists(index=INDEX_NAME)
if index_is_missing:
    opensearch_client.indices.create(index=INDEX_NAME, body=index_body)

In [12]:
# Text extraction functions
def extract_text_from_pdf(file_path):
    """Return the text of a PDF file, one page per line-separated chunk.

    Pages are joined with a newline so the last word of one page is not fused
    with the first word of the next, matching how the Word extractor separates
    paragraphs. A page whose extraction yields nothing contributes an empty
    string instead of failing the whole document.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The extracted text, or ``""`` if the file could not be read. In the
        failure case the error is printed.
    """
    try:
        reader = PdfReader(file_path)
        # `or ""` tolerates pages for which extract_text() returns None.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        return "\n".join(page_texts)
    except Exception as e:
        print(f"Error extracting text from PDF {file_path}: {e}")
        return ""

def extract_text_from_word(file_path):
    """Return the text of a .docx file with paragraphs separated by newlines.

    Any exception raised while opening or reading the document is printed and
    an empty string is returned instead.
    """
    try:
        paragraph_texts = [
            paragraph.text for paragraph in DocxDocument(file_path).paragraphs
        ]
        return "\n".join(paragraph_texts)
    except Exception as error:
        print(f"Error extracting text from Word document {file_path}: {error}")
        return ""

In [13]:
def index_document(file_path):
    """Extract the text of a PDF or .docx file and index it in OpenSearch.

    The extension check is case-insensitive, so ``report.PDF`` is handled the
    same way as ``report.pdf``. The file path doubles as the document id in
    ``INDEX_NAME``. Files with any other extension are reported and skipped.

    Args:
        file_path: Path of the file to index.

    Returns:
        None.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        content = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        content = extract_text_from_word(file_path)
    else:
        print(f"Unsupported file type: {file_path}")
        return

    # Embedding generation is currently disabled, so no "content_vector"
    # field is sent along with the document.
    # vector = generate_embeddings(content)

    # Index the document in OpenSearch
    document = {
        "content": content,
        # "content_vector": vector,
        "file_path": file_path
    }
    doc_id = file_path
    opensearch_client.index(index=INDEX_NAME, id=doc_id, body=document)
    print(f"indexed document: {file_path}")

In [14]:
def check_and_index_existing_documents(directory):
    """Walk ``directory`` and index every .pdf/.docx file not yet in the index.

    For each supported file, the index is asked whether a document with the
    file path as id exists. Files that are missing are first logged as a
    ``"created"`` event via ``log_event_to_db`` and then indexed. Errors for a
    single file are printed and do not stop the walk.

    Args:
        directory: Root directory to scan recursively.

    Returns:
        None.
    """
    for root, _, files in os.walk(directory):
        for file_name in files:
            file_path = os.path.join(root, file_name)

            # NOTE: other file types are skipped here, although the intent is
            # to eventually see them in the index as well.
            if not file_path.endswith((".pdf", ".docx")):
                continue

            try:
                # exists() only asks whether the id is present; unlike get()
                # it does not transfer the stored document text.
                if opensearch_client.exists(index=INDEX_NAME, id=file_path):
                    print(f"Document already indexed: {file_path}")
                else:
                    log_event_to_db("created", file_path)
                    index_document(file_path)
            except Exception as e:
                print(f"Error checking/indexing document {file_path}: {e}")

In [15]:
# Index every supported file already present under the document folder
# (path is relative to the notebook's working directory).
directory = "../documents/GDPRAllerlei_903"
check_and_index_existing_documents(directory)

indexed document: ../documents/GDPRAllerlei_903/covid19theimpactofthehealthemergencyonitalianpriva.pdf
indexed document: ../documents/GDPRAllerlei_903/Privacy_Notice_AssurPharma_NL.pdf
indexed document: ../documents/GDPRAllerlei_903/Privacy Challenges Against FATCA Face Uphill Battle.pdf
indexed document: ../documents/GDPRAllerlei_903/AVG-infoboekje-Privacy-op-School.pdf
indexed document: ../documents/GDPRAllerlei_903/Binding Effects of the European General Data Protection Regulatio.pdf
indexed document: ../documents/GDPRAllerlei_903/8661641.pdf
indexed document: ../documents/GDPRAllerlei_903/20200608-privacyverklaring-website.pdf
indexed document: ../documents/GDPRAllerlei_903/EP_Council_Comparison.pdf
indexed document: ../documents/GDPRAllerlei_903/915-180907-AVG-Privacy-Statement-v1.pdf
indexed document: ../documents/GDPRAllerlei_903/guide-to-the-general-data-protection-regulation-gdpr-1-0.pdf
indexed document: ../documents/GDPRAllerlei_903/2020-06-15_PhD_droughtmarkers.pdf
indexed 



indexed document: ../documents/GDPRAllerlei_903/edpb_guidelines_20200420_contact_tracing_covid_with_annex_en (2).pdf
indexed document: ../documents/GDPRAllerlei_903/Brexit FAQ November update.pdf
indexed document: ../documents/GDPRAllerlei_903/GDPR.docx
indexed document: ../documents/GDPRAllerlei_903/an-introduction-to-the-general-data-protection-regulation.pdf
indexed document: ../documents/GDPRAllerlei_903/A-Guide-to-help-SMEs-Prepare-for-the-GDPR.pdf
indexed document: ../documents/GDPRAllerlei_903/00-Privacyverklaring.pdf
indexed document: ../documents/GDPRAllerlei_903/accelerate-response-with-oracle-cloud-wp.pdf
indexed document: ../documents/GDPRAllerlei_903/The-European-Union-and-the-Search-for-Digital-Sovereignty-Building-Fortress-Europe-or-Preparing-for-a-New-World.pdf
indexed document: ../documents/GDPRAllerlei_903/Whats_App_Monitoring_Employees_in_the_Smartphone_Age.pdf
indexed document: ../documents/GDPRAllerlei_903/Weten of vergeten handreiking AVG Archiefwet.pdf
indexed do

Multiple definitions in dictionary at byte 0x11a for key /GW_WM0
Multiple definitions in dictionary at byte 0x1ba9 for key /GW_WM0
Multiple definitions in dictionary at byte 0x2f57 for key /GW_WM0
Multiple definitions in dictionary at byte 0x49f6 for key /GW_WM0
Multiple definitions in dictionary at byte 0x5fbe for key /GW_WM0
Multiple definitions in dictionary at byte 0x7592 for key /GW_WM0


indexed document: ../documents/GDPRAllerlei_903/Brochure-Aankoop-boter-voor-openbare-opslag.pdf
indexed document: ../documents/GDPRAllerlei_903/data-processing-addendum-nl.pdf
indexed document: ../documents/GDPRAllerlei_903/GDPR-Open-Letter-m.pdf
indexed document: ../documents/GDPRAllerlei_903/GDPR QA April 2018.pdf
indexed document: ../documents/GDPRAllerlei_903/19-11-06_opinion_on_e_evidence_proposals_en.pdf
indexed document: ../documents/GDPRAllerlei_903/EU_General_Data_Protection_Regulation.pdf
indexed document: ../documents/GDPRAllerlei_903/jaarverslag-2018-nhl-stenden-hogeschool.pdf
indexed document: ../documents/GDPRAllerlei_903/privacyverklaring-algemene-verordening-gegevensbescherming-avg.pdf
indexed document: ../documents/GDPRAllerlei_903/GDPR_FAQ (1).pdf
indexed document: ../documents/GDPRAllerlei_903/Privacy-Statement-Make-A-Wish-Belgium-Vlaanderen.pdf
indexed document: ../documents/GDPRAllerlei_903/AVG-KLASBORD-v3.pdf
indexed document: ../documents/GDPRAllerlei_903/guide-t