### Import

In [None]:
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

### Environment

In [None]:
from dotenv import load_dotenv
# Read variables from ../.env (resolved against the current working directory).
load_dotenv("../.env")

# Folder used below as the PDF input directory; None when DOC_PATH is not set.
path = os.getenv("DOC_PATH")

### Parse PDF Drawings and Save Extracted Text to Files

In [None]:
def extract_text_from_pdfs(input_dir: str, output_dir: str):
    """Extract the text of every PDF in ``input_dir`` into ``.txt`` files.

    Each PDF located directly inside ``input_dir`` (sub-folders are not
    searched) is loaded with ``PyPDFLoader`` using layout extraction, and its
    text is written as UTF-8 to ``<output_dir>/<pdf stem>.txt``, overwriting
    any existing file with that name. Progress is reported with ``print``.

    Args:
        input_dir: Directory containing the PDF files.
        output_dir: Directory for the text files; created if it is missing.

    Raises:
        NotADirectoryError: If ``input_dir`` is not an existing directory.
    """
    input_path = Path(input_dir)
    # Validate before creating the output folder: when output_dir lives inside
    # input_dir, mkdir(parents=True) would otherwise silently create a
    # mistyped input directory and the run would finish with nothing to do.
    if not input_path.is_dir():
        raise NotADirectoryError(f"Input directory not found: {input_path}")

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Match the extension case-insensitively (glob("*.pdf") skips "X.PDF" on
    # case-sensitive file systems), ignore directories, and sort so files are
    # processed in a reproducible order.
    pdf_files = sorted(
        entry
        for entry in input_path.iterdir()
        if entry.is_file() and entry.name.lower().endswith(".pdf")
    )

    for pdf_file in pdf_files:
        print(f"Processing: {pdf_file.name}")
        loader = PyPDFLoader(
            str(pdf_file),
            mode="single",
            pages_delimiter="\n\f",
            extraction_mode="layout",
            extraction_kwargs={"layout_mode_strip_rotated": False},
        )
        documents = loader.load()

        # Concatenate the content of every returned document; with a single
        # document this is simply its page_content.
        full_text = "\n".join(doc.page_content for doc in documents)

        # Same base name as the PDF, with a .txt extension.
        txt_file = output_path / (pdf_file.stem + ".txt")
        txt_file.write_text(full_text, encoding="utf-8")
        print(f"Saved to: {txt_file}")

In [None]:
# PDFs are read from the DOC_PATH folder; extracted text goes to its "TXT" sub-folder.
input_dir = path
output_dir = os.path.join(path, "TXT")
print(output_dir)

In [None]:
# Convert every PDF in input_dir to a text file in output_dir.
extract_text_from_pdfs(input_dir=input_dir, output_dir=output_dir)

### Extract Equipment Tags

In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize
# import string
# from nltk.corpus import stopwords


# nltk.download("punkt")
# nltk.download("stopwords")

In [None]:
def process_text_file(file_path: str) -> set:
    """Collect the equipment-tag candidates found in one text file.

    The file is read as UTF-8 and split into tokens with ``word_tokenize``.
    A token is kept when it is not purely alphabetic and the whole token
    matches ``(.+-){3,}.+``: at least three hyphen-terminated groups followed
    by at least one more character. Kept tokens are upper-cased, and the set
    removes duplicates.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        Set of upper-cased tokens that match the tag pattern.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        content = handle.read()

    # Pattern used to recognise equipment tags in the drawing text.
    tag_regex = re.compile(r"(.+-){3,}.+")

    tags = set()
    for token in word_tokenize(content):
        # Purely alphabetic tokens contain no hyphen, so skip the regex.
        if token.isalpha():
            continue
        if tag_regex.fullmatch(token):
            tags.add(token.upper())

    return tags

In [None]:
def extract_token_from_textfiles(input_dir: str):
    """Print the equipment tags found in each ``.txt`` file of ``input_dir``.

    Only files directly inside ``input_dir`` are considered. For every file a
    "Processing: <name>" header is printed, followed by one tag per line as
    returned by ``process_text_file``.

    Args:
        input_dir: Directory containing the extracted text files.
    """
    text_path = Path(input_dir)

    # Sort the files so repeated runs list them in the same order.
    for txt_file in sorted(text_path.glob("*.txt")):
        print(f"\nProcessing: {txt_file.name}\n")
        # The tags arrive as a set, whose iteration order can differ between
        # interpreter runs; sorting keeps the printed output stable.
        for token in sorted(process_text_file(str(txt_file))):
            print(token)


In [None]:
# Scan the extracted text files (written to output_dir) for equipment tags.
extract_token_from_textfiles(input_dir=output_dir)