<a href="https://colab.research.google.com/github/thegoodgamer14/tech-spec-extractor/blob/main/corpus_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Setting up...")
!pip install pypdf2 tqdm

Setting up...
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [2]:
import os
import re
from google.colab import drive
import PyPDF2
import logging
from tqdm.notebook import tqdm
import time
from pathlib import Path

In [3]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
print("Setup complete.")

Setup complete.


In [4]:
print("Configuring paths...")
# Base paths
DRIVE_MOUNT_POINT = '/content/drive'
DRIVE_OUTPUT_BASE = f'{DRIVE_MOUNT_POINT}/MyDrive/tech-spec-extractor/data/processed'

# Source PDF Folders
MECH_DOCS_FOLDER = f'{DRIVE_MOUNT_POINT}/MyDrive/tech-spec-extractor/data/raw/mechanical-docs'
ELEC_DOCS_FOLDER = f'{DRIVE_MOUNT_POINT}/MyDrive/tech-spec-extractor/data/raw/electrical-docs'

# Output Text Folders
MECH_OUTPUT_FOLDER = f'{DRIVE_OUTPUT_BASE}/mechanical-text'
ELEC_OUTPUT_FOLDER = f'{DRIVE_OUTPUT_BASE}/electrical-text'

# Create output directories if they don't exist
os.makedirs(MECH_OUTPUT_FOLDER, exist_ok=True)
os.makedirs(ELEC_OUTPUT_FOLDER, exist_ok=True)

print(f"Drive Mount Point: {DRIVE_MOUNT_POINT}")
print(f"Source PDF Folders: \n  - {MECH_DOCS_FOLDER}\n  - {ELEC_DOCS_FOLDER}")
print(f"Output Text Folders: \n  - {MECH_OUTPUT_FOLDER}\n  - {ELEC_OUTPUT_FOLDER}")
print("Configuration complete.")

Configuring paths...
Drive Mount Point: /content/drive
Source PDF Folders: 
  - /content/drive/MyDrive/tech-spec-extractor/data/raw/mechanical-docs
  - /content/drive/MyDrive/tech-spec-extractor/data/raw/electrical-docs
Output Text Folders: 
  - /content/drive/MyDrive/tech-spec-extractor/data/processed/mechanical-text
  - /content/drive/MyDrive/tech-spec-extractor/data/processed/electrical-text
Configuration complete.


In [5]:
print("Mounting Google Drive...")
try:
    # Clean up mount point if needed
    if os.path.exists(DRIVE_MOUNT_POINT) and os.listdir(DRIVE_MOUNT_POINT):
        print(f"Mount point {DRIVE_MOUNT_POINT} already has files. Using standard mounting approach.")
        drive.mount(DRIVE_MOUNT_POINT)
    else:
        drive.mount(DRIVE_MOUNT_POINT)

    logging.info("Google Drive mounted successfully.")

    # Verify paths exist
    path_issues = False
    if not os.path.isdir(MECH_DOCS_FOLDER):
        logging.warning(f"⚠️ Mechanical docs folder not found: {MECH_DOCS_FOLDER}")
        path_issues = True
    if not os.path.isdir(ELEC_DOCS_FOLDER):
        logging.warning(f"⚠️ Electrical docs folder not found: {ELEC_DOCS_FOLDER}")
        path_issues = True

    if path_issues:
        logging.warning("Please check your folder paths and try again.")
    else:
        logging.info("All source folders found.")

except Exception as e:
    logging.error(f"Error mounting Google Drive: {e}")
    raise e
print("Drive mounted.")

Mounting Google Drive...
Mount point /content/drive already has files. Using standard mounting approach.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive mounted.


In [6]:
print("Defining PDF extraction functions...")

def clean_text(text):
    """Performs advanced text cleaning on extracted PDF text."""
    # Replace common PDF artifacts
    text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)  # Fix hyphenated words across lines

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove page numbers (common formats)
    text = re.sub(r'\b\d+\s+of\s+\d+\b', '', text)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Remove headers/footers (simplified approach)
    text = re.sub(r'^\s*.{0,30}(confidential|proprietary|all rights reserved).{0,30}$', '',
                 text, flags=re.MULTILINE|re.IGNORECASE)

    # Fix common measurement units that might get broken
    text = re.sub(r'(\d+)\s+([kKmMgG]?[WwVvAa])', r'\1\2', text)  # Fix "10 kW" -> "10kW"

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


Defining PDF extraction functions...


In [7]:
def segment_text(text):
    """Segments text into meaningful paragraphs."""
    # Split by periods followed by spaces and newlines, preserving the periods
    segments = []

    # First pass: split by paragraph markers (multiple newlines)
    paragraphs = re.split(r'\n\s*\n', text)

    for paragraph in paragraphs:
        if len(paragraph.strip()) > 0:
            # Clean paragraph
            paragraph = paragraph.strip()
            paragraph = re.sub(r'\s+', ' ', paragraph)

            # Skip very short segments (likely headers or artifacts)
            if len(paragraph.split()) >= 5:
                segments.append(paragraph)

    return segments

In [8]:
def extract_text_from_pdf(pdf_path):
    """Extracts text from a single PDF file using PyPDF2."""
    all_text = ""
    page_texts = []

    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            num_pages = len(reader.pages)

            for page_num in range(num_pages):
                try:
                    page = reader.pages[page_num]
                    page_text = page.extract_text()

                    if page_text:  # Check if text extraction returned something
                        page_texts.append(page_text)
                except Exception as e:
                    logging.warning(f"Error extracting page {page_num}: {e}")

            # Process each page's text
            if page_texts:
                all_text = "\n\n".join(page_texts)
                all_text = clean_text(all_text)

    except FileNotFoundError:
        logging.error(f"PDF not found: {pdf_path}")
    except PyPDF2.errors.PdfReadError as pdf_err:
        logging.warning(f"PyPDF2 error reading {os.path.basename(pdf_path)}: {pdf_err}. Skipping file.")
    except Exception as e:
        logging.warning(f"Could not read PDF {os.path.basename(pdf_path)}: {e}. Skipping file.")

    return all_text

In [9]:
def process_pdf_folder(folder_path, output_folder):
    """Processes all PDFs in a folder, saving each to a separate text file."""

    # Get list of PDF files
    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]
    successful_extractions = 0
    failed_extractions = 0
    total_segments = 0

    if not pdf_files:
        logging.warning(f"No PDF files found in {folder_path}")
        return successful_extractions, failed_extractions, total_segments

    logging.info(f"Processing {len(pdf_files)} PDFs in: {folder_path}")

    # Process each PDF file with progress bar
    for filename in tqdm(pdf_files, desc="Extracting PDFs"):
        pdf_path = os.path.join(folder_path, filename)
        base_name = os.path.splitext(filename)[0]
        output_path = os.path.join(output_folder, f"{base_name}.txt")

        # Extract text from PDF
        doc_text = extract_text_from_pdf(pdf_path)

        # Check if substantial text was extracted
        if doc_text and len(doc_text.split()) > 10:
            # Segment the text
            segments = segment_text(doc_text)
            total_segments += len(segments)

            # Write segments to file
            try:
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write("\n\n".join(segments))
                successful_extractions += 1
            except Exception as e:
                logging.error(f"Error saving text from {filename}: {e}")
                failed_extractions += 1
        else:
            logging.warning(f"Minimal text extracted from {filename}. Skipping.")
            failed_extractions += 1

    return successful_extractions, failed_extractions, total_segments

print("PDF extraction functions defined.")

PDF extraction functions defined.


In [None]:
print("\n=== Starting PDF text extraction process ===\n")

# Statistics tracking
start_time = time.time()
stats = {
    'mechanical': {'success': 0, 'failed': 0, 'segments': 0},
    'electrical': {'success': 0, 'failed': 0, 'segments': 0}
}

try:
    # Process mechanical documents
    print("\n--- Processing Mechanical Documents ---")
    stats['mechanical']['success'], stats['mechanical']['failed'], stats['mechanical']['segments'] = process_pdf_folder(
        MECH_DOCS_FOLDER, MECH_OUTPUT_FOLDER
    )

    # Process electrical documents
    print("\n--- Processing Electrical Documents ---")
    stats['electrical']['success'], stats['electrical']['failed'], stats['electrical']['segments'] = process_pdf_folder(
        ELEC_DOCS_FOLDER, ELEC_OUTPUT_FOLDER
    )

    # Calculate total processing time
    total_time = time.time() - start_time

except Exception as e:
    logging.error(f"An error occurred during extraction: {e}")
    raise e

In [11]:
print("\n=== Extraction Results ===\n")

# Count number of text files generated
mech_text_files = len([f for f in os.listdir(MECH_OUTPUT_FOLDER) if f.endswith('.txt')])
elec_text_files = len([f for f in os.listdir(ELEC_OUTPUT_FOLDER) if f.endswith('.txt')])

# Report statistics
print(f"Processing completed in {total_time:.2f} seconds")
print("\nMechanical Documents:")
print(f"  - Successful extractions: {stats['mechanical']['success']}")
print(f"  - Failed extractions: {stats['mechanical']['failed']}")
print(f"  - Total text segments extracted: {stats['mechanical']['segments']}")
print(f"  - Text files generated: {mech_text_files}")

print("\nElectrical Documents:")
print(f"  - Successful extractions: {stats['electrical']['success']}")
print(f"  - Failed extractions: {stats['electrical']['failed']}")
print(f"  - Total text segments extracted: {stats['electrical']['segments']}")
print(f"  - Text files generated: {elec_text_files}")

print("\nTotal:")
total_success = stats['mechanical']['success'] + stats['electrical']['success']
total_failed = stats['mechanical']['failed'] + stats['electrical']['failed']
total_segments = stats['mechanical']['segments'] + stats['electrical']['segments']
total_files = mech_text_files + elec_text_files

print(f"  - Successful extractions: {total_success}")
print(f"  - Failed extractions: {total_failed}")
print(f"  - Total text segments extracted: {total_segments}")
print(f"  - Total text files generated: {total_files}")


=== Extraction Results ===

Processing completed in 313.15 seconds

Mechanical Documents:
  - Successful extractions: 30
  - Failed extractions: 0
  - Total text segments extracted: 30
  - Text files generated: 30

Electrical Documents:
  - Successful extractions: 27
  - Failed extractions: 3
  - Total text segments extracted: 27
  - Text files generated: 27

Total:
  - Successful extractions: 57
  - Failed extractions: 3
  - Total text segments extracted: 57
  - Total text files generated: 57


In [12]:
print("\n=== Sample Content Verification ===\n")

def check_text_sample(folder, count=3, max_lines=5):
    """Checks and displays samples from text files"""
    text_files = [f for f in os.listdir(folder) if f.endswith('.txt')]

    if not text_files:
        print(f"No text files found in {folder}")
        return

    # Select up to 'count' files
    sample_files = text_files[:min(count, len(text_files))]

    for sample_file in sample_files:
        file_path = os.path.join(folder, sample_file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.read().split('\n')

            # Get file size
            file_size = os.path.getsize(file_path) / 1024  # in KB

            print(f"File: {sample_file} ({file_size:.2f} KB)")
            print(f"First {min(max_lines, len(lines))} lines:")
            for i, line in enumerate(lines[:max_lines]):
                print(f"  {i+1}: {line[:100]}{'...' if len(line) > 100 else ''}")
            print()
        except Exception as e:
            print(f"Error reading {sample_file}: {e}")

print("Sample Mechanical Text Files:")
check_text_sample(MECH_OUTPUT_FOLDER)

print("\nSample Electrical Text Files:")
check_text_sample(ELEC_OUTPUT_FOLDER)

print("\n--- PDF Text Extraction Complete ---")
print(f"Mechanical texts saved to: {MECH_OUTPUT_FOLDER}")
print(f"Electrical texts saved to: {ELEC_OUTPUT_FOLDER}")


=== Sample Content Verification ===

Sample Mechanical Text Files:
File: General-Mechanical-Specification-Oct-15eeae.txt (68.16 KB)
First 1 lines:
  1: GENERAL MECHANICAL INSTALLATION SPECIFICATION Revised: October 2015 General Mechanical Installation ...

File: Annex 8 - Service specifications_3885-MECH SPECS-T1-2023.01.20.txt (78.47 KB)
First 1 lines:
  1: ■ website: www.edisonconsultants.com ■ office email: info@edisonconsultants.com ■ project manager: e...

File: T__proc_notices_notices_075_k_notice_doc_71173_343055755.txt (168.54 KB)
First 1 lines:
  1: Mechanical Specification ANNEX VIII Page VOLUME 2 TECHNICAL SPECIFICATIONS PART 2 MECHANICAL ENGINEE...


Sample Electrical Text Files:
File: ELECTRICAL-SEW2012.txt (155.91 KB)
First 1 lines:
  1: MINISTRY OF WORKS AND HUMAN SETTLEMENT DEPARTMENT OF ENGINEERING SERVICES SPECIFICATION FOR ELEC TRI...

File: Electrical Engineering Design Guide (1).txt (172.80 KB)
First 1 lines:
  1: ADNOC Classification: Public THE CONTENTS OF THIS 