In [None]:
import os
import fitz  # PyMuPDF
from PIL import Image
import io
import pytesseract
import json
import csv
import pandas as pd 
# Tell pytesseract where the Tesseract executable lives (Windows install path).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Root folder that is searched recursively for PDF files
DATASET_PATH = r"C:\Users\LEGION\Desktop\Project\AI351\PROJECT\Dataset"

# Alternative root kept for reference (a single subfolder of the dataset):
# DATASET_PATH = r"C:\Users\LEGION\Desktop\Project\AI351\PROJECT\Dataset\Services/"
# JSON file where the extracted per-page text and metadata are saved
EXTRACTION_OUTPUT_PATH = r"C:\Users\LEGION\Desktop\Project\AI351\PROJECT\extracted_text.json"
# Path to the CSV file containing additional per-file metadata
CSV_PATH = r"C:\Users\LEGION\Desktop\Project\AI351\PROJECT\data_dict.csv"

# Chunking parameters, both measured in characters of cleaned page text:
# CHUNK_SIZE is the maximum chunk length, CHUNK_OVERLAP is how many
# characters consecutive chunks share.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100



def clean_text(text):
    """Normalize extracted text to a single-spaced, ASCII-only string.

    Runs of non-ASCII characters are replaced with a space *before*
    whitespace is collapsed. Doing it in the opposite order lets the
    replacement spaces pile up next to existing ones (``"a é b"`` would
    become ``"a   b"``).

    Args:
        text: Raw text taken from a PDF page or returned by OCR.

    Returns:
        The text with non-ASCII characters removed, every run of whitespace
        (spaces, tabs, newlines) reduced to one space, and no leading or
        trailing whitespace.
    """
    import re  # local import: `re` is not among the file's top-level imports

    # Replace each run of non-ASCII characters with a single space.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Collapse all whitespace runs, including the spaces inserted above.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def extract_text_from_pdfs(folder_path, output_path, csv_path=None):
    """Extract per-page text and metadata from every PDF under a folder.

    Walks ``folder_path`` recursively. For each PDF page the embedded text is
    read with PyMuPDF; when a page has no text, the page is rendered to an
    image and passed to Tesseract OCR instead. Cleaned, non-empty page text is
    stored together with its metadata, and the full list of records is
    written to ``output_path`` as JSON.

    Args:
        folder_path: Root directory to search for PDF files.
        output_path: Destination JSON file for the extracted records.
        csv_path: CSV file with ``File Name``, ``Title`` and ``Link`` columns
            used to attach a title and URL to each PDF. When omitted, the
            module-level ``CSV_PATH`` is used.

    Raises:
        KeyError: If the CSV lacks one of the required columns.
    """
    if csv_path is None:
        csv_path = CSV_PATH
    additional_metadata = pd.read_csv(csv_path)

    # Build the file-name -> {Title, Link} lookup once instead of filtering
    # the whole frame twice per page. Keeping the first duplicate matches the
    # previous ``.iloc[0]`` behaviour.
    metadata_by_file = (
        additional_metadata
        .drop_duplicates(subset="File Name")
        .set_index("File Name")[["Title", "Link"]]
        .to_dict(orient="index")
    )

    extracted_data = []
    for root, _, files in os.walk(folder_path):
        folder_name = os.path.basename(root)
        for file in files:
            # Case-insensitive so that "REPORT.PDF" is not silently skipped.
            if not file.lower().endswith(".pdf"):
                continue
            print(file)
            pdf_path = os.path.join(root, file)
            print(pdf_path)

            # A PDF missing from the CSV should not abort the whole run.
            csv_row = metadata_by_file.get(file)
            if csv_row is None:
                print(f"Warning: no metadata row for {file} in {csv_path}")
                csv_row = {"Title": None, "Link": None}

            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc, start=1):
                    text = page.get_text()

                    # If no text, fall back to OCR on a rendered page image.
                    # NOTE(review): get_pixmap() is called with its default
                    # settings; a higher render resolution may improve OCR
                    # accuracy - confirm against the PyMuPDF version in use.
                    if not text.strip():
                        pix = page.get_pixmap()
                        img = Image.open(io.BytesIO(pix.tobytes("png")))
                        text = pytesseract.image_to_string(img)

                    text = clean_text(text)
                    if not text:  # Nothing meaningful left after cleaning
                        continue

                    extracted_data.append({
                        "text": text,
                        "source": pdf_path,
                        "folder": folder_name,
                        "file_name": file,
                        "page": page_num,
                        "title": csv_row["Title"],
                        "url": csv_row["Link"],
                    })

    # Save extracted data to a JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=4)

    print(f"Extracted data saved to {output_path}")

def load_and_chunk_text(input_path, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """Load extracted page records from JSON and split their text into chunks.

    Each record's ``text`` is cut into windows of at most ``chunk_size``
    characters, with consecutive windows sharing ``chunk_overlap``
    characters. Chunking stops once a window reaches the end of the text, so
    no trailing chunk is emitted that lies entirely inside the previous one.

    Args:
        input_path: JSON file holding a list of records with the keys
            ``text``, ``source``, ``folder``, ``file_name``, ``page``,
            ``title`` and ``url``.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Characters shared by consecutive chunks; must satisfy
            ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A ``(chunks, metadata)`` tuple of equal-length lists, where
        ``metadata[i]`` is an independent dict describing the page that
        ``chunks[i]`` came from.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # An overlap >= chunk_size would give a non-positive step and the
    # chunking loop would never advance.
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
        )

    with open(input_path, "r", encoding="utf-8") as f:
        extracted_data = json.load(f)

    metadata_keys = ("source", "folder", "file_name", "page", "title", "url")
    step = chunk_size - chunk_overlap

    chunks = []
    metadata = []

    for entry in extracted_data:
        text = entry["text"]
        source_metadata = {key: entry[key] for key in metadata_keys}

        for start in range(0, len(text), step):
            end = start + chunk_size
            chunks.append(text[start:end])
            # Copy so that editing one chunk's metadata cannot affect others.
            metadata.append(dict(source_metadata))
            if end >= len(text):
                # This window already covers the tail of the text; another
                # one would only repeat part of the overlap.
                break

    return chunks, metadata



# NOTE: everything below runs at module top level, so executing (or
# importing) this file immediately triggers the full extraction.

# Step 2: Extract text and metadata from PDFs and save to a file
# (one JSON record per non-empty page, written to EXTRACTION_OUTPUT_PATH)
print("Extracting text from PDFs...")
extract_text_from_pdfs(DATASET_PATH, EXTRACTION_OUTPUT_PATH)

# Step 3: Load the extracted data and perform chunking
# `chunks` and `metadatas` are parallel lists: metadatas[i] describes chunks[i].
print("Loading extracted data and performing chunking...")
chunks, metadatas = load_and_chunk_text(EXTRACTION_OUTPUT_PATH)

# Output for debugging (optional)
print(f"Extracted {len(chunks)} text chunks.")
# Slicing with [:1] (rather than indexing [0]) stays safe when nothing was extracted.
print(f"Sample Metadata: {metadatas[:1]}")
