In [None]:
!which python

In [None]:
# Function to get file paths from the user
def get_file_paths():
    """Interactively collect PNG/PDF file paths from the user.

    Prompts repeatedly until an empty line is entered. Each entry is
    stripped of surrounding whitespace and of one pair of matching
    surrounding quotes (paths pasted from a shell or file manager are often
    quoted), and a leading ``~`` is expanded to the user's home directory.
    Entries that do not end in ``.png`` or ``.pdf`` (case-insensitive) are
    rejected with a message.

    Returns:
        list[str]: The accepted paths, in the order they were entered.
    """
    import os  # local import: this cell has no import section of its own

    print("Enter the paths of the files you want to process (PNG or PDF).")
    print("Press Enter after each file. When you're done, just press Enter without typing a file path.")

    file_paths = []
    while True:
        file_path = input("Enter file path (or press Enter to finish): ").strip()
        if not file_path:  # Stop when the user presses Enter without input
            break

        # Drop one pair of matching quotes around a pasted path.
        if len(file_path) >= 2 and file_path[0] == file_path[-1] and file_path[0] in "'\"":
            file_path = file_path[1:-1].strip()

        # os.path.exists / cv2.imread do not understand "~", so expand it here.
        file_path = os.path.expanduser(file_path)

        if file_path.lower().endswith(('.png', '.pdf')):  # Accept only PNG or PDF files
            file_paths.append(file_path)
        else:
            print("Invalid file type. Please provide a PNG or PDF file.")

    return file_paths

# Ask the user for the files and keep the answer in a module-level name,
# which is what makes it visible to the later cells of the notebook.
files_to_process = get_file_paths()

# Echo the selection back so the user can verify it
if not files_to_process:
    print("No files selected.")
else:
    print("Selected files:")
    for file in files_to_process:
        print(file)


### Instructions for Running the Notebook

#### Overview
This notebook processes PNG and PDF files to extract specific features. If a PDF is provided, it will first be converted into PNG images, with each page saved as a separate PNG. The resulting PNGs will then be processed alongside any other provided PNG files.

#### Workflow
1. **Input Files**:
   - Users provide the local paths of PNG and PDF files when prompted (nothing is uploaded; the files are read from disk).
   - If a PNG is provided, it will be processed directly without modification.
   - If a PDF is provided, it will be converted into PNG images, one for each page.

2. **PDF to PNG Conversion**:
   - **Library Used**: The `pdf2image` library is used for converting PDF pages into PNG images (rendered at 900 DPI by default).
   - **Naming Convention**: Each page of the PDF is converted to a PNG file and saved in the same directory as the PDF, within a folder named `converted_files`. The naming format is:
     ```
     <original_filename>_convertedPNG_pg<page_number>.png
     ```
     For example, a PDF named `document.pdf` with 3 pages will generate:
     ```
     converted_files/document_convertedPNG_pg1.png
     converted_files/document_convertedPNG_pg2.png
     converted_files/document_convertedPNG_pg3.png
     ```
   - **Integration**: The paths of the converted PNG files are automatically added to the `files_to_process` list. This ensures seamless processing in subsequent steps.

3. **Processing**:
   - All PNG files, whether uploaded directly or generated from PDFs, are processed together in the final step.

#### Prerequisites
1. **Install Required Libraries**:
   Ensure the following Python libraries are installed:
   ```bash
   pip install pdf2image opencv-python-headless numpy
   ```
2. **Install Poppler for PDF Processing**:
   - **macOS (using Homebrew):**
     ```bash
     brew install poppler
     ```
   - **Linux (using APT):**
     ```bash
     sudo apt-get install poppler-utils
     ```
   - **Windows**:
     Download Poppler from [Poppler for Windows](https://blog.alivate.com.au/poppler-windows/), extract it, and add the `bin` folder to your system PATH.

#### Steps to Execute
1. **Run Cell 1**:
   - Input the file paths for the PNG and/or PDF files you want to process. Press Enter after each path. When finished, press Enter on an empty line.
2. **Run Cell 2**:
   - Converts all PDF files into PNGs. Converted PNGs are saved in a `converted_files` folder next to the respective PDFs.
   - The paths of the converted PNGs are appended to `files_to_process`, which already contains any PNG paths entered in Cell 1.
3. **Run Cell 3**:
   - Processes all PNG files in `files_to_process`.

#### Notes
- Ensure the file paths provided in **Cell 1** are correct.
- Any errors during processing will be logged in the console.
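- The processed output (e.g., annotated PNGs, text and JSON files) is saved in `processed_files` folders: next to each input image for the first processing step, and in the current working directory for the later steps.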


In [None]:
from pdf2image import convert_from_path
import os

def convert_pdf_to_png(file_path, dpi=900):
    """Render every page of a PDF to its own PNG file.

    The images are written to a ``converted_files`` directory located next
    to the PDF and are named ``<pdf name>_convertedPNG_pg<page>.png`` with
    pages counted from 1.

    Args:
        file_path: Path of the PDF to convert.
        dpi: Rendering resolution passed to ``convert_from_path``
            (defaults to 900).

    Returns:
        list[str]: Paths of the PNG files written, in page order. An empty
        list is returned when the PDF could not be converted.
    """
    pdf_dir, pdf_name = os.path.split(file_path)

    # Make sure the destination folder is there before anything is rendered
    output_dir = os.path.join(pdf_dir, "converted_files")
    os.makedirs(output_dir, exist_ok=True)

    # File name without its extension, used as the prefix of every page image
    base_name = os.path.splitext(pdf_name)[0]

    try:
        images = convert_from_path(file_path, dpi=dpi)
    except Exception as e:
        # Conversion problems are reported, not raised, so a batch can go on
        print(f"[ERROR] Could not convert PDF: {file_path}. Error: {e}")
        return []

    written = []
    page_num = 0
    for image in images:
        page_num += 1
        output_path = os.path.join(output_dir, f"{base_name}_convertedPNG_pg{page_num}.png")
        image.save(output_path, "PNG")
        written.append(output_path)
        print(f"[INFO] Saved: {output_path}")

    return written

def process_pdfs(file_paths):
    """Convert several PDFs to PNGs and register the results for later cells.

    Every existing PDF in ``file_paths`` is converted with
    ``convert_pdf_to_png``. The resulting PNG paths are added to the global
    list ``files_to_process``; paths that are already in that list are not
    added again, so re-running the cell does not create duplicates.

    Args:
        file_paths: Paths of the PDF files to convert.

    Returns:
        list[str]: All PNG paths produced by this call (empty when there was
        nothing to convert).
    """
    if not file_paths:
        print("[INFO] No PDF files to process.")
        return []

    print(f"[INFO] PDF files to process: {len(file_paths)}")
    all_converted_paths = []

    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"[ERROR] File not found: {file_path}")
            continue

        print(f"[INFO] Processing PDF: {file_path}")
        all_converted_paths.extend(convert_pdf_to_png(file_path))

    # Update the global 'files_to_process', skipping paths it already holds
    already_listed = set(files_to_process)
    for converted_path in all_converted_paths:
        if converted_path not in already_listed:
            files_to_process.append(converted_path)
            already_listed.add(converted_path)

    print("\n[INFO] PDF conversion completed.")
    return all_converted_paths

# Pick the PDFs out of the user-provided list and convert them
if 'files_to_process' not in globals():
    print("[ERROR] 'files_to_process' not found. Please run Cell 1 first.")
else:
    pdf_files = [path for path in files_to_process if path.lower().endswith('.pdf')]
    process_pdfs(pdf_files)


In [None]:
import os
import cv2
import numpy as np

def process_image(image_path, min_contour_length=200, min_text_width=20, min_text_height=20):
    """Create the ``<name>_temp.png`` image that keeps only large outlines.

    The image is converted to grayscale, binarised with an inverted
    threshold at 150, and its external contours are extracted. Contours
    that are long enough and whose bounding box is large enough are drawn in
    red (BGR ``(0, 0, 255)``, thickness 5) on a black canvas of the same
    size; everything else (text, short/dotted strokes) is left out.

    Args:
        image_path: Path of the image to process.
        min_contour_length: A contour is kept only if its closed perimeter
            is greater than this value.
        min_text_width: A contour is kept only if its bounding box is wider
            than this value.
        min_text_height: A contour is kept only if its bounding box is
            taller than this value.

    Returns:
        str: Path of the written image, located in a ``processed_files``
        directory next to ``image_path``.

    Raises:
        ValueError: If the image cannot be loaded.
        OSError: If the result image cannot be written.
    """
    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Failed to load image: {image_path}")

    # Grayscale + inverted binary threshold so dark strokes become foreground
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

    # Detect contours (these should correspond to boxes)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Black canvas with the same shape and dtype as the input
    flowchart_image = np.zeros_like(image)

    for contour in contours:
        # Short perimeters are treated as dotted lines / noise
        if cv2.arcLength(contour, True) <= min_contour_length:
            continue

        # Small bounding boxes are treated as text
        _, _, w, h = cv2.boundingRect(contour)
        if w > min_text_width and h > min_text_height:
            cv2.drawContours(flowchart_image, [contour], -1, (0, 0, 255), 5)

    # Results go to a 'processed_files' folder next to the input image
    output_dir = os.path.join(os.path.dirname(image_path), "processed_files")
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(image_path))[0]}_temp.png")

    # cv2.imwrite signals failure through its return value, not an exception
    if not cv2.imwrite(output_path, flowchart_image):
        raise OSError(f"Failed to write image: {output_path}")

    print(f"Temporary file saved at: {output_path}")
    return output_path

def process_files(file_paths):
    """Create a temp image for every PNG in ``file_paths``.

    Non-PNG entries and paths that do not exist are skipped with a message.
    A failure while processing one file is reported and does not stop the
    remaining files.

    Args:
        file_paths: Paths of the files to process.

    Returns:
        list[str]: Paths of the temp images that were created (empty when
        nothing was processed).
    """
    if not file_paths:
        print("[INFO] No files to process.")
        return []

    print(f"[INFO] Files to process: {len(file_paths)}")
    temp_image_paths = []

    for file_path in file_paths:
        # Ensure only PNG files are processed
        if not file_path.lower().endswith('.png'):
            print(f"[WARNING] Skipping non-PNG file: {file_path}")
            continue

        if not os.path.exists(file_path):
            print(f"[ERROR] File not found: {file_path}")
            continue

        print(f"[INFO] Processing file: {file_path}")
        try:
            temp_image_paths.append(process_image(file_path))
        except Exception as e:
            # Per-file boundary: report and carry on with the next file
            print(f"[ERROR] Failed to process file {file_path}: {e}")

    # Report the real outcome instead of claiming unconditional success
    print(f"\n[INFO] Processed {len(temp_image_paths)} of {len(file_paths)} files successfully.")
    return temp_image_paths

# Build the temp images for everything selected so far (needs Cell 1's list)
if 'files_to_process' not in globals():
    print("[ERROR] 'files_to_process' not found. Please run Cell 1 first.")
else:
    temp_files = process_files(files_to_process)


In [None]:
from paddleocr import PaddleOCR
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os

# Initialize PaddleOCR once for this cell: English model (lang='en') with
# use_angle_cls=True. The functions below call it through the module-level
# name `ocr`.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Function to check if a contour is a rectangle
def is_rectangle(approx):
    """Return True when a polygon approximation looks like a rectangle.

    Args:
        approx: Polygon as produced by ``cv2.approxPolyDP`` (indexed as
            ``approx[i][0]`` -> ``(x, y)``).

    Returns:
        bool: True if the polygon has exactly four vertices, is convex, and
        the direction change between every pair of consecutive edges is
        within 10 degrees of 90; False otherwise.
    """
    if len(approx) != 4 or not cv2.isContourConvex(approx):
        return False

    for i in range(4):
        p1, p2, p3 = approx[i][0], approx[(i + 1) % 4][0], approx[(i + 2) % 4][0]
        # Difference between the headings of edge p1->p2 and edge p2->p3
        angle = np.arctan2(p2[1] - p1[1], p2[0] - p1[0]) - np.arctan2(p3[1] - p2[1], p3[0] - p2[0])
        angle = np.abs(np.degrees(angle))
        if angle > 180:
            # Fold e.g. 270 back to the equivalent 90
            angle = 360 - angle
        if not np.isclose(angle, 90, atol=10):
            return False
    return True

def process_image(image_path):
    """Detect rectangles in a temp image and OCR the content of each one.

    Rectangles are found with Canny edges -> external contours -> polygon
    approximation -> ``is_rectangle``. The bounding box of every rectangle
    is cropped from the same image and passed to the module-level ``ocr``
    engine. Results are written to ``processed_files/<name>_text.txt`` and
    ``processed_files/<name>_labeled.png`` (relative to the current working
    directory), printed, and the labelled image is displayed.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        None. If the image cannot be loaded an error is printed and nothing
        is written.
    """
    # Read the image
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None instead of raising on unreadable files
        print(f"[ERROR] Image not found: {image_path}")
        return None

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 200, apertureSize=3)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Bounding boxes (x, y, w, h) of every contour that approximates a rectangle
    boxes = []
    for contour in contours:
        epsilon = 0.02 * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, epsilon, True)
        if is_rectangle(approx):
            boxes.append(cv2.boundingRect(approx))

    # Extract text from detected rectangles
    rectangle_texts = []
    for number, (x, y, w, h) in enumerate(boxes, start=1):
        cropped_img = img[y:y + h, x:x + w]
        try:
            ocr_results = ocr.ocr(cropped_img, cls=True)
            # A region without any detected text yields no lines (None/empty);
            # that is an empty result, not an OCR failure.
            text_lines = ocr_results[0] if ocr_results and ocr_results[0] else []
            extracted_text = " ".join(line[1][0] for line in text_lines)
        except Exception as e:
            extracted_text = f"Error during OCR: {e}"
        rectangle_texts.append({"Rectangle No": number, "Text": extracted_text})

    # Save extracted text to a file
    output_dir = "processed_files"
    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(image_path))[0]
    text_output_path = os.path.join(output_dir, f"{stem}_text.txt")
    # Explicit encoding: recognised text is not guaranteed to be ASCII
    with open(text_output_path, 'w', encoding='utf-8') as f:
        for entry in rectangle_texts:
            f.write(f"Rectangle {entry['Rectangle No']}: {entry['Text']}\n")
    print(f"[INFO] Extracted text saved at: {text_output_path}")

    # Display extracted text for each rectangle
    for entry in rectangle_texts:
        print(f"Rectangle {entry['Rectangle No']}: {entry['Text']}")

    # Draw and number every rectangle for visual verification
    debug_img = img.copy()
    for number, (x, y, w, h) in enumerate(boxes, start=1):
        cv2.rectangle(debug_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(debug_img, str(number), (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 255, 0), 3)

    # Save the labeled image
    labeled_image_path = os.path.join(output_dir, f"{stem}_labeled.png")
    cv2.imwrite(labeled_image_path, debug_img)
    print(f"[INFO] Labeled image saved at: {labeled_image_path}")

    # Display the image with labeled rectangles
    plt.figure(figsize=(100, 100))
    plt.title("Detected Rectangles with Labels")
    plt.imshow(cv2.cvtColor(debug_img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

def process_files(temp_image_paths):
    """Run ``process_image`` on every existing path in ``temp_image_paths``.

    Paths that do not exist are reported and skipped. Nothing is returned.
    """
    if not temp_image_paths:
        print("[INFO] No files to process.")
        return

    print(f"[INFO] Processing {len(temp_image_paths)} files...")
    for image_path in temp_image_paths:
        if os.path.exists(image_path):
            print(f"[INFO] Processing file: {image_path}")
            process_image(image_path)
        else:
            print(f"[ERROR] File does not exist: {image_path}")

    print("[INFO] Processing completed for all files.")

# Run the OCR step on the temp images produced by the previous cell
if 'temp_files' not in globals():
    print("[ERROR] 'temp_files' not found. Please run the previous cell to generate temp images.")
else:
    process_files(temp_files)


In [None]:
from paddleocr import PaddleOCR
import cv2
import numpy as np
import matplotlib.pyplot as plt
import json
import os

# Initialize PaddleOCR: English model (lang='en') with use_angle_cls=True.
# The functions below use it through the module-level name `ocr`.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Global list to store paths of generated JSON files with rectangles.
# It is created only if it does not exist yet, so re-running this cell keeps
# the paths collected by earlier runs.
if 'rectangle_json_files' not in globals():
    rectangle_json_files = []

# Function to check if a contour is a rectangle
def is_rectangle(approx):
    """Return True when a polygon approximation looks like a rectangle.

    Args:
        approx: Polygon as produced by ``cv2.approxPolyDP`` (indexed as
            ``approx[i][0]`` -> ``(x, y)``).

    Returns:
        bool: True if the polygon has exactly four vertices, is convex, and
        the direction change between every pair of consecutive edges is
        within 10 degrees of 90; False otherwise.
    """
    if len(approx) != 4 or not cv2.isContourConvex(approx):
        return False

    for i in range(4):
        p1, p2, p3 = approx[i][0], approx[(i + 1) % 4][0], approx[(i + 2) % 4][0]
        # Difference between the headings of edge p1->p2 and edge p2->p3
        angle = np.arctan2(p2[1] - p1[1], p2[0] - p1[0]) - np.arctan2(p3[1] - p2[1], p3[0] - p2[0])
        angle = np.abs(np.degrees(angle))
        if angle > 180:
            # Fold e.g. 270 back to the equivalent 90
            angle = 360 - angle
        if not np.isclose(angle, 90, atol=10):
            return False
    return True

def process_file(original_image_path, temp_image_path):
    """Locate rectangles in the temp image and OCR them in the original.

    Rectangles are detected in ``temp_image_path`` (Canny edges -> external
    contours -> polygon approximation -> ``is_rectangle``). The bounding box
    of each rectangle is cropped from ``original_image_path`` and passed to
    the module-level ``ocr`` engine. The texts and box corners are saved as
    ``processed_files/<original name>_text.json`` (relative to the current
    working directory), the JSON path is appended to the global
    ``rectangle_json_files``, and a labelled copy of the temp image is saved
    as ``processed_files/<original name>_labeled.png`` and displayed.

    Args:
        original_image_path: Path of the unmodified input image.
        temp_image_path: Path of the matching temp image.

    Returns:
        None. If either image cannot be loaded an error is printed and
        nothing is written.
    """
    print(f"[INFO] Processing file: {original_image_path} with {temp_image_path}")

    # Load the images; cv2.imread returns None instead of raising on failure
    processed_img = cv2.imread(temp_image_path)
    original_img = cv2.imread(original_image_path)
    if processed_img is None or original_img is None:
        unreadable = temp_image_path if processed_img is None else original_image_path
        print(f"[ERROR] Image not found: {unreadable}")
        return None

    # Convert processed image to grayscale for contour detection
    gray = cv2.cvtColor(processed_img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 200, apertureSize=3)

    # Find contours in the processed image
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Bounding boxes (x, y, w, h) of every contour that approximates a rectangle
    boxes = []
    for contour in contours:
        epsilon = 0.02 * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, epsilon, True)
        if is_rectangle(approx):
            boxes.append(cv2.boundingRect(approx))

    # Extract text from the original image based on the detected rectangles
    rectangle_texts = []
    for number, (x, y, w, h) in enumerate(boxes, start=1):
        cropped_img = original_img[y:y + h, x:x + w]

        try:
            ocr_results = ocr.ocr(cropped_img, cls=True)
            # A region without any detected text yields no lines (None/empty);
            # that is an empty result, not an OCR failure.
            text_lines = ocr_results[0] if ocr_results and ocr_results[0] else []
            extracted_text = " ".join(line[1][0] for line in text_lines)

            rectangle_texts.append({
                "Rectangle No": number,
                "Text": extracted_text,
                "Coordinates": [(x, y), (x + w, y), (x + w, y + h), (x, y + h)]  # Storing the 4 corners
            })

        except Exception as e:
            rectangle_texts.append({
                "Rectangle No": number,
                "Text": f"Error during OCR: {e}",
                "Coordinates": []  # In case OCR fails, no coordinates are stored
            })

    # Save the results to a JSON file in ./processed_files
    output_dir = "processed_files"
    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(original_image_path))[0]
    json_output_path = os.path.join(output_dir, f"{stem}_text.json")
    output_data = {"rectangles": rectangle_texts}
    with open(json_output_path, "w") as outfile:
        json.dump(output_data, outfile, indent=4)
    print(f"[INFO] Extracted text and coordinates saved at: {json_output_path}")

    # Add the JSON file path to the global rectangle_json_files list
    rectangle_json_files.append(json_output_path)

    # Draw and number the rectangles on the processed image for verification
    debug_img = processed_img.copy()
    for number, (x, y, w, h) in enumerate(boxes, start=1):
        cv2.rectangle(debug_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(debug_img, str(number), (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 255, 0), 3)

    # Save the labelled image
    labeled_image_path = os.path.join(output_dir, f"{stem}_labeled.png")
    cv2.imwrite(labeled_image_path, debug_img)
    print(f"[INFO] Labeled image saved at: {labeled_image_path}")

    # Display the image with labeled rectangles
    plt.figure(figsize=(100, 100))
    plt.title("Labelled Rectangles on Processed Image")
    plt.imshow(cv2.cvtColor(debug_img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

    # Print the extracted text and coordinates for each rectangle
    print("Extracted Text and Coordinates from Original Image:")
    for entry in rectangle_texts:
        print(f"Rectangle {entry['Rectangle No']}: {entry['Text']}")
        print(f"Coordinates: {entry['Coordinates']}")

def process_files(original_image_paths, temp_image_paths):
    """Process every original image together with its own temp image.

    The two lists are not guaranteed to line up position by position: the
    list of originals may still contain PDFs or files that were skipped
    when the temp images were created. Each original PNG is therefore
    matched to the temp image named
    ``<original dir>/processed_files/<original name>_temp.png``. If no
    original can be matched that way, the lists are paired positionally as
    a fallback for callers that pass already-aligned lists.

    Args:
        original_image_paths: Paths of the original images.
        temp_image_paths: Paths of the temp images.

    Returns:
        None.
    """
    if not original_image_paths or not temp_image_paths:
        print("[INFO] No files to process.")
        return

    # Look temp images up by normalised path so "./a" and "a" compare equal
    temp_by_path = {os.path.normpath(path): path for path in temp_image_paths}

    pairs = []
    for original_path in original_image_paths:
        if not original_path.lower().endswith('.png'):
            continue  # only PNGs ever get a temp image
        stem = os.path.splitext(os.path.basename(original_path))[0]
        expected_temp = os.path.normpath(
            os.path.join(os.path.dirname(original_path), "processed_files", f"{stem}_temp.png")
        )
        if expected_temp in temp_by_path:
            pairs.append((original_path, temp_by_path[expected_temp]))

    if not pairs:
        # Naming convention not in use: trust the caller's ordering
        pairs = list(zip(original_image_paths, temp_image_paths))

    for original_path, temp_path in pairs:
        if not os.path.exists(original_path) or not os.path.exists(temp_path):
            print(f"[ERROR] File not found: {original_path} or {temp_path}")
            continue

        process_file(original_path, temp_path)

    print("[INFO] Processing completed for all files.")
    print(f"[INFO] All generated JSON files with rectangles: {rectangle_json_files}")

# Both lists from the earlier cells are needed before this step can run
if 'files_to_process' not in globals() or 'temp_files' not in globals():
    print("[ERROR] 'files_to_process' or 'temp_files' not found. Please run previous cells to generate the required data.")
else:
    process_files(files_to_process, temp_files)


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os

# Global variables (reset to empty lists every time this cell runs)
processed_file_results = []  # one dict of output paths per processed image
detected_line_files = []  # paths of the "*_lines_detected.png" images
debug_images = []  # in-memory debug images with the labelled rectangles

# Function to check if a contour is a rectangle
def is_rectangle(approx):
    """Check if a contour approximates a rectangle.

    Args:
        approx: Polygon as produced by ``cv2.approxPolyDP`` (indexed as
            ``approx[i][0]`` -> ``(x, y)``).

    Returns:
        bool: True if the polygon has exactly four vertices, is convex, and
        the direction change between every pair of consecutive edges is
        within 10 degrees of 90; False otherwise.
    """
    if len(approx) != 4 or not cv2.isContourConvex(approx):
        return False

    for i in range(4):
        p1, p2, p3 = approx[i][0], approx[(i + 1) % 4][0], approx[(i + 2) % 4][0]
        # Difference between the headings of edge p1->p2 and edge p2->p3
        angle = np.arctan2(p2[1] - p1[1], p2[0] - p1[0]) - np.arctan2(p3[1] - p2[1], p3[0] - p2[0])
        angle = np.abs(np.degrees(angle))
        if angle > 180:
            # Fold e.g. 270 back to the equivalent 90
            angle = 360 - angle
        if not np.isclose(angle, 90, atol=10):
            return False
    return True

def process_file(image_path):
    """Process a single image file to detect lines and rectangles.

    Steps: detect rectangle-like contours, mask them out of the image,
    run a probabilistic Hough transform on what is left, save four result
    images to ``processed_files`` (relative to the current working
    directory), record their paths in the global ``processed_file_results``,
    keep the debug image in the global ``debug_images`` and show a 2x2
    overview figure.

    Args:
        image_path: Path of the image to process.

    Returns:
        None, both when the image cannot be read (an error is printed) and
        after a successful run.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"[ERROR] Image not found: {image_path}")
        return None

    # Extract the base name of the file (used as prefix of all output files)
    base_name = os.path.splitext(os.path.basename(image_path))[0]

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply edge detection
    edges = cv2.Canny(gray, 50, 200, apertureSize=3)

    # Find contours (outermost contours only)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Create blank images for masks and debug visualization
    mask = np.zeros(img.shape[:2], dtype=np.uint8)
    debug_img = img.copy()

    # Detect rectangles: simplify each contour to a polygon and test its shape
    rectangles = []
    for contour in contours:
        epsilon = 0.02 * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, epsilon, True)
        if is_rectangle(approx):
            rectangles.append(approx)

    # Draw rectangles on debug image and mask
    for i, rect in enumerate(rectangles):
        # Thickness -1 fills the rectangle in the mask
        cv2.drawContours(mask, [rect], 0, (255), -1)
        cv2.drawContours(debug_img, [rect], 0, (0, 255, 0), 4)

        # Label each rectangle at its centroid (falls back to (0, 0) for a
        # zero-area polygon)
        M = cv2.moments(rect)
        if M["m00"] != 0:
            cX = int(M["m10"] / M["m00"])
            cY = int(M["m01"] / M["m00"])
        else:
            cX, cY = 0, 0
        cv2.putText(debug_img, str(i+1), (cX, cY), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 255, 0), 3)

    # Create an inverted mask (255 everywhere except inside the rectangles)
    inverted_mask = cv2.bitwise_not(mask)

    # Remove rectangles using the inverted mask
    result_no_rectangles = cv2.bitwise_and(img, img, mask=inverted_mask)

    # Fine-tune the mask: grow it (10x10 kernel, 2 iterations) before
    # removing the rectangles a second time
    kernel = np.ones((10, 10), np.uint8)
    dilated_mask = cv2.dilate(mask, kernel, iterations=2)
    inverted_dilated_mask = cv2.bitwise_not(dilated_mask)
    result_clean = cv2.bitwise_and(img, img, mask=inverted_dilated_mask)

    # Detect edges in the cleaned image
    gray_clean = cv2.cvtColor(result_clean, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray_clean, (5, 5), 0)
    edges_clean = cv2.Canny(blurred, 50, 150, apertureSize=3)

    # Detect lines using Hough Line Transform
    lines = cv2.HoughLinesP(edges_clean, rho=1, theta=np.pi / 180, threshold=30, minLineLength=30, maxLineGap=20)

    # Draw detected lines on a copy of the cleaned image
    line_img = result_clean.copy()
    if lines is not None:
        print(f"[INFO] Number of lines detected: {len(lines)}")
        for idx, line in enumerate(lines):
            x1, y1, x2, y2 = line[0]
            cv2.line(line_img, (x1, y1), (x2, y2), (0, 0, 255), 3)
    else:
        print("[INFO] No lines detected.")

    # Save results
    output_dir = "processed_files"  # Create the processed_files folder in the current directory
    os.makedirs(output_dir, exist_ok=True)

    debug_path = os.path.join(output_dir, f"{base_name}_debug_rectangles.png")
    no_rectangles_path = os.path.join(output_dir, f"{base_name}_no_rectangles.png")
    cleaned_path = os.path.join(output_dir, f"{base_name}_cleaned.png")
    lines_detected_path = os.path.join(output_dir, f"{base_name}_lines_detected.png")

    cv2.imwrite(debug_path, debug_img)
    cv2.imwrite(no_rectangles_path, result_no_rectangles)
    cv2.imwrite(cleaned_path, result_clean)
    cv2.imwrite(lines_detected_path, line_img)

    # Store paths in global variables
    processed_file_results.append({
        "file": base_name,
        "debug_rectangles": debug_path,
        "no_rectangles": no_rectangles_path,
        "cleaned": cleaned_path,
        "lines_detected": lines_detected_path,
    })
    debug_images.append(debug_img)

    # Display results as a 2x2 grid (OpenCV images are BGR, matplotlib expects RGB)
    fig, axs = plt.subplots(2, 2, figsize=(20, 20))
    fig.suptitle(f"Results for {base_name}", fontsize=16)

    axs[0, 0].imshow(cv2.cvtColor(debug_img, cv2.COLOR_BGR2RGB))
    axs[0, 0].set_title("Debug Rectangles")
    axs[0, 0].axis('off')

    axs[0, 1].imshow(cv2.cvtColor(result_no_rectangles, cv2.COLOR_BGR2RGB))
    axs[0, 1].set_title("Image Without Rectangles")
    axs[0, 1].axis('off')

    axs[1, 0].imshow(cv2.cvtColor(result_clean, cv2.COLOR_BGR2RGB))
    axs[1, 0].set_title("Cleaned Image")
    axs[1, 0].axis('off')

    axs[1, 1].imshow(cv2.cvtColor(line_img, cv2.COLOR_BGR2RGB))
    axs[1, 1].set_title("Detected Lines")
    axs[1, 1].axis('off')

    plt.tight_layout()
    plt.show()

    print(f"[INFO] Debug image saved at {debug_path}")

def process_files(temp_image_paths):
    """Run ``process_file`` on every existing path and publish the results.

    Missing paths are reported and skipped. Afterwards the global
    ``detected_line_files`` is rebuilt from the ``"lines_detected"`` entry
    of every record in ``processed_file_results``. Nothing is returned.
    """
    global detected_line_files
    if not temp_image_paths:
        print("[INFO] No files to process.")
        return

    for image_path in temp_image_paths:
        if os.path.exists(image_path):
            process_file(image_path)
        else:
            print(f"[ERROR] File not found: {image_path}")

    # Collect first, publish afterwards
    line_image_paths = []
    for result in processed_file_results:
        line_image_paths.append(result["lines_detected"])
    detected_line_files = line_image_paths
    print(f"[INFO] Processed {len(detected_line_files)} files.")

# Run rectangle removal and line detection on the temp images
if 'temp_files' not in globals():
    print("[ERROR] 'temp_files' not found. Please provide a valid list of image paths.")
else:
    process_files(temp_files)


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import json

# Helper function to calculate the distance between two points
def distance(p1, p2):
    """Return the Euclidean distance between the 2-D points ``p1`` and ``p2``."""
    delta_x = p1[0] - p2[0]
    delta_y = p1[1] - p2[1]
    return np.sqrt(delta_x ** 2 + delta_y ** 2)

# Helper function to calculate the midpoint of a line
def midpoint(p1, p2):
    """Return the midpoint of ``p1`` and ``p2``, using floor division per axis."""
    mid_x = (p1[0] + p2[0]) // 2
    mid_y = (p1[1] + p2[1]) // 2
    return (mid_x, mid_y)

# Helper function to check if two points are close
def points_are_close(p1, p2, distance_thresh=10):
    """Tell whether ``p1`` and ``p2`` are strictly closer than ``distance_thresh``."""
    gap = distance(p1, p2)
    return gap < distance_thresh

# Update function to combine lines based on both proximity and overlapping points
def should_combine(line1, line2, distance_thresh=10):
    """Tell whether two segments share a (nearly) common endpoint.

    Each line is a pair of points. The lines qualify when any endpoint of
    ``line1`` is closer than ``distance_thresh`` to any endpoint of
    ``line2``.
    """
    endpoint_pairs = (
        (line1[0], line2[0]),
        (line1[1], line2[1]),
        (line1[0], line2[1]),
        (line1[1], line2[0]),
    )
    return any(points_are_close(a, b, distance_thresh) for a, b in endpoint_pairs)

# Function to combine two lines
def combine_lines(line1, line2):
    """Merge two segments into the segment spanning their farthest endpoints.

    All four endpoints are compared pairwise; the pair with the greatest
    distance is returned as ``(start, end)``. When every distance is zero
    the endpoints of ``line1`` are returned.
    """
    endpoints = [line1[0], line1[1], line2[0], line2[1]]
    index_pairs = [
        (a, b)
        for a in range(len(endpoints))
        for b in range(a + 1, len(endpoints))
    ]

    farthest = (endpoints[0], endpoints[1])
    longest = 0
    for a, b in index_pairs:
        span = distance(endpoints[a], endpoints[b])
        if span > longest:
            longest = span
            farthest = (endpoints[a], endpoints[b])
    return farthest

# Global list to store JSON file paths. It is created only if it does not
# exist yet, so re-running this cell keeps the paths from earlier runs.
if 'json_files' not in globals():
    json_files = []

def process_file(original_image_path):
    """Process a single file to detect and combine lines.

    Line segments are detected with Canny + probabilistic Hough transform
    and then merged greedily: each segment is combined with the first
    already-collected line that ``should_combine`` accepts, otherwise it
    starts a new line. The raw and combined lines are written to
    ``<name>_lines.json``, ``<name>_lines.png`` and
    ``<name>_combined_lines.png`` in the directory of the input image, the
    JSON path is appended to the global ``json_files``, and both images are
    displayed side by side.

    Args:
        original_image_path: Path of the image to analyse.

    Returns:
        None, both when the image cannot be read (an error is printed) and
        after a successful run.
    """
    img = cv2.imread(original_image_path)
    if img is None:
        print(f"[ERROR] Image not found: {original_image_path}")
        return None

    base_name = os.path.splitext(os.path.basename(original_image_path))[0]
    gray_clean = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges_clean = cv2.Canny(gray_clean, 10, 150, apertureSize=7, L2gradient=True)

    lines = cv2.HoughLinesP(edges_clean, rho=1, theta=np.pi / 180, threshold=10, minLineLength=5, maxLineGap=15)

    # Draw and number every raw segment; endpoints are kept as plain ints
    line_img = img.copy()
    line_points = []
    if lines is not None:
        for idx, line in enumerate(lines):
            x1, y1, x2, y2 = map(int, line[0])
            cv2.line(line_img, (x1, y1), (x2, y2), (0, 255, 255), 2)
            line_points.append(((x1, y1), (x2, y2)))
            mid_x, mid_y = (x1 + x2) // 2, (y1 + y2) // 2
            cv2.putText(line_img, f'{idx+1}', (mid_x, mid_y), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 3, cv2.LINE_AA)

    # Greedy single-pass merge: a segment joins the first combined line it
    # matches; the result therefore depends on the order of the segments
    combined_lines = []
    for line1 in line_points:
        merged = False
        for i, line2 in enumerate(combined_lines):
            if should_combine(line1, line2):
                combined_lines[i] = combine_lines(line1, line2)
                merged = True
                break
        if not merged:
            combined_lines.append(line1)

    # Draw and number the combined lines on a fresh copy of the input
    line_img_combined = img.copy()
    for idx, line in enumerate(combined_lines):
        x1, y1 = line[0]
        x2, y2 = line[1]
        cv2.line(line_img_combined, (x1, y1), (x2, y2), (255, 0, 0), 2)
        mid_x, mid_y = (x1 + x2) // 2, (y1 + y2) // 2
        cv2.putText(line_img_combined, f'{idx+1}', (mid_x, mid_y), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 3, cv2.LINE_AA)

    # Outputs are written next to the input image
    json_output_path = os.path.join(os.path.dirname(original_image_path), f"{base_name}_lines.json")
    line_data = {
        "lines": [{"start": tuple(map(int, line[0])), "end": tuple(map(int, line[1]))} for line in line_points],
        "combined_lines": [{"start": tuple(map(int, line[0])), "end": tuple(map(int, line[1]))} for line in combined_lines]
    }
    with open(json_output_path, "w") as f:
        json.dump(line_data, f, indent=4)
    print(f"[INFO] Line data saved at: {json_output_path}")

    # Append JSON file path to the global list
    json_files.append(json_output_path)

    cv2.imwrite(os.path.join(os.path.dirname(original_image_path), f"{base_name}_lines.png"), line_img)
    cv2.imwrite(os.path.join(os.path.dirname(original_image_path), f"{base_name}_combined_lines.png"), line_img_combined)

    # Show raw and combined lines side by side (BGR -> RGB for matplotlib)
    fig, axs = plt.subplots(1, 2, figsize=(96, 64))
    fig.suptitle(f"Line Detection for {base_name}", fontsize=36)

    axs[0].imshow(cv2.cvtColor(line_img, cv2.COLOR_BGR2RGB))
    axs[0].set_title("Detected Lines")
    axs[0].axis('off')

    axs[1].imshow(cv2.cvtColor(line_img_combined, cv2.COLOR_BGR2RGB))
    axs[1].set_title("Combined Lines")
    axs[1].axis('off')

    plt.tight_layout()
    plt.show()

    print(f"Detected {len(combined_lines)} combined lines.")
    for idx, points in enumerate(combined_lines):
        print(f"Combined Line {idx+1}: Start {points[0]}, End {points[1]}")

def process_files(image_paths):
    """Run line detection (``process_file``) on every existing image path.

    Missing paths are reported and skipped. A summary including the global
    ``json_files`` list is printed at the end. Nothing is returned.
    """
    if not image_paths:
        print("[INFO] No files to process.")
        return

    for image_path in image_paths:
        if os.path.exists(image_path):
            process_file(image_path)
        else:
            print(f"[ERROR] File not found: {image_path}")

    print("[INFO] Line detection completed for all files.")
    print(f"[INFO] JSON files generated: {json_files}")

# Needs the "*_lines_detected.png" paths published by the previous cell
if 'detected_line_files' not in globals():
    print("[ERROR] 'detected_line_files' not found. Please run the previous cell to generate detected lines images.")
else:
    process_files(detected_line_files)


In [None]:
# Debugging code to check files in temp_files and json_files
import os  # imported here so the cell does not rely on an earlier cell's import

if 'temp_files' in globals():
    print("[INFO] Checking files in temp_files...")
    # temp_files can be None when there was nothing to process, so fall back
    # to an empty list instead of failing on iteration
    for file in temp_files or []:
        if os.path.exists(file):
            print(f"[FOUND] {file}")
        else:
            print(f"[MISSING] {file}")
else:
    print("[ERROR] 'temp_files' is not defined.")

if 'json_files' in globals():
    print("[INFO] Checking files in json_files...")
    for file in json_files or []:
        if os.path.exists(file):
            print(f"[FOUND] {file}")
        else:
            print(f"[MISSING] {file}")
else:
    print("[ERROR] 'json_files' is not defined.")


In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import json

# Helper function to calculate the distance between two points
def distance(p1, p2):
    """Return the Euclidean distance between two (x, y) points."""
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return np.sqrt(dx ** 2 + dy ** 2)

# Helper function to calculate the midpoint of a line
def midpoint(p1, p2):
    """Return the floor-divided midpoint of two (x, y) points as a tuple."""
    mid_x = (p1[0] + p2[0]) // 2
    mid_y = (p1[1] + p2[1]) // 2
    return mid_x, mid_y

# Helper function to check if two points are close
def points_are_close(p1, p2, distance_thresh=10):
    """Return whether ``p1`` and ``p2`` are strictly less than ``distance_thresh`` apart."""
    separation = distance(p1, p2)
    return separation < distance_thresh

# Update function to combine lines based on both proximity and overlapping points
def should_combine(line1, line2, distance_thresh=10):
    """Return True when any endpoint of ``line1`` is close to any endpoint of ``line2``.

    Each line is a ``(start, end)`` pair; closeness is decided by
    ``points_are_close`` with ``distance_thresh``.
    """
    # (index into line1, index into line2), checked in this order.
    endpoint_pairs = ((0, 0), (1, 1), (0, 1), (1, 0))
    return any(
        points_are_close(line1[a], line2[b], distance_thresh)
        for a, b in endpoint_pairs
    )

# Function to combine two lines
def combine_lines(line1, line2):
    """Merge two lines into the segment joining their two most distant endpoints.

    All six pairings of the four endpoints are considered; the first pairing
    with the largest distance is returned as ``(start_point, end_point)``.
    """
    endpoints = [line1[0], line1[1], line2[0], line2[1]]
    candidate_pairs = [
        (endpoints[a], endpoints[b])
        for a in range(len(endpoints))
        for b in range(a + 1, len(endpoints))
    ]
    # max() keeps the earliest pair on ties, so identical points yield line1 itself.
    return max(candidate_pairs, key=lambda pair: distance(pair[0], pair[1]))

# Function to process a single temp image and generate combined lines and labels
def process_file(temp_image_path, json_file_path):
    """Merge nearby detected lines for one image, then save, show and print them.

    The image is read from ``temp_image_path`` and the segments from the
    ``"lines"`` list in ``json_file_path`` (each entry provides ``"start"`` and
    ``"end"`` points).  Segments are merged greedily in file order: a segment
    is folded into the first already-merged segment accepted by
    ``should_combine``; otherwise it starts a new merged segment.  The labelled
    result is written to ``<image stem>_combined_lines.png`` beside the input
    image, displayed with matplotlib, and the merged endpoints are printed.

    Returns ``None``; an unreadable image is reported and nothing else happens.
    """
    source_img = cv2.imread(temp_image_path)
    if source_img is None:
        print(f"[ERROR] Image not found: {temp_image_path}")
        return None

    image_dir, image_file = os.path.split(temp_image_path)
    base_name = os.path.splitext(image_file)[0]

    with open(json_file_path, "r") as handle:
        line_data = json.load(handle)

    line_points = [
        (tuple(entry["start"]), tuple(entry["end"]))
        for entry in line_data["lines"]
    ]

    # Overlay of the raw detected lines with 1-based numbers.
    # This overlay is neither saved nor displayed by this function.
    line_img = source_img.copy()
    for number, (start, end) in enumerate(line_points, start=1):
        x1, y1 = start
        x2, y2 = end
        cv2.line(line_img, (x1, y1), (x2, y2), (0, 255, 255), 2)
        label_at = ((x1 + x2) // 2, (y1 + y2) // 2)
        cv2.putText(line_img, f'{number}', label_at, cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 3, cv2.LINE_AA)

    # Greedy single-pass merge: fold each line into the first compatible merged line.
    combined_lines = []
    for candidate in line_points:
        for slot, existing in enumerate(combined_lines):
            if should_combine(candidate, existing):
                combined_lines[slot] = combine_lines(candidate, existing)
                break
        else:
            combined_lines.append(candidate)

    # Draw merged lines ((255, 255, 0) is cyan in BGR order) with white numbers.
    line_img_combined = source_img.copy()
    for number, (start, end) in enumerate(combined_lines, start=1):
        x1, y1 = start
        x2, y2 = end
        cv2.line(line_img_combined, (x1, y1), (x2, y2), (255, 255, 0), 2)
        mid_x, mid_y = midpoint((x1, y1), (x2, y2))
        cv2.putText(line_img_combined, f'{number}', (mid_x, mid_y), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)

    # Persist the merged-line visualisation next to the input image.
    combined_path = os.path.join(image_dir, f"{base_name}_combined_lines.png")
    cv2.imwrite(combined_path, line_img_combined)

    # Show it inline (converted to RGB for matplotlib).
    plt.figure(figsize=(96, 48))
    plt.title(f"Combined Lines for {base_name}")
    plt.imshow(cv2.cvtColor(line_img_combined, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

    # Textual summary of the merged segments.
    print(f"Detected {len(combined_lines)} combined lines.")
    for number, (start, end) in enumerate(combined_lines, start=1):
        print(f"Combined Line {number}: Start {start}, End {end}")

# Function to process all temp images (temp_files) and their respective JSON files
def process_files(temp_files, json_files):
    """Combine detected lines for each (image, JSON) pair.

    ``temp_files`` and ``json_files`` are paired by position.  A pair whose
    image or JSON file is missing on disk is reported and skipped.  If the two
    lists differ in length a warning is printed, because only the leading
    pairs can be matched and the surplus entries are ignored.

    Returns ``None``.
    """
    if not temp_files or not json_files:
        print("[INFO] No files to process.")
        return

    if len(temp_files) != len(json_files):
        # zip() below stops at the shorter list without any notice.
        print(
            f"[WARNING] {len(temp_files)} image files but {len(json_files)} JSON files; "
            "only the first matching pairs will be processed."
        )

    for temp_image_path, json_file_path in zip(temp_files, json_files):
        if not os.path.exists(temp_image_path):
            print(f"[ERROR] File not found: {temp_image_path}")
            continue
        if not os.path.exists(json_file_path):
            print(f"[ERROR] JSON file not found: {json_file_path}")
            continue
        process_file(temp_image_path, json_file_path)

    print("[INFO] Line detection and combination completed for all files.")

# Combine lines for every (image, JSON) pair once both notebook lists exist.
if 'temp_files' not in globals() or 'json_files' not in globals():
    print("[ERROR] 'temp_files' or 'json_files' not found. Please provide valid lists of temp image paths and JSON files.")
else:
    process_files(temp_files, json_files)


In [None]:
import json
import os

def find_processed_files_directory(file_paths):
    """
    Locate the 'processed_files' directory that sits beside the input folder.

    The folder holding the first entry of ``file_paths`` is taken as the input
    folder; 'processed_files' is expected to be its sibling (a child of the
    input folder's parent).  The first path is made absolute before its
    parents are taken, so bare file names and other relative paths resolve
    against the current working directory.

    Returns the directory path, or ``None`` (after printing an error) when
    ``file_paths`` is empty or no such directory exists.
    """
    if not file_paths:
        print("[ERROR] No files provided to locate 'processed_files' directory.")
        return None

    # Without abspath(), a bare name such as "page.png" has dirname "" and the
    # lookup would land in the working directory instead of its parent.
    input_directory = os.path.dirname(os.path.abspath(file_paths[0]))
    processed_files_path = os.path.join(os.path.dirname(input_directory), "processed_files")

    # Require a real directory; a regular file with that name is not usable.
    if os.path.isdir(processed_files_path):
        return processed_files_path

    print("[ERROR] 'processed_files' directory not found.")
    return None

def filter_and_update_rectangles_for_files(file_paths):
    """
    Remove OCR-failure rectangles from each PNG's '<name>_text.json' file.

    For every '.png' path in ``file_paths`` the matching JSON file is looked up
    in the 'processed_files' directory, entries whose "Text" contains
    "Error during OCR" are dropped from its "rectangles" list, and the file is
    rewritten in place.  Non-PNG paths, missing JSON files and files without
    rectangles are reported and skipped.
    """
    processed_files_directory = find_processed_files_directory(file_paths)
    if not processed_files_directory:
        print("[ERROR] Unable to proceed without 'processed_files' directory.")
        return

    print(f"[INFO] Located 'processed_files' directory: {processed_files_directory}")

    for source_path in file_paths:
        if not source_path.lower().endswith('.png'):
            print(f"[INFO] Skipping unsupported file type: {source_path}")
            continue

        # The JSON file shares the image's stem: <stem>_text.json
        stem, _ = os.path.splitext(os.path.basename(source_path))
        json_path = os.path.join(processed_files_directory, f"{stem}_text.json")
        if not os.path.exists(json_path):
            print(f"[ERROR] JSON file not found: {json_path}. Skipping this file.")
            continue

        print(f"[INFO] Processing rectangles for file: {json_path}")

        with open(json_path, "r") as infile:
            existing_data = json.load(infile)

        rectangles = existing_data.get("rectangles", [])
        if not rectangles:
            print(f"[INFO] No rectangles found in {json_path}. Skipping.")
            continue

        # Keep only entries whose OCR text carries no error marker.
        filtered_rectangles = [
            entry for entry in rectangles
            if "Error during OCR" not in entry["Text"]
        ]

        print("Filtered Rectangles and Texts:")
        for entry in filtered_rectangles:
            print(f"Rectangle {entry['Rectangle No']}: {entry['Text']}")
        print(f"Number of rectangles after filtering: {len(filtered_rectangles)}")

        # Write the reduced list back into the same file.
        existing_data["rectangles"] = filtered_rectangles
        with open(json_path, "w") as outfile:
            json.dump(existing_data, outfile, indent=4)

        print(f"[SUCCESS] Filtered rectangle texts updated in '{json_path}'\n")

# Example usage: run only when the file-selection cell has defined 'files_to_process',
# mirroring the guards used by the other cells instead of failing with a NameError.
if 'files_to_process' in globals():
    filter_and_update_rectangles_for_files(files_to_process)
else:
    print("[ERROR] 'files_to_process' is not defined. Please run the file selection cell first.")


In [None]:
import json
import os

# Debug function to check for 'rectangles' key in all JSON files stored in rectangle_json_files
def debug_check_rectangle_json_files():
    """
    Print, for every path in the notebook-level ``rectangle_json_files`` list,
    whether the file exists, parses as JSON and has a 'rectangles' key.

    Nothing is returned; problems are reported on stdout and never raised.
    """
    tracked_files = globals().get('rectangle_json_files')
    if not tracked_files:
        print("[ERROR] 'rectangle_json_files' is not defined or is empty. Please run the processing step first.")
        return

    print("[INFO] Starting debug for rectangle JSON files...")
    for json_file in tracked_files:
        if not os.path.exists(json_file):
            print(f"[ERROR] File not found: {json_file}")
            continue

        print(f"[INFO] Inspecting file: {json_file}")
        try:
            with open(json_file, "r") as handle:
                payload = json.load(handle)

            if "rectangles" not in payload:
                print(f"[INFO] 'rectangles' key NOT found in: {json_file}")
                print(f"[INFO] Available keys in this file: {list(payload.keys())}")
            else:
                print(f"[SUCCESS] 'rectangles' key found in: {json_file}")
                print(f"[INFO] Number of rectangles: {len(payload['rectangles'])}")

                # Show the first entry as a sample when there is one.
                if payload["rectangles"]:
                    print(f"[INFO] Example rectangle data: {payload['rectangles'][0]}")

        except json.JSONDecodeError:
            print(f"[ERROR] Failed to decode JSON in file: {json_file}")
        except Exception as e:
            # Debug helper: report anything else and move on to the next file.
            print(f"[ERROR] Unexpected error processing {json_file}: {e}")

# Example usage: inspect the rectangle JSON files once the list exists.
if 'rectangle_json_files' not in globals():
    print("[ERROR] 'rectangle_json_files' is not defined. Please ensure the JSON files are generated and stored.")
else:
    debug_check_rectangle_json_files()


In [None]:
import os
import json
import numpy as np

def debug_environment():
    """
    Check that the notebook lists ``rectangle_json_files`` and ``json_files``
    are defined, non-empty and point at existing files.

    Found files are printed as they are checked; all problems are collected
    and printed together at the end, or a success line if there were none.
    """
    # (notebook variable, kind of data named in its "is empty" message)
    tracked_lists = (
        ('rectangle_json_files', 'rectangle'),
        ('json_files', 'line'),
    )
    namespace = globals()
    errors = []

    for list_name, data_kind in tracked_lists:
        if list_name not in namespace:
            errors.append(f"[ERROR] '{list_name}' variable is not defined.")
            continue

        paths = namespace[list_name]
        if not paths:
            errors.append(f"[ERROR] '{list_name}' is empty. Please ensure {data_kind} data is processed.")
            continue

        print(f"[INFO] '{list_name}' is defined and contains the following files:")
        for path in paths:
            if os.path.exists(path):
                print(f"[INFO] Found: {path}")
            else:
                errors.append(f"[ERROR] File not found: {path}")

    if errors:
        print("\n".join(errors))
    else:
        print("[SUCCESS] All required variables and files are correctly defined and exist.")

def validate_json_files(json_file_list, required_keys):
    """
    Report, for each path in ``json_file_list``, whether the parsed JSON
    contains every key listed in ``required_keys``.

    Missing files, undecodable JSON and any other read error are printed and
    the loop moves on; nothing is returned or raised.
    """
    for path in json_file_list:
        if not os.path.exists(path):
            print(f"[ERROR] File not found: {path}")
            continue

        try:
            with open(path, "r") as handle:
                payload = json.load(handle)

            absent = [key for key in required_keys if key not in payload]
            if not absent:
                print(f"[INFO] File {path} contains all required keys: {required_keys}")
            else:
                print(f"[ERROR] Missing keys {absent} in file: {path}")
        except json.JSONDecodeError:
            print(f"[ERROR] Failed to decode JSON in file: {path}")
        except Exception as e:
            # Debug helper: report and continue with the next file.
            print(f"[ERROR] Unexpected error while reading {path}: {e}")

# Run the debug environment check
print("[DEBUG] Validating environment...")
debug_environment()

# Rectangle JSON files must expose a top-level "rectangles" key
if globals().get('rectangle_json_files'):
    print("\n[DEBUG] Validating rectangle JSON files for required keys...")
    validate_json_files(rectangle_json_files, required_keys=["rectangles"])

# Line JSON files must expose a top-level "combined_lines" key
if globals().get('json_files'):
    print("\n[DEBUG] Validating line JSON files for required keys...")
    validate_json_files(json_files, required_keys=["combined_lines"])


In [None]:
import numpy as np
import json
import os

# Global list of adjacency-matrix JSON paths. compute_adjacency_matrix_via_lines()
# appends one path per matrix it saves; running this cell again resets the list.
adjacency_matrix_files = []

def point_to_line_distance(line_start, line_end, point):
    """
    Return the shortest distance from ``point`` to the segment
    ``line_start``-``line_end`` (all given as 2-element coordinates).

    The point is projected onto the segment's direction; projections that fall
    before the start or beyond the end are measured to that endpoint instead.
    """
    seg_start = np.array(line_start).reshape(2)
    seg_end = np.array(line_end).reshape(2)
    target = np.array(point).reshape(2)

    segment = seg_end - seg_start
    offset = target - seg_start

    seg_length = np.linalg.norm(segment)
    if seg_length == 0:
        # Degenerate segment (start == end): plain point-to-point distance.
        return np.linalg.norm(offset)

    direction = segment / seg_length
    along = np.dot(offset, direction)

    if along < 0:
        return np.linalg.norm(target - seg_start)
    if along > seg_length:
        return np.linalg.norm(target - seg_end)
    return np.linalg.norm(target - (seg_start + along * direction))

def is_point_close_to_rectangle(point, rectangle, threshold):
    """
    Return True when ``point`` lies closer than ``threshold`` to any of the
    four edges joining consecutive corners of ``rectangle``.

    Raises ValueError when ``rectangle`` does not have exactly 4 points.
    """
    if len(rectangle) != 4:
        raise ValueError("Rectangle must be a list of 4 points.")

    # Consecutive corner pairs, wrapping from the last corner back to the first.
    edges = [(rectangle[k], rectangle[(k + 1) % 4]) for k in range(4)]

    return any(
        point_to_line_distance(corner_a, corner_b, point) < threshold
        for corner_a, corner_b in edges
    )

def _last_rectangle_near_point(point, rectangle_coords, threshold):
    """Return the index of the last rectangle with an edge within ``threshold`` of ``point``, or -1."""
    match = -1
    for i, rectangle in enumerate(rectangle_coords):
        if not rectangle:
            # No geometry stored; the slot exists only to keep indices aligned.
            continue
        try:
            if is_point_close_to_rectangle(point, rectangle, threshold):
                match = i
        except ValueError as e:
            print(f"Skipping invalid rectangle at index {i}: {e}")
    return match


def compute_adjacency_matrix_via_lines(rectangle_json_path, line_json_path, threshold=30):
    """
    Build a rectangle adjacency matrix from line endpoints and save it as JSON.

    Two rectangles are marked adjacent (symmetrically) when a line from the
    "combined_lines" list of ``line_json_path`` has its "start" within
    ``threshold`` of an edge of one rectangle and its "end" within
    ``threshold`` of an edge of a different rectangle.  When an endpoint is
    near several rectangles, the one with the highest index is used.

    The matrix has one row and column per entry of the "rectangles" list in
    ``rectangle_json_path``, in list order.  Entries without "Coordinates"
    keep their row and column (all False), so the matrix size always equals
    the number of rectangles.

    The matrix is written to '<rectangle json stem>_adjacency_matrix.json'
    next to the rectangle file and its path is appended to the global
    ``adjacency_matrix_files``.  If there are no rectangles, no usable
    coordinates or no lines, an error is printed and nothing is written.
    """
    global adjacency_matrix_files  # Use global to store paths dynamically

    # Load rectangle data
    with open(rectangle_json_path, "r") as rect_file:
        rectangle_data = json.load(rect_file)

    rectangles = rectangle_data.get("rectangles", [])
    if not rectangles:
        print(f"[ERROR] No 'rectangles' key found in {rectangle_json_path}.")
        return

    # One slot per rectangle: filtering here would shift every later index.
    rectangle_coords = [rect.get("Coordinates") for rect in rectangles]
    num_rectangles = len(rectangle_coords)

    if not any(rectangle_coords):
        print(f"[ERROR] No valid rectangle coordinates found in {rectangle_json_path}.")
        return

    # Load line data
    with open(line_json_path, "r") as line_file:
        line_data = json.load(line_file)

    lines = line_data.get("combined_lines", [])
    if not lines:
        print(f"[ERROR] No 'combined_lines' key found in {line_json_path}.")
        return

    print(f"[INFO] Number of rectangles: {num_rectangles}")
    print(f"[INFO] Number of lines: {len(lines)}")

    adjacency_matrix = np.zeros((num_rectangles, num_rectangles), dtype=bool)

    for line in lines:
        start_rect = _last_rectangle_near_point(line["start"], rectangle_coords, threshold)
        end_rect = _last_rectangle_near_point(line["end"], rectangle_coords, threshold)

        # A line only links rectangles when both ends land on different ones.
        if start_rect != -1 and end_rect != -1 and start_rect != end_rect:
            adjacency_matrix[start_rect, end_rect] = True
            adjacency_matrix[end_rect, start_rect] = True  # Symmetrical adjacency

    # Save adjacency matrix to a JSON file beside the rectangle file
    adjacency_output_path = os.path.join(
        os.path.dirname(rectangle_json_path),
        f"{os.path.splitext(os.path.basename(rectangle_json_path))[0]}_adjacency_matrix.json"
    )
    with open(adjacency_output_path, "w") as outfile:
        json.dump(adjacency_matrix.tolist(), outfile, indent=4)

    print(f"[SUCCESS] Adjacency matrix saved to: {adjacency_output_path}\n")

    # Add the adjacency matrix file path to the global list
    adjacency_matrix_files.append(adjacency_output_path)

def compute_adjacency_matrices_via_lines(rectangle_json_files, line_json_files, threshold=30):
    """
    Compute and save an adjacency matrix for each (rectangle JSON, line JSON) pair.

    The two lists are paired by position and must have equal length; pairs with
    a missing file are reported and skipped.  Afterwards every path collected
    in the global ``adjacency_matrix_files`` is listed.
    """
    if not (rectangle_json_files and line_json_files):
        print("[ERROR] Either 'rectangle_json_files' or 'line_json_files' is empty.")
        return

    # Positional pairing only makes sense for lists of the same length.
    rect_count, line_count = len(rectangle_json_files), len(line_json_files)
    if rect_count != line_count:
        print("[ERROR] Mismatch in number of rectangle and line files.")
        print(f"Rectangle files: {rect_count}, Line files: {line_count}")
        return

    for rect_path, line_path in zip(rectangle_json_files, line_json_files):
        if not os.path.exists(rect_path):
            print(f"[ERROR] Rectangle file not found: {rect_path}. Skipping.")
        elif not os.path.exists(line_path):
            print(f"[ERROR] Line file not found: {line_path}. Skipping.")
        else:
            print(f"[INFO] Processing rectangle file: {rect_path} with line file: {line_path}")
            compute_adjacency_matrix_via_lines(rect_path, line_path, threshold=threshold)

    # Summarise everything collected in the global list so far.
    print("\n[INFO] All generated adjacency matrix files:")
    for saved_path in adjacency_matrix_files:
        print(f" - {saved_path}")

# Usage Example: needs both notebook lists to be defined and non-empty.
if not (globals().get('rectangle_json_files') and globals().get('json_files')):
    print("[ERROR] Required JSON files or variables are missing. Ensure all data is generated.")
else:
    print("[INFO] Using dynamically generated rectangle and line JSON files.")
    compute_adjacency_matrices_via_lines(rectangle_json_files, json_files, threshold=30)


In [None]:
import numpy as np
import os
import json

def process_connections_for_files(rectangle_json_files, adjacency_matrix_files):
    """
    Turn each adjacency matrix into a mapping of rectangle labels to neighbours.

    The two lists are paired by position.  For each pair, rectangles are
    labelled "Rectangle 1".."Rectangle N" by their position in the JSON
    "rectangles" list, and each label maps to the labels of the rectangles
    marked True in its matrix row.  Pairs with a missing file, no rectangles or
    a row-count mismatch are reported and skipped.  All connections are printed.

    Returns a dict keyed by the rectangle file name with "_text.json" removed.
    Raises ValueError when the two lists differ in length.
    """
    if len(rectangle_json_files) != len(adjacency_matrix_files):
        raise ValueError("Mismatch between the number of rectangle JSON files and adjacency matrix files.")

    all_connections = {}

    for rect_file, adj_file in zip(rectangle_json_files, adjacency_matrix_files):
        if not os.path.exists(rect_file):
            print(f"[ERROR] Rectangle file not found: {rect_file}. Skipping.")
            continue
        if not os.path.exists(adj_file):
            print(f"[ERROR] Adjacency matrix file not found: {adj_file}. Skipping.")
            continue

        with open(rect_file, "r") as handle:
            rectangle_count = len(json.load(handle).get("rectangles", []))
        if rectangle_count == 0:
            print(f"[ERROR] No rectangles found in file: {rect_file}. Skipping.")
            continue

        with open(adj_file, "r") as handle:
            adjacency = np.array(json.load(handle), dtype=bool)
        if adjacency.shape[0] != rectangle_count:
            print(f"[ERROR] Mismatch between number of rectangles and adjacency matrix size for file: {rect_file}. Skipping.")
            continue

        # Row i / column j of the matrix correspond to "Rectangle i+1" / "Rectangle j+1".
        node_names = [f"Rectangle {number}" for number in range(1, rectangle_count + 1)]
        file_key = os.path.basename(rect_file).replace("_text.json", "")
        all_connections[file_key] = {
            node_names[row]: [
                node_names[col] for col in range(rectangle_count) if adjacency[row, col]
            ]
            for row in range(rectangle_count)
        }

    # Human-readable report of everything collected.
    print("\nList of Connections for All Files:")
    for file_key, file_connections in all_connections.items():
        print(f"\nConnections for {file_key}:")
        for source, targets in file_connections.items():
            if not targets:
                print(f"  {source} has no connections.")
                continue
            joined = ', '.join(targets)
            print(f"  {source} is linked with: {joined}")

    return all_connections

# Example Usage: build the connection lists once both notebook lists exist.
if 'rectangle_json_files' not in globals() or 'adjacency_matrix_files' not in globals():
    print("[ERROR] Required files (rectangle_json_files or adjacency_matrix_files) are not available.")
else:
    connections = process_connections_for_files(rectangle_json_files, adjacency_matrix_files)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import json
import os

def plot_adjacency_matrix_with_labels(adjacency_matrix, labels, title):
    """
    Plot an adjacency matrix with labelled axes, save it as a PNG and show it.

    ``labels`` supplies one tick label per matrix row/column.  The image is
    saved in the current working directory under a file name derived from
    ``title`` (spaces replaced by underscores, lower-cased, suffixed with
    '_adjacency_matrix.png').

    Raises ValueError when ``len(labels)`` differs from the number of rows.
    """
    num_rectangles = adjacency_matrix.shape[0]

    # Validate that the number of labels matches the adjacency matrix size
    if num_rectangles != len(labels):
        raise ValueError("Number of labels does not match the dimensions of the adjacency matrix.")

    fig = plt.figure(figsize=(120, 120))

    # Draw on the figure created above. A hard-coded fignum=1 would target
    # whichever figure happens to be number 1 when other figures are open.
    plt.matshow(adjacency_matrix, cmap='Blues', fignum=fig.number)

    # Add title and labels
    plt.title(title, pad=80, fontsize=80)
    plt.xlabel('Rectangles', labelpad=50, fontsize=60)
    plt.ylabel('Rectangles', labelpad=50, fontsize=60)

    # Add custom labels to x and y axes
    ticks = np.arange(num_rectangles)
    plt.xticks(ticks=ticks, labels=labels, rotation=90, ha='center', fontsize=50)
    plt.yticks(ticks=ticks, labels=labels, fontsize=50)

    plt.grid(False)

    # Add colorbar with adjusted font sizes
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=40)
    cbar.set_label('Connection (0=No, 1=Yes)', fontsize=50)

    # Annotate cells with values, using white text on the darker (connected) cells
    for (i, j), value in np.ndenumerate(adjacency_matrix):
        color = 'white' if value == 1 else 'black'
        plt.text(j, i, f'{value:.0f}', ha='center', va='center', color=color, fontsize=40)

    # Save the plot
    plot_filename = f"{title.replace(' ', '_').lower()}_adjacency_matrix.png"
    plt.savefig(plot_filename, bbox_inches='tight')
    print(f"[INFO] Adjacency matrix plot saved as: {plot_filename}")

    # Show the plot in the notebook, then release this figure's memory
    plt.show()
    plt.close(fig)

def process_and_plot_adjacency_matrices(rectangle_json_files, adjacency_matrix_files):
    """
    Plot every adjacency matrix with "Rectangle N" labels taken from its rectangle file.

    The two lists are paired by position and must have equal length.  Pairs
    with a missing file, no rectangles or a size mismatch are reported and
    skipped.  The plot title is derived from the rectangle file name.
    """
    if len(rectangle_json_files) != len(adjacency_matrix_files):
        print("[ERROR] Mismatch in the number of rectangle and adjacency matrix files.")
        return

    for rect_path, adj_path in zip(rectangle_json_files, adjacency_matrix_files):
        # --- rectangle labels ---
        if not os.path.exists(rect_path):
            print(f"[ERROR] Rectangle JSON file not found: {rect_path}. Skipping.")
            continue

        with open(rect_path, "r") as handle:
            rectangles = json.load(handle).get("rectangles", [])
        if not rectangles:
            print(f"[ERROR] No 'rectangles' key found in {rect_path}. Skipping.")
            continue

        labels = [f"Rectangle {number}" for number in range(1, len(rectangles) + 1)]

        # --- adjacency matrix ---
        if not os.path.exists(adj_path):
            print(f"[ERROR] Adjacency matrix file not found: {adj_path}. Skipping.")
            continue

        with open(adj_path, "r") as handle:
            matrix = np.array(json.load(handle))
        if matrix.shape[0] != len(labels):
            print(f"[ERROR] Mismatch between adjacency matrix size and number of labels for {adj_path}. Skipping.")
            continue

        # e.g. "my_page_text.json" -> "My Page"
        file_stem = os.path.basename(rect_path).replace("_text.json", "")
        title = file_stem.replace("_", " ").title()

        plot_adjacency_matrix_with_labels(matrix, labels, title)

# Usage Example: plot once both notebook lists exist.
if 'rectangle_json_files' not in globals() or 'adjacency_matrix_files' not in globals():
    print("[ERROR] Required variables (rectangle_json_files or adjacency_matrix_files) are missing.")
else:
    print("[INFO] Plotting adjacency matrices for all files...")
    process_and_plot_adjacency_matrices(rectangle_json_files, adjacency_matrix_files)


In [None]:
import networkx as nx
import numpy as np
import json
import os

# Global list of GraphML file locations. generate_graphml() appends the path of
# every file it writes; running this cell again resets the list.
graphml_files = []

def generate_graphml(rectangle_json_file, adjacency_matrix_file, output_directory="graphml_files"):
    """
    Write a GraphML file for one rectangle JSON / adjacency matrix pair.

    Nodes are keyed by the string form of each entry's "Rectangle No" and carry
    ``label`` and ``text`` attributes.  Edges join nodes ``str(i + 1)`` and
    ``str(j + 1)`` for every truthy matrix cell ``[i, j]``.  The file is saved
    as '<rectangle json stem>_graph.graphml' inside ``output_directory``
    (created if needed) and its path is appended to the global ``graphml_files``.

    A missing input file is reported and the function returns without writing.
    """
    if not os.path.exists(rectangle_json_file):
        print(f"[ERROR] Rectangle JSON file not found: {rectangle_json_file}. Skipping.")
        return

    with open(rectangle_json_file, "r") as handle:
        rectangle_payload = json.load(handle)

    # Rectangle number (as a string) -> text for that rectangle
    rectangle_text_map = {}
    for entry in rectangle_payload["rectangles"]:
        rectangle_text_map[str(entry["Rectangle No"])] = entry["Text"]

    if not os.path.exists(adjacency_matrix_file):
        print(f"[ERROR] Adjacency matrix file not found: {adjacency_matrix_file}. Skipping.")
        return

    with open(adjacency_matrix_file, "r") as handle:
        adjacency = np.array(json.load(handle))

    graph = nx.Graph()

    # One node per rectangle, annotated with its label and text
    for rect_no, text in rectangle_text_map.items():
        graph.add_node(rect_no, label=f"Rectangle {rect_no}", text=text)

    # Matrix indices are 0-based while node ids start at 1
    size = adjacency.shape[0]
    linked_cells = (
        (row, col)
        for row in range(size)
        for col in range(size)
        if adjacency[row, col]
    )
    for row, col in linked_cells:
        graph.add_edge(str(row + 1), str(col + 1))

    os.makedirs(output_directory, exist_ok=True)

    json_stem = os.path.splitext(os.path.basename(rectangle_json_file))[0]
    graphml_filename = os.path.join(output_directory, f"{json_stem}_graph.graphml")

    nx.write_graphml(graph, graphml_filename)
    print(f"[SUCCESS] GraphML file created: {graphml_filename}")

    # Remember where the file was written
    graphml_files.append(graphml_filename)

def generate_graphml_files(rectangle_json_files, adjacency_matrix_files, output_directory="graphml_files"):
    """
    Generate one GraphML file per (rectangle JSON, adjacency matrix) pair.

    The lists are paired by position; both must be non-empty and of equal
    length, otherwise an error is printed and nothing is generated.  At the
    end every path in the global ``graphml_files`` list is printed.
    """
    if not (rectangle_json_files and adjacency_matrix_files):
        print("[ERROR] Either 'rectangle_json_files' or 'adjacency_matrix_files' is empty.")
        return

    rect_count, adj_count = len(rectangle_json_files), len(adjacency_matrix_files)
    if rect_count != adj_count:
        print("[ERROR] Mismatch in the number of rectangle JSON files and adjacency matrix files.")
        print(f"Rectangle files: {rect_count}, Adjacency matrix files: {adj_count}")
        return

    for rect_path, adj_path in zip(rectangle_json_files, adjacency_matrix_files):
        print(f"[INFO] Generating GraphML for: {rect_path} and {adj_path}")
        generate_graphml(rect_path, adj_path, output_directory)

    # List everything collected in the global list so far
    print("[INFO] All generated GraphML files:")
    for written_path in graphml_files:
        print(f" - {written_path}")

# Usage Example: export GraphML once both notebook lists exist.
if 'rectangle_json_files' not in globals() or 'adjacency_matrix_files' not in globals():
    print("[ERROR] Required variables (rectangle_json_files or adjacency_matrix_files) are missing.")
else:
    print("[INFO] Generating GraphML files for all processed files...")
    generate_graphml_files(rectangle_json_files, adjacency_matrix_files)


In [None]:
!pip install pandas

In [None]:
import networkx as nx
import json
import numpy as np
import matplotlib.pyplot as plt
from networkx.algorithms.community import girvan_newman, louvain_communities
from networkx.algorithms.community.quality import modularity
import os
import pandas as pd

# Global variables to store results; each starts as an empty dict whenever this cell runs.
# NOTE(review): the code that fills these is not visible in this part of the
# notebook - confirm the expected key/value layout where they are populated.
community_results_global = {}
community_summaries = {}
labels = {}

# Load rectangle text mapping
def load_rectangle_text_mapping(rectangle_json_file):
    """Read a rectangle JSON file and map each rectangle number (as a string) to its text."""
    with open(rectangle_json_file, "r") as handle:
        payload = json.load(handle)

    text_by_number = {}
    for entry in payload["rectangles"]:
        # Keys are stringified so they can be matched against string node ids
        text_by_number[str(entry["Rectangle No"])] = entry["Text"]
    return text_by_number

# Girvan-Newman community detection
def girvan_newman_communities(G, depth=1):
    """
    Return the community partition reached after `depth` Girvan-Newman splits.

    Args:
        G: Graph to partition.
        depth: Number of successive splits to take from the Girvan-Newman
            generator. Values below 1 request no split at all.

    Returns:
        A list of communities, each a list of nodes. An empty list is returned
        when `depth` is below 1 or when the generator runs out of splits before
        `depth` is reached.
    """
    if depth < 1:
        # Without this guard the loop body never runs and `communities` would be
        # unbound at the return statement (UnboundLocalError).
        return []

    splits = girvan_newman(G)
    communities = ()
    for _ in range(depth):
        try:
            communities = next(splits)
        except StopIteration:
            print("[INFO] Reached the end of community splits.")
            return []
    return [list(community) for community in communities]

# Louvain community detection
def louvain_community_detection(G, seed=None):
    """
    Detect communities in `G` with the Louvain method.

    Args:
        G: Graph to partition.
        seed: Optional random seed forwarded to `louvain_communities`. Louvain
            is randomized, so two runs on the same graph can return different
            partitions unless a seed is given. The default (None) keeps the
            previous unseeded behaviour.

    Returns:
        A list of communities as produced by `louvain_communities`.
    """
    return list(louvain_communities(G, seed=seed))

# Calculate modularity
def calculate_modularity(G, communities):
    """Return the modularity score of the partition `communities` on graph `G`."""
    score = modularity(G, communities)
    return score

# Visualise graph with communities
def _draw_community_panel(G, pos, ax, communities, node_labels, title):
    """Draw one community partition of `G` on `ax`, one colour per community, with a legend."""
    palette = plt.get_cmap('tab20', len(communities))

    ax.set_title(title, fontsize=48, fontweight='bold')
    for idx, members in enumerate(communities):
        nx.draw_networkx_nodes(
            G, pos, nodelist=list(members), ax=ax, node_color=[palette(idx)],
            node_size=7000, alpha=0.9, label=f'Community {idx + 1}'
        )
    nx.draw_networkx_edges(G, pos, ax=ax, alpha=0.7, edge_color="gray", width=5)
    nx.draw_networkx_labels(G, pos, labels=node_labels, ax=ax, font_size=24, font_color="black")
    ax.axis('off')

    handles, legend_labels = ax.get_legend_handles_labels()
    ax.legend(handles, legend_labels, loc='best', fontsize=36)


def draw_graph_with_communities(G, communities_gn, communities_lv, node_labels, output_directory, output_filename):
    """
    Plot the Girvan-Newman and Louvain partitions of `G` side by side and save the figure.

    Args:
        G: Graph to draw.
        communities_gn: Girvan-Newman communities (iterable of node collections).
        communities_lv: Louvain communities (iterable of node collections).
        node_labels: Mapping from node to the text drawn on that node.
        output_directory: Directory for the PNG; created if it does not exist.
        output_filename: Base name; the file is saved as "<output_filename>_communities.png".

    Returns:
        Path of the saved PNG.
    """
    # One shared layout so the same node sits at the same position in both panels
    pos = nx.spring_layout(G, seed=42, k=0.6)
    fig, axes = plt.subplots(1, 2, figsize=(96, 64))

    _draw_community_panel(G, pos, axes[0], communities_gn, node_labels, 'Girvan-Newman Communities')
    _draw_community_panel(G, pos, axes[1], communities_lv, node_labels, 'Louvain Communities')

    fig.suptitle('Graph with Community Detection', fontsize=64, fontweight='bold')

    os.makedirs(output_directory, exist_ok=True)
    output_path = os.path.join(output_directory, f"{output_filename}_communities.png")
    fig.savefig(output_path, bbox_inches='tight')
    plt.show()
    # Release the very large figure explicitly; otherwise one stays alive per
    # processed graph when the backend does not close it on show().
    plt.close(fig)
    print(f"[SUCCESS] Community plot saved to: {output_path}")

    return output_path

# Display results in a table
def display_results_as_table(community_results):
    """Print the community results dictionary as a one-row pandas DataFrame."""
    table = pd.DataFrame([community_results])
    print("\n[INFO] Community Detection Results:", table, sep="\n")

# Process a single graph
def process_graph(graphml_file, rectangle_json_file, output_directory="community_results", depth=1):
    """
    Run Girvan-Newman and Louvain community detection on one GraphML file.

    Side effects: records node label texts in the global `labels`, stores the
    results in `community_results_global` and `community_summaries` (all keyed
    by `graphml_file`), saves a community plot and a
    "<graph name>_community_results.json" file in `output_directory`, and
    prints a results table.

    Args:
        graphml_file: Path of the GraphML file to analyse.
        rectangle_json_file: Path of the rectangle JSON file with node texts.
        output_directory: Directory for the plot and JSON output.
        depth: Number of Girvan-Newman splits to apply.

    Returns:
        None. The graph is skipped (with a printed message) when an input file
        is missing, when the graph has no edges, or when either algorithm
        yields no communities.
    """
    if not os.path.exists(graphml_file):
        print(f"[ERROR] GraphML file not found: {graphml_file}. Skipping.")
        return

    if not os.path.exists(rectangle_json_file):
        print(f"[ERROR] Rectangle JSON file not found: {rectangle_json_file}. Skipping.")
        return

    # Load graph and rectangle data
    graph = nx.read_graphml(graphml_file, node_type=str)
    rectangle_text_map = load_rectangle_text_mapping(rectangle_json_file)
    # Nodes without a matching rectangle fall back to a generic "Node <id>" label
    node_labels = {node: rectangle_text_map.get(node, f"Node {node}") for node in graph.nodes}

    labels[graphml_file] = list(node_labels.values())

    # Modularity divides by the total degree, so an edgeless graph would raise
    # ZeroDivisionError further down and abort the whole batch. Skip it instead.
    if graph.number_of_edges() == 0:
        print(f"[WARNING] {graphml_file} has no edges; modularity is undefined. Skipping.")
        return

    # Detect communities
    communities_gn = girvan_newman_communities(graph, depth=depth)
    communities_lv = louvain_community_detection(graph)

    if not communities_gn or not communities_lv:
        print(f"[WARNING] No communities detected in {graphml_file}. Skipping.")
        return

    # Visualise and save graph
    output_filename = os.path.splitext(os.path.basename(graphml_file))[0]
    plot_path = draw_graph_with_communities(graph, communities_gn, communities_lv, node_labels, output_directory, output_filename)

    # Calculate modularity
    girvan_mod = calculate_modularity(graph, communities_gn)
    louvain_mod = calculate_modularity(graph, communities_lv)

    # Convert sets to lists for JSON serialization
    communities_gn = [list(comm) for comm in communities_gn]
    communities_lv = [list(comm) for comm in communities_lv]

    # Store results
    community_results = {
        "Girvan-Newman": {
            "Communities": communities_gn,  # Actual community nodes
            "Count": len(communities_gn),
            "Modularity": girvan_mod
        },
        "Louvain": {
            "Communities": communities_lv,  # Actual community nodes
            "Count": len(communities_lv),
            "Modularity": louvain_mod
        },
        "Plot Path": plot_path
    }
    community_results_global[graphml_file] = community_results

    # The summary keeps counts and modularity but omits the community members
    summary = {
        "graphml_file": graphml_file,
        "Girvan-Newman": {
            "Count": len(communities_gn),
            "Modularity": girvan_mod
        },
        "Louvain": {
            "Count": len(communities_lv),
            "Modularity": louvain_mod
        }
    }
    community_summaries[graphml_file] = summary

    # Save individual results to a JSON file
    os.makedirs(output_directory, exist_ok=True)
    json_output_file = os.path.join(output_directory, f"{output_filename}_community_results.json")
    with open(json_output_file, "w") as outfile:
        json.dump(community_results, outfile, indent=4)
    print(f"[SUCCESS] Community results saved to: {json_output_file}")

    # Display results in table format
    display_results_as_table(community_results)

    print(f"[INFO] Processed {graphml_file} successfully.\n")

# Process multiple files
def process_multiple_files(graphml_files, rectangle_json_files, output_directory="community_results", depth=1):
    """
    Run process_graph on each (GraphML, rectangle JSON) pair, then print all summaries.

    The lists are paired by position. If either list is empty, or their
    lengths differ, an error is printed and nothing is processed.
    """
    inputs_present = bool(graphml_files) and bool(rectangle_json_files)
    if not inputs_present:
        print("[ERROR] GraphML or rectangle JSON files are missing.")
        return

    if len(graphml_files) != len(rectangle_json_files):
        print("[ERROR] Mismatch in the number of GraphML and rectangle JSON files.")
        return

    for graph_path, rect_path in zip(graphml_files, rectangle_json_files):
        print(f"[INFO] Processing: {graph_path} with {rect_path}")
        process_graph(graph_path, rect_path, output_directory, depth)

    # Dump the global summaries gathered by process_graph
    print("\n[INFO] Community Summaries for all files:")
    print(json.dumps(community_summaries, indent=4))

# Usage example: runs when this cell executes.
# Proceeds only when both lists exist as module-level names and are non-empty;
# otherwise it reports that the required variables are missing.
if (
    'graphml_files' in globals() and graphml_files and
    'rectangle_json_files' in globals() and rectangle_json_files
):
    process_multiple_files(graphml_files, rectangle_json_files, output_directory="community_results", depth=1)
else:
    print("[ERROR] Required variables (graphml_files or rectangle_json_files) are missing.")


In [None]:
!pip install node2vec

In [None]:
# Dynamically load all GraphML files stored in the global list.
# `graphs` maps each file's base name (without extension) to the loaded graph.
# NOTE: two files with the same base name would share a key; the later one wins.
graphs = {}

for graphml_file in graphml_files:
    graph_name = os.path.splitext(os.path.basename(graphml_file))[0]
    print(f"[INFO] Loading GraphML: {graphml_file} as '{graph_name}'")
    # node_type=str keeps node identifiers as strings
    graphs[graph_name] = nx.read_graphml(graphml_file, node_type=str)


In [None]:
def load_rectangle_text_mapping(graph_name, processed_files_directory=None):
    """
    Load the rectangle text mapping, mapping node numbers ('1', '2', ...) to text.

    This definition replaces an earlier one-argument function of the same name
    that process_graph still calls with a JSON file path. To keep both callers
    working, two call forms are accepted:

    * load_rectangle_text_mapping(graph_name, processed_files_directory):
      the file is "<base>_text.json" inside the directory, where <base> is
      `graph_name` with "_text_graph" removed.
    * load_rectangle_text_mapping(json_path): the single argument is used
      directly as the path of the rectangle JSON file.

    Args:
        graph_name: Graph name, or a JSON file path in the one-argument form.
        processed_files_directory: Directory holding the "<base>_text.json"
            files, or None to treat `graph_name` as the file path.

    Returns:
        Dict mapping str(rectangle number) to text. Empty when the file does
        not exist (an error is printed) or has no "rectangles" entry.
    """
    if processed_files_directory is None:
        # One-argument form: the caller passed the JSON path itself
        json_path = graph_name
    else:
        # Adjust file name to match earlier filename convention
        base_name = graph_name.replace("_text_graph", "")
        json_path = os.path.join(processed_files_directory, f"{base_name}_text.json")

    if not os.path.exists(json_path):
        print(f"[ERROR] Rectangle text mapping file not found: {json_path}")
        return {}

    with open(json_path, "r") as infile:
        data = json.load(infile)

    # Create a dictionary mapping node numbers to text content
    return {
        str(rect['Rectangle No']): rect['Text']
        for rect in data.get("rectangles", [])
    }


def girvan_newman_first_level_with_texts(G, rectangle_text_mapping):
    """
    Take the first Girvan-Newman split of `G` and return it in two parallel forms.

    Returns a pair (communities_with_identifiers, communities_with_texts):
    the first holds the node identifiers of each community, the second holds
    the mapped text for each of those nodes (or a "[Text not found for ...]"
    placeholder). Both are empty lists when no split can be produced.
    """
    print("[INFO] Starting Girvan-Newman community detection for first-level communities...")

    try:
        # Only the first yielded partition is needed
        first_split = next(girvan_newman(G))
    except StopIteration:
        print("[WARN] No communities could be detected.")
        return [], []

    # Fix the node order once so identifiers and texts line up position by position
    identifier_groups = [list(members) for members in first_split]
    text_groups = [
        [rectangle_text_mapping.get(node, f"[Text not found for {node}]") for node in group]
        for group in identifier_groups
    ]

    return identifier_groups, text_groups


def process_all_graphs_first_level_with_texts(graphs, processed_files_directory):
    """
    Detect first-level communities for every graph and report them as text.

    Graphs whose text mapping is empty are skipped. Any exception raised while
    processing a graph is printed and the loop moves on to the next graph.
    Returns a dict keyed by graph name with the GraphML file name and the
    communities expressed as lists of text.
    """
    results_by_graph = {}

    for graph_name, graph in graphs.items():
        print(f"\n[INFO] Processing graph: {graph_name}")

        text_lookup = load_rectangle_text_mapping(graph_name, processed_files_directory)
        if not text_lookup:
            print(f"[ERROR] Skipping graph '{graph_name}' due to missing text mapping.")
            continue

        try:
            # The identifier form is not needed here; only the text form is stored
            _, text_communities = girvan_newman_first_level_with_texts(graph, text_lookup)
            results_by_graph[graph_name] = {
                "file": f"{graph_name}.graphml",
                "communities": text_communities
            }

            print(f"[SUCCESS] First-level communities detected for '{graph_name}':")
            for number, members in enumerate(text_communities, start=1):
                print(f"  Community {number}: {members}")

        except Exception as e:
            print(f"[ERROR] Failed to process graph '{graph_name}': {e}")

    return results_by_graph


# Locate the processed files directory dynamically
# (find_processed_files_directory is defined outside this part of the file)
processed_files_directory = find_processed_files_directory(graphml_files)

# Run the function and save results
all_graph_first_level_communities_with_texts = process_all_graphs_first_level_with_texts(graphs, processed_files_directory)

# Save the results to a JSON file (relative path, so it lands in the current working directory)
output_file_with_texts = "graph_first_level_communities_with_texts.json"
with open(output_file_with_texts, "w") as f:
    json.dump(all_graph_first_level_communities_with_texts, f, indent=4)

print(f"\n[INFO] First-level communities with text content have been saved to '{output_file_with_texts}'.")


In [None]:
def calculate_modularity(G, communities):
    """
    Return the modularity of `G` for the given communities.

    Each community is converted to a set of node identifiers before scoring.
    """
    community_sets = list(map(set, communities))
    return modularity(G, community_sets)


# Modularity of the first-level Girvan-Newman split, one value per graph.
# NOTE: the loop variables (graph_name, graph, rectangle_text_mapping, ...) stay
# bound at module level after the loop and hold the values of the last iteration.
modularities = []
for graph_name, graph in graphs.items():
    print(f"\n[INFO] Processing modularity for graph: {graph_name}")
    
    # Load rectangle text mapping
    rectangle_text_mapping = load_rectangle_text_mapping(graph_name, processed_files_directory)
    if not rectangle_text_mapping:
        print(f"[ERROR] Skipping graph '{graph_name}' due to missing text mapping.")
        continue

    # Detect first-level communities and get node identifiers
    # (the text form of the communities is discarded here)
    communities_with_identifiers, _ = girvan_newman_first_level_with_texts(graph, rectangle_text_mapping)

    # Convert to valid node identifier communities for modularity calculation;
    # a community containing any node that is not in the graph is dropped
    node_identifier_communities = [
        set(community) for community in communities_with_identifiers if set(community).issubset(graph.nodes)
    ]
    
    if node_identifier_communities:
        # Calculate modularity based on the detected communities
        mod = calculate_modularity(graph, node_identifier_communities)
        modularities.append(mod)
        print(f"[SUCCESS] Modularity for {graph_name}: {mod:.4f}")
    else:
        print(f"[ERROR] Communities could not be found for '{graph_name}'.")

# Compute and display the average modularity (only over graphs that produced a value)
if modularities:
    average_modularity = sum(modularities) / len(modularities)
    print(f"\n[INFO] Average Modularity: {average_modularity:.4f}")
else:
    print("[WARN] No modularity values to average.")


In [None]:
def collect_graph_community_information(graphs, processed_files_directory):
    """
    Gather first-level community data and basic size figures for every graph.

    No plots are produced. The result maps each graph name to its communities
    (as node identifiers and as texts) plus its node and edge counts. Graphs
    for which no communities are found are left out.
    """
    info_by_graph = {}

    for graph_name, graph in graphs.items():
        print(f"\n[INFO] Processing community information for: {graph_name}")

        # Text lookup for this graph's nodes
        text_lookup = load_rectangle_text_mapping(graph_name, processed_files_directory)

        # First-level split, in identifier form and text form
        identifier_groups, text_groups = girvan_newman_first_level_with_texts(graph, text_lookup)

        if not identifier_groups:
            print(f"[WARN] No communities found for graph: {graph_name}")
            continue

        info_by_graph[graph_name] = {
            "communities_with_identifiers": identifier_groups,
            "communities_with_texts": text_groups,
            "node_count": graph.number_of_nodes(),
            "edge_count": graph.number_of_edges()
        }
        print(f"[SUCCESS] Community information collected for: {graph_name}")

    return info_by_graph


# Collect community information for all graphs
# (keyed by graph name; graphs without communities are absent from the result)
all_community_info = collect_graph_community_information(graphs, processed_files_directory)

# Save the community information to a JSON file (relative path, current working directory)
output_file = "graph_community_information.json"
with open(output_file, "w") as f:
    json.dump(all_community_info, f, indent=4)

print(f"\n[INFO] Community information saved to '{output_file}'.")


In [None]:
from node2vec import Node2Vec

def node2vec_embedding(G):
    """
    Fit a Node2Vec model on `G` and return the fitted model.

    Uses 64 dimensions, walks of length 30, 200 walks and a fixed seed of 42;
    the model is fitted with window=10, min_count=1 and sg=1.
    """
    walker = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, seed=42)
    return walker.fit(window=10, min_count=1, sg=1)

# Dictionary to store embeddings for all graphs (graph name -> fitted model)
embeddings = {}

for graph_name, graph in graphs.items():
    print(f"\n[INFO] Generating embeddings for graph: {graph_name}")
    try:
        model = node2vec_embedding(graph)
        embeddings[graph_name] = model
        print(f"[SUCCESS] Embeddings generated for: {graph_name}")
    except Exception as e:
        # A failure on one graph is reported and that graph gets no entry in `embeddings`
        print(f"[ERROR] Failed to generate embeddings for '{graph_name}': {e}")

# Save the embeddings to files
# (one "<graph_name>_embeddings.txt" per graph, relative to the current working directory)
for graph_name, model in embeddings.items():
    model.wv.save_word2vec_format(f"{graph_name}_embeddings.txt")
    print(f"[INFO] Embeddings saved to '{graph_name}_embeddings.txt'")


In [None]:
def average_community_embeddings(G, communities, model):
    """
    Average the node embeddings of each community.

    Nodes missing from `model.wv` are ignored. A community with no embedded
    nodes gets no entry, but the numbering still follows the position in
    `communities` ("Community 1" is the first one). `G` is not used.
    """
    averaged = {}
    for position, members in enumerate(communities, start=1):
        member_vectors = [model.wv[node] for node in list(members) if node in model.wv]
        if not member_vectors:
            continue
        averaged[f"Community {position}"] = np.mean(member_vectors, axis=0)
    return averaged


# Per-graph mapping: graph name -> {"Community N": average embedding vector}
community_avg_embeddings = {}

for graph_name, graph in graphs.items():
    print(f"\n[INFO] Calculating average embeddings for communities in: {graph_name}")
    
    # Get communities and model for the graph
    # (an empty text mapping is passed because only the node identifiers are used here)
    communities_with_identifiers, _ = girvan_newman_first_level_with_texts(graph, {})
    model = embeddings.get(graph_name)

    if model and communities_with_identifiers:
        avg_embeddings = average_community_embeddings(graph, communities_with_identifiers, model)
        community_avg_embeddings[graph_name] = avg_embeddings
        print(f"[SUCCESS] Average embeddings calculated for: {graph_name}")
    else:
        print(f"[ERROR] Missing model or communities for graph: {graph_name}")

# Save the average community embeddings to a file
# (vectors are converted with .tolist() so they can be written as JSON)
output_avg_embeddings_file = "community_avg_embeddings.json"
with open(output_avg_embeddings_file, "w") as f:
    # NOTE: `v` is reused in the inner comprehension; the outer `v` is the per-graph dict,
    # the inner `v` is one embedding vector.
    json.dump({k: {ck: v.tolist() for ck, v in v.items()} for k, v in community_avg_embeddings.items()}, f, indent=4)

print(f"\n[INFO] Average community embeddings saved to '{output_avg_embeddings_file}'.")


In [None]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

def prepare_clustering_data(community_avg_embeddings):
    """
    Flatten the per-graph community embeddings into an array plus matching labels.

    Returns (data, labels): `data` is a NumPy array with one row per community
    and `labels[i]` is "<graph name>_<community name>" for row i.
    """
    labelled_vectors = [
        (f"{graph_name}_{comm_str}", embedding)
        for graph_name, per_graph in community_avg_embeddings.items()
        for comm_str, embedding in per_graph.items()
    ]
    labels = [label for label, _ in labelled_vectors]
    data = np.array([vector for _, vector in labelled_vectors])
    return data, labels


# Prepare data for clustering
# NOTE: this rebinds the module-level name `labels` (a dict written to by
# process_graph) to a list of "<graph>_<community>" strings.
data, labels = prepare_clustering_data(community_avg_embeddings)

# t-SNE requires perplexity < number of samples. The library default is 30, so
# with 30 or fewer community embeddings the default would be rejected. Clamp it;
# larger inputs keep the default value of 30.
n_samples = len(data)
if n_samples < 2:
    raise ValueError(f"t-SNE needs at least 2 community embeddings, got {n_samples}.")
tsne_perplexity = min(30.0, n_samples - 1)

# Apply t-SNE for dimensionality reduction
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
data_tsne = tsne.fit_transform(data)

# Standardise the reduced data
scaler = StandardScaler()
data_tsne = scaler.fit_transform(data_tsne)

print("[INFO] Data prepared and transformed using t-SNE.")


In [None]:
from sklearn.cluster import MeanShift
from sklearn.metrics import silhouette_score

def perform_clustering(data, bandwidth):
    """Fit a MeanShift model with the given bandwidth on `data` and return the fitted model."""
    # fit() returns the estimator itself, so the fitted model can be returned directly
    return MeanShift(bandwidth=bandwidth).fit(data)


def find_best_bandwidth(data, bandwidth_values):
    """
    Try each bandwidth with MeanShift and keep the one with the highest silhouette score.

    Returns (best_bandwidth, best_score, scores). `scores` has one entry per
    tested bandwidth; -1 marks a bandwidth that failed or whose cluster count
    was not between 2 and len(data) - 1. If no bandwidth qualifies, the result
    is (None, -1, scores).
    """
    best_bandwidth, best_score = None, -1
    scores = []

    for bandwidth in bandwidth_values:
        try:
            cluster_labels = MeanShift(bandwidth=bandwidth).fit_predict(data)
            num_labels = len(np.unique(cluster_labels))

            # Silhouette is only computed for 2 <= clusters < number of samples
            if not (2 <= num_labels < len(data)):
                scores.append(-1)
                print(f'Bandwidth = {bandwidth}, resulted in {num_labels} clusters, not suitable for silhouette score.')
                continue

            score = silhouette_score(data, cluster_labels)
            scores.append(score)
            print(f'Bandwidth = {bandwidth}, Silhouette Score = {score:.3f}')

            if score > best_score:
                best_score, best_bandwidth = score, bandwidth
        except Exception as e:
            scores.append(-1)
            print(f'[ERROR] Bandwidth = {bandwidth} failed: {e}')

    return best_bandwidth, best_score, scores


# Define bandwidth values to test (10 evenly spaced values from 0.5 to 1 inclusive)
bandwidth_values = np.linspace(0.5, 1, 10)

# Find the best bandwidth and corresponding silhouette score
# (best_bandwidth is None and best_score is -1 when no bandwidth gave a usable clustering)
best_bandwidth, best_score, scores = find_best_bandwidth(data_tsne, bandwidth_values)

print(f'\n[INFO] Best Bandwidth: {best_bandwidth}, Best Silhouette Score: {best_score:.3f}')

# Plot silhouette scores vs bandwidth
# (a score of -1 marks a bandwidth that failed or produced an unusable cluster count)
plt.figure(figsize=(10, 6))
plt.plot(bandwidth_values, scores, marker='o')
plt.title('Silhouette Score vs Bandwidth for MeanShift')
plt.xlabel('Bandwidth')
plt.ylabel('Silhouette Score')
plt.grid()
plt.show()



In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def evaluate_silhouette_score(data, labels):
    """Compute the silhouette score of `labels` on `data`, print it, and return it."""
    result = silhouette_score(data, labels)
    print(f"[INFO] Silhouette Score: {result:.4f}")
    return result


# Perform KMeans clustering
print("[INFO] Performing KMeans clustering...")
try:
    # Target is 15 clusters. KMeans cannot use more clusters than samples, and the
    # silhouette score needs fewer clusters than samples, so cap at n_samples - 1.
    n_clusters = min(15, max(len(data_tsne) - 1, 1))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(data_tsne)
    cluster_labels = kmeans.labels_

    # Evaluate clustering performance.
    # The result is stored under its own name: assigning it to `silhouette_score`
    # would overwrite the imported sklearn function and break every later call
    # to it (find_best_bandwidth, evaluate_silhouette_score) when cells are re-run.
    kmeans_silhouette = evaluate_silhouette_score(data_tsne, cluster_labels)
    print(f"[SUCCESS] KMeans clustering completed with silhouette score: {kmeans_silhouette:.4f}")
except Exception as e:
    print(f"[ERROR] KMeans clustering failed: {e}")


In [None]:
# Plotting the clusters using t-SNE with larger figure size
plt.figure(figsize=(100, 80))  # Increase the figure size to 100x80 inches
unique_labels = np.unique(cluster_labels)
# plt.get_cmap is used because matplotlib.cm.get_cmap is removed in newer Matplotlib.
# tab20 is used because tab10 has only 10 colours, fewer than the 15 KMeans clusters.
colors = plt.get_cmap('tab20', len(unique_labels))

# Plot clusters on the t-SNE reduced data
# (colours are picked by position so gaps in the label values cannot index out of range)
for color_idx, label in enumerate(unique_labels):
    cluster_points = data_tsne[cluster_labels == label]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {label}", c=[colors(color_idx)], s=5000, alpha=0.8)

# Title and axis labels with larger font size
plt.title("t-SNE Visualization of KMeans Clustering", fontsize=60)
plt.xlabel("t-SNE Component 1", fontsize=40)
plt.ylabel("t-SNE Component 2", fontsize=40)

# Increase tick label sizes (values on the x and y axis)
plt.tick_params(axis='x', labelsize=40)  # Increase x-axis tick label size
plt.tick_params(axis='y', labelsize=40)  # Increase y-axis tick label size

# Adjust legend size and location
plt.legend(fontsize=40, loc='best')

# Group labels into clusters, keyed by the label values that actually occur
clusters = {int(cluster_label): [] for cluster_label in unique_labels}

for cluster_label, community_label in zip(cluster_labels, labels):
    clusters[int(cluster_label)].append(community_label)

# Print community details with actual text
print("\n[INFO] Cluster details with community texts:")

for cluster_label, communities in clusters.items():
    print(f"\nCluster {cluster_label}:")
    for community in communities:
        # Labels look like "<graph_name>_Community N". The graph name may itself
        # contain underscores, so split on the LAST underscore.
        graph_name, separator, comm_str = community.rpartition('_')

        if not separator:
            # Handle cases where the label does not follow the expected format
            print(f"Skipping invalid community label: {community}")
            continue

        # Extract the number, e.g., '8' from 'Community 8'
        try:
            community_number = comm_str.split()[-1]
            community_index = int(community_number) - 1
        except (IndexError, ValueError):
            print(f"Skipping invalid community label: {community}")
            continue

        # Look up the texts of this community in this graph's own community data.
        # Relies on the first-level Girvan-Newman split listing communities in the
        # same order here as when the average embeddings were numbered.
        community_texts = all_community_info.get(graph_name, {}).get("communities_with_texts", [])
        if 0 <= community_index < len(community_texts):
            community_text = community_texts[community_index]
        else:
            community_text = f"[Text not found for {community_number}]"

        print(f"  - {community}: {community_text}")

# Display the plot
plt.show()
