# All cluster 6 projects code

In [8]:
import os
import pdfplumber
import pandas as pd
import re
import PyPDF2
import traceback

# --- Locations on disk and the queue-id window handled by this notebook cell ---
BASE_DIRECTORY = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_ORIGINAL = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/03_raw/rawdata_cluster6_style_G_originals.csv"
OUTPUT_CSV_PATH_ADDENDUM = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/03_raw/rawdata_cluster6_style_G_addendums.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/03_raw/scraping_cluster6_style_G_log.txt"
# q_ids 943 through 1002 inclusive (the range end is exclusive).
# NOTE(review): a previous comment called this an example range for "Clusters 8",
# while every output path above targets Cluster 6 -- confirm which is intended.
PROJECT_RANGE = range(943, 1003)

# --- Empty result frames (names suggest original studies vs. addendums) ---
core_originals, core_addendums = pd.DataFrame(), pd.DataFrame()

# --- Run bookkeeping: project-level sets, PDF-level lists, and counters ---
scraped_projects, skipped_projects, missing_projects = set(), set(), set()
scraped_pdfs, skipped_pdfs, addendum_pdfs, original_pdfs = [], [], [], []
total_pdfs_accessed = total_pdfs_scraped = total_pdfs_skipped = 0

def clean_column_headers(headers):
    """Normalize raw table header cells into comparable column names.

    String headers are lower-cased, stripped of parenthesised notes and of every
    character that is not a letter, digit or whitespace, and then have their
    whitespace collapsed to single spaces and trimmed. ``None`` becomes ``""``;
    any other non-string value is passed through unchanged.

    Args:
        headers (iterable): Raw header cells, e.g. the first row of an extracted table.

    Returns:
        list: Cleaned headers, same length and order as the input.
    """
    cleaned_headers = []
    for header in headers:
        if header is None:
            header = ""
        elif isinstance(header, str):
            header = header.lower()
            # Flatten newlines first so a parenthesised note wrapped over
            # several lines is still matched by the non-DOTALL pattern below.
            header = re.sub(r'\s+', ' ', header)
            header = re.sub(r'\(.*?\)', '', header)
            header = re.sub(r'[^a-zA-Z0-9\s]', '', header)
            # Removing notes/punctuation from the middle of a header leaves
            # runs of spaces behind ("cost (x) factor" -> "cost  factor");
            # collapse again so the result matches canonical column names.
            header = re.sub(r'\s+', ' ', header).strip()
        cleaned_headers.append(header)
    return cleaned_headers

def clean_string_cell(value):
    """Flatten a text cell: newlines become spaces, outer whitespace is trimmed.

    Anything that is not a ``str`` (``None``, numbers, ...) is returned untouched.
    """
    if not isinstance(value, str):
        return value
    flattened = " ".join(value.split("\n"))
    return flattened.strip()

def contains_phrase(row, phrase):
    """Tell whether any cell of *row* matches *phrase*, ignoring case.

    Every whitespace run in *phrase* is turned into ``\\s*``, so the words may
    be separated by any amount of whitespace (including none) in the cell. The
    phrase is otherwise used as a regular expression as-is. Cells are compared
    by their ``str`` representation.
    """
    flexible = re.sub(r"\s+", r"\\s*", phrase)
    matcher = re.compile(flexible, flags=re.IGNORECASE)
    cells_as_text = row.astype(str)
    hits = cells_as_text.apply(lambda text: matcher.search(text) is not None)
    return hits.any()

def extract_specific_phrase(title):
    """
    Map a table title to the first known upgrade-category keyword it contains.

    Keywords are tried in a fixed priority order and matched as whole words,
    case-insensitively. The keyword is returned in its canonical spelling.

    Args:
        title (str): The table title string.

    Returns:
        str: The first matching keyword, or ``title`` unchanged when none match.
    """
    known_labels = (
        "PTO",
        "Reliability Network Upgrade",
        "Area Delivery Network Upgrade",
        "Local Delivery Network",
        "ADNU",
        "LDNU",
        "RNU",
    )
    matches = (
        label
        for label in known_labels
        if re.search(rf"\b{re.escape(label)}\b", title, re.IGNORECASE)
    )
    # No keyword present -> hand the title back as-is.
    return next(matches, title)

def reorder_columns(df):
    """
    Move the well-known columns to the front of the DataFrame.

    Columns listed in the priority order below come first (only those actually
    present, in priority order); every other column follows in its original
    relative order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: A column-reordered selection of ``df``.
    """
    priority = (
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type of upgrade",
        "upgrade",
        "description",
        "cost allocation factor",
    )

    leading = []
    for name in priority:
        if name in df.columns:
            leading.append(name)

    trailing = [name for name in df.columns if name not in leading]

    return df[leading + trailing]

def search_gps_coordinates(text, log_file):
    """Search *text* for GPS coordinates using several patterns, in order.

    Patterns tried:
      1. ``gps coordinates: <lat>, <lon>``
      2. ``latitude <lat> ... longitude <lon>``
      3. ``gps coordinates: <lat> N|S, <lon> E|W`` -- the hemisphere letters
         decide the sign (S and W become negative).

    Args:
        text (str): Text to search.
        log_file (file object): Destination for progress messages.

    Returns:
        tuple: ``(latitude, longitude)`` as strings, or ``(None, None)`` when
        no pattern matches.
    """
    gps_coords = re.search(r"gps coordinates:\s*([\d\.\-]+),\s*([\d\.\-]+)", text, re.IGNORECASE)
    if gps_coords:
        print(f"Found GPS coordinates: {gps_coords.groups()}", file=log_file)
        return gps_coords.groups()

    project_coords = re.search(r"latitude[:\s]*([\d\.\-]+)[^\d]+longitude[:\s]*([\d\.\-]+)", text, re.IGNORECASE)
    if project_coords:
        print(f"Found project coordinates: {project_coords.groups()}", file=log_file)
        return project_coords.groups()

    # Capture the hemisphere letters themselves. Testing for "N"/"E" anywhere in
    # the text is always true (the words "GPS COORDINATES" contain both), which
    # would leave southern/western values with the wrong sign.
    gps_coords_directional = re.search(
        r"gps coordinates:\s*([\d\.\-]+)\s*([NS]),\s*([\d\.\-]+)\s*([EW])", text, re.IGNORECASE)
    if gps_coords_directional:
        lat, lat_hemisphere, lon, lon_hemisphere = gps_coords_directional.groups()
        latitude = lat
        if lat_hemisphere.upper() == "S" and not lat.startswith("-"):
            latitude = f"-{lat}"
        longitude = lon
        if lon_hemisphere.upper() == "W" and not lon.startswith("-"):
            longitude = f"-{lon}"
        print(f"Found directional GPS coordinates: {(latitude, longitude)}", file=log_file)
        return (latitude, longitude)

    print("GPS coordinates not found.", file=log_file)
    return (None, None)

def _collect_poi_from_neighbor_rows(tab, label_row_idx, value_col_idx, poi_pattern, log_file):
    """Gather non-empty cells of the value column from up to two rows above and below the label row.

    Args:
        tab (list[list]): Extracted table rows.
        label_row_idx (int): 0-based index of the row holding the POI label.
        value_col_idx (int): 0-based index of the column expected to hold the value.
        poi_pattern (re.Pattern): Pattern recognising the POI label itself.
        log_file (file object): Destination for progress messages.

    Returns:
        list[str]: Cleaned cell texts, in row order (possibly empty).
    """
    start_scan = max(0, label_row_idx - 2)
    end_scan = min(len(tab), label_row_idx + 3)  # Exclusive

    print(f"Scanning rows {start_scan + 1} to {end_scan} for POI value parts.", file=log_file)

    parts = []
    for scan_row_index in range(start_scan, end_scan):
        # The label row itself was already checked by the caller.
        if scan_row_index == label_row_idx:
            continue

        scan_row = tab[scan_row_index]
        if value_col_idx >= len(scan_row):
            continue  # This row has no such column.

        scan_cell = clean_string_cell(scan_row[value_col_idx])
        if not scan_cell:
            # Empty or None cell: nothing to collect. It must not reach the
            # regex, because searching None raises TypeError.
            continue

        if poi_pattern.search(scan_cell):
            print(f"Encountered another POI label in row {scan_row_index + 1}. Skipping this row.", file=log_file)
            continue

        parts.append(scan_cell)
        print(f"Found POI part in row {scan_row_index + 1}: '{scan_cell}'", file=log_file)

    return parts


def _find_poi_in_table(tab, poi_pattern, page_label, table_index, log_file):
    """Look for a 'Point of Interconnection' label in one table and read the value next to it.

    The value is taken from the cell immediately to the right of the label; if
    that cell is empty, neighbouring rows of the same column are concatenated.

    Args:
        tab (list[list]): Extracted table rows.
        poi_pattern (re.Pattern): Pattern recognising the POI label.
        page_label (int): 1-based page number, used only in log messages.
        table_index (int): 1-based table number on the page, used only in log messages.
        log_file (file object): Destination for progress messages.

    Returns:
        tuple: ``(value, label_seen)`` where ``value`` is the POI text or ``None``,
        and ``label_seen`` tells whether the label occurred anywhere in the table.
    """
    label_seen = False

    for row_index, row in enumerate(tab, start=1):
        for cell_index, cell in enumerate(row):
            if not (cell and poi_pattern.search(cell)):
                continue

            label_seen = True
            value_col_idx = cell_index + 1  # Value is assumed to sit in the next column.

            if value_col_idx >= len(row):
                print(f"\nPoint of Interconnection label found but no adjacent column "
                      f"(Page {page_label}, Table {table_index}, Row {row_index}).", file=log_file)
                continue  # Keep looking; a later cell/table/attempt may succeed.

            poi_value = clean_string_cell(row[value_col_idx])
            if poi_value:
                print(f"\nFound Point of Interconnection: '{poi_value}' "
                      f"(Page {page_label}, Table {table_index}, Row {row_index})", file=log_file)
                return poi_value, True

            print(f"\nPoint of Interconnection label found but adjacent value is empty "
                  f"(Page {page_label}, Table {table_index}, Row {row_index}).", file=log_file)

            poi_value_parts = _collect_poi_from_neighbor_rows(
                tab, row_index - 1, value_col_idx, poi_pattern, log_file)
            if poi_value_parts:
                joined = " ".join(poi_value_parts)
                print(f"\nConcatenated Point of Interconnection: '{joined}' "
                      f"(Page {page_label}, Table {table_index})", file=log_file)
                return joined, True

            print(f"\nNo POI value found in the surrounding rows "
                  f"(Page {page_label}, Table {table_index}, Row {row_index}).", file=log_file)

    return None, label_seen


def extract_table1(pdf_path, log_file):
    """
    Extracts the Point of Interconnection from Table 1 in the provided PDF.

    Pages whose text mentions "Table 1" are located, and every table on those
    pages (plus the following page) is searched. Each page is tried with
    several table-extraction settings until a value is found.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.

    Returns:
        str: Extracted Point of Interconnection value,
             "Value Missing" if the label was found but no value could be read,
             or None if Table 1 / the label was not found or an error occurred
             (errors are logged, not raised).
    """
    print(f"\nProcessing {pdf_path} for Table 1 extraction...", file=log_file)
    point_of_interconnection = None
    label_found = False  # Remembers a label sighting so "Value Missing" can be reported.

    # Define the regex pattern for 'Point of Interconnection' (case-insensitive)
    poi_pattern = re.compile(r"Point\s+of\s+Interconnection", re.IGNORECASE)

    # Table extraction settings, tried in order for every page
    table_settings_list = [
        {
            "horizontal_strategy": "text",
            "vertical_strategy": "lines",
            "snap_tolerance": 1,
        },
        {
            "horizontal_strategy": "lines",
            "vertical_strategy": "lines",
            "snap_tolerance": 2,  # Increased tolerance for retry
        }
    ]

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify all pages that contain "Table 1"
            table1_pages = [
                i for i, page in enumerate(pdf.pages)
                if re.search(r"Table\s*1\b", page.extract_text() or "", re.IGNORECASE)
            ]

            if not table1_pages:
                print("No Table 1 found in the PDF.", file=log_file)
                return None

            scrape_start = table1_pages[0]
            scrape_end = table1_pages[-1] + 1  # Plus one to include the next page if needed

            print(f"Table 1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)

            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table 1...", file=log_file)

                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"\nAttempt {attempt} with table settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1} with current settings.", file=log_file)

                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty. Skipping.", file=log_file)
                            continue

                        print(f"\n--- Table {table_index} on Page {page_number + 1} ---", file=log_file)
                        for row_num, row in enumerate(tab, start=1):
                            print(f"Row {row_num}: {row}", file=log_file)

                        value, saw_label = _find_poi_in_table(
                            tab, poi_pattern, page_number + 1, table_index, log_file)
                        label_found = label_found or saw_label
                        if value is not None:
                            point_of_interconnection = value
                            break  # Exit the table loop

                    if point_of_interconnection is not None:
                        break  # Exit the attempt loop
                if point_of_interconnection is not None:
                    break  # Exit the page loop

    except Exception as e:
        # Best-effort scraping: log the failure and report "not found".
        print(f"Error processing Table 1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None

    if point_of_interconnection is not None:
        return point_of_interconnection

    if label_found:
        # Label was found but no value
        print("Point of Interconnection label found but no adjacent value.", file=log_file)
        return "Value Missing"

    # Label not found
    print("Point of Interconnection not found in Table 1.", file=log_file)
    return None



def extract_base_data(pdf_path, project_id, log_file):
    """Extract project-level base data from the PDF and return it as a one-row DataFrame.

    The PDF text is read with PyPDF2 and searched with regular expressions for
    the queue id, cluster number, deliverability status and capacity (MW). The
    point of interconnection comes from ``extract_table1`` and the coordinates
    from ``search_gps_coordinates``.

    Args:
        pdf_path (str): Path to the PDF file.
        project_id: Fallback queue id, used (as a string) when none is found in the text.
        log_file (file object): Destination for progress messages.

    Returns:
        pd.DataFrame: One row with columns ``q_id``, ``cluster``,
        ``req_deliverability``, ``latitude``, ``longitude``, ``capacity`` and
        ``point_of_interconnection``; an empty DataFrame if anything fails
        (the error is logged, not raised).
    """
    print("Extracting base data from PDF...", file=log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            page_texts = []
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
            text = "".join(page_texts)

        text = clean_string_cell(text)

        queue_id = re.search(r"q[\s_-]*(\d+)", text, re.IGNORECASE)
        queue_id = queue_id.group(1) if queue_id else str(project_id)  # Use project_id if queue_id is not found
        print(f"Extracted Queue ID: {queue_id}", file=log_file)

        cluster_number = re.search(r"queue[\s_-]*cluster[\s_-]*(\d+)", text, re.IGNORECASE)
        cluster_number = cluster_number.group(1) if cluster_number else None
        print(f"Extracted Cluster Number: {cluster_number}", file=log_file)

        deliverability_status = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = deliverability_status.group(1) if deliverability_status else None
        print(f"Extracted Deliverability Status: {deliverability_status}", file=log_file)

        # Extract Capacity: prefer the explicit "total rated output" wording,
        # otherwise fall back to the first "<n> MW" in the text.
        capacity = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if capacity:
            capacity = int(capacity.group(1))
        else:
            # Case-insensitive like the primary pattern; the text is not
            # lower-cased, so a case-sensitive "mw" would miss "MW".
            capacity2 = re.search(r"(\d+)\s*mw", text, re.IGNORECASE)
            capacity = int(capacity2.group(1)) if capacity2 else None
        print(f"Extracted Capacity: {capacity}", file=log_file)

        # Extract Point of Interconnection
        point_of_interconnection = extract_table1(pdf_path, log_file)

        latitude, longitude = search_gps_coordinates(text, log_file)

        # Single-row column mapping for the DataFrame
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "latitude": [latitude],
            "longitude": [longitude],
            "capacity": [capacity],
            "point_of_interconnection": [point_of_interconnection]
        }

        print("Base data extracted:", file=log_file)
        print(base_data, file=log_file)
        return pd.DataFrame(base_data)

    except Exception as e:
        # Best-effort scraping: log the failure and hand back an empty frame.
        print(f"Error extracting base data from {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()  # Return empty DataFrame on error


 
 
def adjust_rows_length(data_rows, headers):
    """Force every row of *data_rows* to have exactly ``len(headers)`` cells, in place.

    Rows that are too long are replaced by a truncated copy; rows that are too
    short are extended in place with empty strings. Nothing is returned.
    """
    width = len(headers)
    for idx, row in enumerate(data_rows):
        shortfall = width - len(row)
        if shortfall < 0:
            # Too many cells: keep only the leading `width` ones.
            data_rows[idx] = row[:width]
        elif shortfall > 0:
            # Too few cells: pad the existing row object.
            row.extend([""] * shortfall)

def extract_table7(pdf_path, log_file, is_addendum=False):
    """
    Extracts Table 7 data from the provided PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.
        is_addendum (bool): Whether the PDF is an addendum.

    Returns:
        pd.DataFrame: Extracted Table 7 data.
    """
    print(f"\nProcessing {pdf_path} for Table 7 extraction...", file=log_file)
    extracted_tables = []
    specific_phrase = None

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify pages that contain "Table 7"
            table7_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"(Modification\s+of\s+)?Table\s*7?\d*", text, re.IGNORECASE):
                    table7_pages.append(i)

            if not table7_pages:
                print("No Table 7 found in the PDF.", file=log_file)
                return pd.DataFrame()

            first_page = table7_pages[0]
            last_page = table7_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 1

            print(f"Table 7 starts on page {scrape_start + 1} and ends on page {scrape_end}", file=log_file)

            for page_number in range(scrape_start, min(scrape_end, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1}...", file=log_file)
                tables = page.find_tables(table_settings={
                    "horizontal_strategy": "lines",
                    "vertical_strategy": "lines",
                })

                for table_index, table in enumerate(tables):
                    tab = table.extract()
                    if not tab:
                        print(f"Table {table_index + 1} on page {page_number + 1} is empty. Skipping.", file=log_file)
                        continue

                    table_bbox = table.bbox
                    title_bbox = (0, 0, page.width, table_bbox[1])
                    title_text = page.within_bbox(title_bbox).extract_text() or ""
                    table_title = None

                    if title_text:
                        title_lines = title_text.split('\n')[::-1]
                        for line in title_lines:
                            line = line.strip()
                            match = re.search(r"(Modification\s+of\s+)?Table\s*7[-.]?\d*[:\-\s]*(.*)", line, re.IGNORECASE)
                            if match:
                                table_title = match.group(2).strip()
                                break

                    if table_title:
                        # New Table 7 detected
                        specific_phrase = extract_specific_phrase(table_title)
                        print(f"New Table 7 detected: '{specific_phrase}' on page {page_number + 1}, table {table_index + 1}", file=log_file)

                        headers = clean_column_headers(tab[0])
                        data_rows = tab[1:]

                        # Create DataFrame for new table
                        try:
                            df_new = pd.DataFrame(data_rows, columns=headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for new table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue

                        # Handle new ADNU tables (grouping logic)
                        if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                            print("Detected 'Area Delivery Network Upgrade' table (new).", file=log_file)
                            if "adnu" in df_new.columns:
                                if "type of upgrade" not in df_new.columns:
                                    # Group all adnu rows into one 'upgrade' row
                                    adnu_values = df_new["adnu"].dropna().astype(str).tolist()
                                    grouped_adnu = " ".join(adnu_values)
                                    other_columns = df_new.drop(columns=["adnu"]).iloc[0].to_dict()

                                    df_grouped = pd.DataFrame({
                                        "upgrade": [grouped_adnu],
                                        "type of upgrade": [specific_phrase]
                                    })

                                    for col, value in other_columns.items():
                                        df_grouped[col] = value

                                    print("Grouped all 'adnu' rows into a single 'upgrade' row for new ADNU table.", file=log_file)
                                    df_new = df_grouped
                                else:
                                    # If 'type of upgrade' exists, just rename adnu if needed
                                    if "upgrade" in df_new.columns:
                                        df_new.drop(columns=['adnu'], inplace=True)
                                        print("Dropped 'adnu' column to avoid duplicate 'upgrade'.", file=log_file)
                                    else:
                                        df_new.rename(columns={'adnu': 'upgrade'}, inplace=True)
                                        print("Renamed 'adnu' to 'upgrade' in new ADNU table.", file=log_file)
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new ADNU table.", file=log_file)
                            else:
                                # If 'type of upgrade' exists and first row none, original logic replaced only first row if needed
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    df_new.at[0, "type of upgrade"] = specific_phrase
                                    print("Replaced None in 'type of upgrade' first row for new ADNU table.", file=log_file)
                        else:
                            # Non-ADNU new tables
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new non-ADNU table.", file=log_file)
                            else:
                                # If exist and none in first row original logic replaced only first row
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print("Replacing None in 'type of upgrade' for first row in new non-ADNU table.", file=log_file)
                                    df_new.at[0, "type of upgrade"] = specific_phrase

                        if df_new.columns.duplicated().any():
                            print("Duplicate columns detected in new table. Dropping duplicates.", file=log_file)
                            df_new = df_new.loc[:, ~df_new.columns.duplicated()]

                        extracted_tables.append(df_new)
                    else:
                        # Continuation Table
                        if specific_phrase is None:
                            print(f"No previous Table 7 title found for continuation on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue

                        print(f"Continuation Table detected on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        data_rows = tab

                        # Check if the number of columns matches
                        expected_columns = len(extracted_tables[-1].columns) if extracted_tables else None
                        if expected_columns is None:
                            print(f"No existing table to continue with on page {page_number + 1}, table {table_index + 1}. Skipping.")
                            continue  # No table to continue with

                        # Define expected columns based on the last extracted table
                        expected_headers = extracted_tables[-1].columns.tolist()

                        # Detect header row in continuation
                        header_keywords = ["type of upgrade", "adnu"]
                        first_continuation_row = data_rows[0] if data_rows else []
                        is_header_row = any(
                            re.search(rf"\b{kw}\b", str(cell), re.IGNORECASE) for kw in header_keywords for cell in first_continuation_row
                        )

                        if is_header_row:
                            print(f"Detected header row in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            actual_header_row = data_rows[0]
                            actual_headers = clean_column_headers(actual_header_row)
                             

                            # Go back to previous page bounding box to update table title if needed
                            if page_number > 0:
                                previous_page = pdf.pages[page_number - 1]
                                bbox_lower_region = (0, table_bbox[1], previous_page.width, previous_page.height)
                                title_text_previous = previous_page.within_bbox(bbox_lower_region).extract_text() or ""
                                new_table_title = None
                                if title_text_previous:
                                    title_lines_prev = title_text_previous.split('\n')[::-1]
                                    for line in title_lines_prev:
                                        line = line.strip()
                                        match_prev = re.search(r"(Modification\s+of\s+)?Table\s*7[-.]?\d*[:\-\s]*(.*)", line, re.IGNORECASE)
                                        if match_prev:
                                            new_table_title = match_prev.group(2).strip()
                                            break
                                if new_table_title:
                                    specific_phrase = extract_specific_phrase(new_table_title)
                                    print(f"Updated table title from previous page: '{specific_phrase}' for continuation table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                                else:
                                    print("No table title found in previous page region. Using existing specific_phrase.", file=log_file)
                            else:
                                print("No previous page available for title extraction for continuation table. Using existing specific_phrase.", file=log_file)



                        
                            # Handle continuation ADNU or non-ADNU
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                    
                                print("Continuation ADNU table detected. No grouping, just rename and type of upgrade handling.", file=log_file)
                                  
                                    
                                #if "adnu" in data_rows[0]:
                                    #
                                 #   print("Handling continuation for 'Area Delivery Netowrk Upgrade' table")
                                 #   print("Renaming 'ADNU' to 'upgrade' in continuation table.")
                                 #   
                                 #   data_rows = [ ["upgrade"] + row[1:] for row in data_rows ]
                                    

                                if "type of upgrade" not in data_rows[0]:
                                    # Insert 'type of upgrade' column at the beginning
                                    print("Inserting 'type of upgrade' column with specific phrase in continuation table.",file=log_file)
                                    data_rows = [ [specific_phrase] + row for row in data_rows ]


                                    if "ADNU" in data_rows[0]:
                                        print("Handling continuation for 'Area Delivery Network Upgrade' table",file=log_file)
                                        print("Renaming 'ADNU' to 'upgrade' in continuation table.",file=log_file)
                                        # Find the index where "ADNU" occurs in the first row
                                        adnu_idx = data_rows[0].index("ADNU")
                                        # Replace "ADNU" with "upgrade" in that column for every row
                                        #for r in range(len(data_rows)):
                                        data_rows[0][adnu_idx] = "upgrade"



                                    
                                
                        # Handle missing or extra columns
                        if len(data_rows[0]) < expected_columns:
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                    # For ADNU tables, assume missing "upgrade" column
                                print(f"Detected missing 'upgrade' column in continuation table on page {page_number + 1}, table {table_index + 1}. Inserting 'upgrade' column.",file=log_file)
                                data_rows = [[specific_phrase] + row for row in data_rows]
                            else:
                                    # For other tables, assume missing "Type of Upgrade" column
                                print(f"Detected missing 'Type of Upgrade' column in continuation table on page {page_number + 1}, table {table_index + 1}. Inserting 'Type of Upgrade' column.",file=log_file)
                                data_rows = [[specific_phrase] + row for row in data_rows]
                        elif len(data_rows[0]) > expected_columns:
                            # Extra columns detected; adjust accordingly
                            print(f"Detected extra columns in continuation table on page {page_number + 1}, table {table_index + 1}. Dropping extra columns.",file=log_file)
                            data_rows = [row[:expected_columns] for row in data_rows]
                            
                        


                        

                        # Create DataFrame for the continuation table

                        if is_header_row:    
                            data_rows = data_rows[1:]
                            print(f"Dropped header row from data_rows after modifications for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
    
                        try:
                            df_continuation = pd.DataFrame(data_rows, columns=expected_headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}")
                            continue  # Skip this table due to error

                            # Special Handling for "Area Delivery Network Upgrade" Tables in Continuation
                        if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                            if "type of upgrade" in df_continuation.columns:
                                first_row = df_continuation.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print(f"Replacing 'None' in 'type of upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                # If "type of upgrade" column does not exist, add it
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                        else:
                            # General Handling for other tables
                            if "type of upgrade" in df_continuation.columns:
                                first_row = df_continuation.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print(f"Replacing 'None' in 'Type of Upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                # If "Type of Upgrade" column does not exist, add it
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'Type of Upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)

                        # Ensure no duplicate columns
                        if df_continuation.columns.duplicated().any():
                            print(f"Duplicate columns detected in continuation table on page {page_number + 1}, table {table_index + 1}. Dropping duplicates.",file=log_file)
                            df_continuation = df_continuation.loc[:, ~df_continuation.columns.duplicated()]

                        # Merge with the last extracted table
                        extracted_tables[-1] = pd.concat([extracted_tables[-1], df_continuation], ignore_index=True, sort=False)            



                                         
    except Exception as e:
        print(f"Error processing Table 7 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

    # After processing all tables, concatenate them
    if extracted_tables:
        all_columns = set()
        for df in extracted_tables:
            all_columns.update(df.columns.tolist())

        standardized_tables = []
        for df in extracted_tables:
            standardized_df = df.reindex(columns=all_columns)
            standardized_tables.append(standardized_df)

        print("\nConcatenating all extracted Table 7 data...", file=log_file)
        try:
            table7_data = pd.concat(standardized_tables, ignore_index=True, sort=False)
            print(f"Successfully concatenated {len(standardized_tables)} tables.", file=log_file)
        except Exception as e:
            print(f"Error concatenating tables: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            table7_data = pd.DataFrame()
    else:
        print("No Table 7 data extracted.", file=log_file)
        table7_data = pd.DataFrame()

    return table7_data


 

 

 




def extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False):
    """Combine a PDF's base project record with its Table 7 rows.

    Args:
        pdf_path: Path of the PDF to scrape.
        project_id: Queue/project id passed through to extract_base_data.
        log_file: Open text file that receives progress messages.
        is_addendum: Forwarded to extract_table7.

    Returns:
        The base-data frame alone when no Table 7 rows were extracted (or when
        the merge fails); otherwise one row per Table 7 row with the base-data
        columns placed to the left of the Table 7 columns.
    """
    base_data = extract_base_data(pdf_path, project_id, log_file)
    table7_data = extract_table7(pdf_path, log_file, is_addendum)

    # Nothing to merge: hand back the base record untouched.
    if table7_data.empty:
        return base_data

    # Columns present in both frames are dropped from the Table 7 side,
    # with 'point_of_interconnection' deliberately left in place.
    shared_columns = base_data.columns.intersection(table7_data.columns)
    columns_to_drop = shared_columns.difference(['point_of_interconnection'])
    table7_data = table7_data.drop(columns=columns_to_drop, errors='ignore')

    # One copy of the base record for every Table 7 row.
    row_count = len(table7_data)
    base_data_repeated = pd.concat([base_data] * row_count, ignore_index=True)

    try:
        # Side-by-side concatenation relies on both frames having a 0..n-1 index.
        merged_df = pd.concat([base_data_repeated, table7_data], axis=1, sort=False)

        # Back-fill the POI column from the base record if neither frame supplied it.
        if 'point_of_interconnection' not in merged_df.columns:
            merged_df['point_of_interconnection'] = base_data['point_of_interconnection'].iloc[0]
            print(f"Added 'point_of_interconnection' to merged data for {pdf_path}.", file=log_file)

        print(f"Merged base data with Table 7 data for {pdf_path}.", file=log_file)
        return merged_df
    except Exception as e:
        print(f"Error merging base data with Table 7 data for {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return base_data  # Fallback to base data only



            

def check_has_table7(pdf_path):
    """Return True if any page's text mentions Table 7, False otherwise.

    Any error while opening or reading the PDF is treated as "no Table 7".
    """
    table7_pattern = r"(Modification\s+of\s+)?Table\s*7[-.]?\d*"
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # any() stops at the first matching page, like an early return.
            return any(
                re.search(table7_pattern, page.extract_text() or "", re.IGNORECASE)
                for page in pdf.pages
            )
    except Exception:
        # Unreadable or missing PDF: report that no Table 7 was found.
        return False

def is_addendum(pdf_path):
    """Return True when the first page's text contains the word 'Addendum'.

    The match is case-sensitive. A PDF with no pages, or one that cannot be
    opened or read, is reported as not being an addendum.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            if not pages:
                return False
            cover_text = pages[0].extract_text() or ""
    except Exception:
        # Unreadable or missing PDF: treat it as an original.
        return False
    return "Addendum" in cover_text

def process_pdfs_in_folder():
    """Scrape every project in PROJECT_RANGE and write the results to CSV.

    For each project id the function looks in
    ``<BASE_DIRECTORY>/<project_id>/02_phase_1_study`` and handles every file
    whose name ends in ".pdf":

    * the PDF is classified as addendum or original via is_addendum();
    * PDFs without a Table 7 mention are skipped;
    * the first original with Table 7 supplies the project's base data;
    * an addendum seen after base data exists is merged with that base data and
      appended to ``core_addendums``;
    * every other PDF goes through extract_table7_and_replace_none() and is
      appended to ``core_addendums`` or ``core_originals`` by its type.

    Progress goes to the log file at LOG_FILE_PATH (opened in 'w' mode, so it is
    overwritten on each run); a summary is printed to stdout at the end. The
    module-level tracking sets, lists and counters are updated as a side effect.

    Returns:
        None.
    """
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped

    # Ensure the log file directory exists
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)

    with open(LOG_FILE_PATH, 'w') as log_file:
        for project_id in PROJECT_RANGE:
            project_path = os.path.join(BASE_DIRECTORY, str(project_id), "02_phase_1_study")
            if not os.path.exists(project_path):
                missing_projects.add(project_id)
                print(f"Project path does not exist: {project_path}", file=log_file)
                continue

            project_scraped = False  # Flag to track if any PDF in the project was scraped
            # Base data is per project: reset for each project id.
            base_data_extracted = False
            base_data = pd.DataFrame()

            # NOTE(review): os.listdir returns names in arbitrary order, so whether an
            # addendum is seen before or after the project's original (and therefore
            # which branch below handles it) depends on directory ordering.
            for pdf_name in os.listdir(project_path):
                # Case-sensitive suffix test: files named "*.PDF" are ignored.
                if pdf_name.endswith(".pdf"):
                    pdf_path = os.path.join(project_path, pdf_name)
                    total_pdfs_accessed += 1
                    is_add = is_addendum(pdf_path)

                    # Record the PDF under its type and log the access
                    if is_add:
                        addendum_pdfs.append(pdf_name)
                        print(f"Accessing Addendum PDF: {pdf_name} from Project {project_id}", file=log_file)
                    else:
                        original_pdfs.append(pdf_name)
                        print(f"Accessing Original PDF: {pdf_name} from Project {project_id}", file=log_file)

                    try:
                        has_table7 = check_has_table7(pdf_path)
                        if not has_table7:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 7)", file=log_file)
                            # Print to ipynb output
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 7)")
                            total_pdfs_skipped += 1
                            continue

                        if not is_add and not base_data_extracted:
                            # Extract base data from original PDF
                            # NOTE(review): this original then falls into the `else` branch
                            # below, where extract_table7_and_replace_none extracts the base
                            # data from the same PDF a second time.
                            base_data = extract_base_data(pdf_path, project_id, log_file)
                            base_data_extracted = True
                            print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)

                        if is_add and base_data_extracted:
                            # For addendums, use the extracted base data
                            table7_data = extract_table7(pdf_path, log_file, is_addendum=is_add)
                            if not table7_data.empty:
                                # Merge base data with Table 7 data
                                # (base record repeated once per Table 7 row, then joined side by side;
                                # unlike extract_table7_and_replace_none, overlapping columns are not dropped here)
                                merged_df = pd.concat([base_data] * len(table7_data), ignore_index=True)
                                merged_df = pd.concat([merged_df, table7_data], axis=1, sort=False)
                                core_addendums = pd.concat([core_addendums, merged_df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped Addendum PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                # Optionally, print to ipynb
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                        else:
                            # Originals, and addendums seen before any base data was extracted:
                            # extract base data and Table 7 from this PDF itself
                            df = extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=is_add)
                            if not df.empty:
                                if is_add:
                                    core_addendums = pd.concat([core_addendums, df], ignore_index=True)
                                else:
                                    core_originals = pd.concat([core_originals, df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                # Optionally, print to ipynb
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1

                    except Exception as e:
                        # A failure in one PDF is logged and counted as a skip; the loop continues.
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                        print(traceback.format_exc(), file=log_file)
                        # Optionally, print to ipynb
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                        total_pdfs_skipped += 1

            # After processing all PDFs for this project, check if any PDF was scraped
            # (the os.path.exists test is always true here: missing paths `continue` above)
            if not project_scraped and os.path.exists(project_path):
                skipped_projects.add(project_id)


    # After processing all PDFs, save to CSV
    save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
    save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")

    # Calculate total projects processed (missing projects are not included)
    total_projects_processed = len(scraped_projects) + len(skipped_projects)

    # Print summary to ipynb
    print("\n=== Scraping Summary ===")
    print(f"Total Projects Processed: {total_projects_processed}")
    print(f"Total Projects Scraped: {len(scraped_projects)}")
    print(f"Total Projects Skipped: {len(skipped_projects)}")
    print(f"Total Projects Missing: {len(missing_projects)}")
    print(f"Total PDFs Accessed: {total_pdfs_accessed}")
    print(f"Total PDFs Scraped: {total_pdfs_scraped}")
    print(f"Total PDFs Skipped: {total_pdfs_skipped}")

    print("\nList of Scraped Projects:")
    print(sorted(scraped_projects))

    print("\nList of Skipped Projects:")
    print(sorted(skipped_projects))

    print("\nList of Missing Projects:")
    print(sorted(missing_projects))

    print("\nList of Scraped PDFs:")
    print(scraped_pdfs)

    print("\nList of Skipped PDFs:")
    print(skipped_pdfs)

    print("\nList of Addendum PDFs:")
    print(addendum_pdfs)

    print("\nList of Original PDFs:")
    print(original_pdfs)

    # NOTE(review): PDFs are matched by bare file name, so identically named files in
    # different projects would be counted under both types - confirm names are unique.
    print("\nNumber of Original PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in original_pdfs]))
    print("Number of Addendum PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs]))

def save_to_csv(df, output_csv_path, data_type):
    """Clean a scraped DataFrame and write it to a CSV file.

    Args:
        df (pd.DataFrame): Scraped rows to save.
        output_csv_path (str): Destination CSV path; its parent directory is
            created if it does not exist yet.
        data_type (str): Label used in the progress messages
            (e.g. "originals" or "addendums").

    Returns:
        None. An empty ``df`` is reported and nothing is written. Errors while
        writing are printed (with traceback) rather than raised.
    """
    if df.empty:
        print(f"No data to save for {data_type}.")
        return

    # Clean up the entire DataFrame by cleaning string cells
    df = df.map(clean_string_cell)

    # Drop repeated header rows, i.e. rows where any cell contains "Type of Upgrade"
    df = df[~df.apply(lambda row: contains_phrase(row, "Type of Upgrade"), axis=1)]

    # Reorder columns as specified
    df = reorder_columns(df)
    print(f"\nColumns reordered for {data_type} as per specification.")

    # Coerce q_id to numeric (unparseable values become NaN). assign() builds a new
    # frame, so the filtered slice above is never written to in place.
    if 'q_id' in df.columns:
        df = df.assign(q_id=pd.to_numeric(df['q_id'], errors='coerce'))

    # Save the DataFrame to CSV
    try:
        # Create the destination folder first so a fresh output tree does not
        # cause the whole scrape result to be lost at the very last step.
        output_dir = os.path.dirname(output_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

def main():
    """Run the scraping pipeline by delegating to process_pdfs_in_folder()."""
    process_pdfs_in_folder()


# Start the scrape only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


Scraped PDF: Appendix A - Q945 LECU.pdf from Project 945

Columns reordered for originals as per specification.

Data successfully saved to /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 6/rawdata_cluster6_originals.csv
No data to save for addendums.

=== Scraping Summary ===
Total Projects Processed: 1
Total Projects Scraped: 1
Total Projects Skipped: 0
Total Projects Missing: 0
Total PDFs Accessed: 1
Total PDFs Scraped: 1
Total PDFs Skipped: 0

List of Scraped Projects:
[945]

List of Skipped Projects:
[]

List of Missing Projects:
[]

List of Scraped PDFs:
['Appendix A - Q945 LECU.pdf']

List of Skipped PDFs:
[]

List of Addendum PDFs:
[]

List of Original PDFs:
['Appendix A - Q945 LECU.pdf']

Number of Original PDFs Scraped: 1
Number of Addendum PDFs Scraped: 0


# Project 945 was missed, so it is scraped individually here and added manually

In [None]:
# Just had to add an extra line that checks whether the DataFrame is empty before accessing its rows.

In [7]:
import os
import pdfplumber
import pandas as pd
import re
import PyPDF2
import traceback

# Input root, output locations and project range for the single-project (Q945) re-run
BASE_DIRECTORY = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_ORIGINAL = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/03_raw/rawdata_cluster6_style_G_945.csv"
OUTPUT_CSV_PATH_ADDENDUM = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/03_raw/rawdata_cluster6_style_G_945_addendums.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/03_raw/scraping_cluster6_style_G_log_945.txt"
PROJECT_RANGE = range(945, 947)  # q_ids 945 and 946 (the end of the range is exclusive)

# Accumulators for the scraped rows, one per PDF type
core_originals, core_addendums = pd.DataFrame(), pd.DataFrame()

# Run-level bookkeeping: project ids by outcome ...
scraped_projects, skipped_projects, missing_projects = set(), set(), set()
# ... PDF file names by outcome and by type ...
scraped_pdfs, skipped_pdfs, addendum_pdfs, original_pdfs = [], [], [], []
# ... and PDF counters
total_pdfs_accessed = total_pdfs_scraped = total_pdfs_skipped = 0

def clean_column_headers(headers):
    """Normalize a list of raw table headers.

    None becomes an empty string. Strings are lower-cased, have whitespace runs
    collapsed to single spaces, lose any parenthesised text and any remaining
    non-alphanumeric characters, and are stripped. Values of any other type are
    passed through unchanged.
    """
    def _normalize(raw_header):
        if raw_header is None:
            return ""
        if not isinstance(raw_header, str):
            return raw_header
        text = re.sub(r'\s+', ' ', raw_header.lower())
        text = re.sub(r'\(.*?\)', '', text)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return text.strip()

    return [_normalize(raw_header) for raw_header in headers]

def clean_string_cell(value):
    """Return a string cell with newlines turned into spaces and the ends trimmed.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    single_line = value.replace('\n', ' ')
    return single_line.strip()

def contains_phrase(row, phrase):
    """Check whether any cell in a row contains a specific phrase.

    The match is case-insensitive and tolerant of whitespace differences: the
    words of ``phrase`` may be separated by any amount of whitespace (including
    none or a newline) in the cell. The words themselves are matched literally,
    so characters such as ``.``, ``(`` or ``+`` in ``phrase`` have no regex
    meaning.

    Args:
        row (pd.Series): The row to inspect; every cell is converted to ``str``.
        phrase (str): The phrase to look for.

    Returns:
        A truthy boolean if at least one cell contains the phrase.
    """
    # Escape each word so the phrase is treated as text, then allow flexible
    # whitespace between the words.
    regex_pattern = r"\s*".join(re.escape(word) for word in phrase.split())
    pattern = re.compile(regex_pattern, flags=re.IGNORECASE)
    return row.astype(str).apply(lambda cell: bool(pattern.search(cell))).any()

def extract_specific_phrase(title):
    """
    Maps a table title to the first known upgrade keyword it contains.

    Keywords are tried in a fixed priority order and matched as whole words,
    ignoring case; the keyword is returned in its canonical spelling.

    Args:
        title (str): The table title string.

    Returns:
        str: The first matching keyword, or the unchanged title if none match.
    """
    known_phrases = (
        "PTO",
        "Reliability Network Upgrade",
        "Area Delivery Network Upgrade",
        "Local Delivery Network",
        "ADNU",
        "LDNU",
        "RNU",
    )

    # next() with a default gives "first match, else fall back to the title".
    return next(
        (
            candidate
            for candidate in known_phrases
            if re.search(rf"\b{re.escape(candidate)}\b", title, re.IGNORECASE)
        ),
        title,
    )

def reorder_columns(df):
    """
    Moves the well-known columns to the front in a fixed order.

    Columns from the preferred list that exist in the DataFrame come first, in
    list order; all other columns follow in their original relative order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: A DataFrame with the same columns in the new order.
    """
    preferred = (
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type of upgrade",
        "upgrade",
        "description",
        "cost allocation factor",
    )

    leading = [name for name in preferred if name in df.columns]
    trailing = [name for name in df.columns if name not in leading]
    return df[leading + trailing]

def search_gps_coordinates(text, log_file):
    """Search text for GPS coordinates using several patterns, in order.

    Patterns tried (all case-insensitive):
      1. ``gps coordinates: <lat>, <lon>``
      2. ``latitude <lat> ... longitude <lon>``
      3. ``gps coordinates: <lat> N|S, <lon> E|W`` - the value is made negative
         for the southern / western hemisphere.

    Args:
        text (str): Text extracted from the PDF.
        log_file (file object): Log file to write progress messages to.

    Returns:
        tuple: ``(latitude, longitude)`` as strings, or ``(None, None)`` when no
        pattern matches.
    """
    gps_coords = re.search(r"gps coordinates:\s*([\d\.\-]+),\s*([\d\.\-]+)", text, re.IGNORECASE)
    if gps_coords:
        print(f"Found GPS coordinates: {gps_coords.groups()}", file=log_file)
        return gps_coords.groups()

    project_coords = re.search(r"latitude[:\s]*([\d\.\-]+)[^\d]+longitude[:\s]*([\d\.\-]+)", text, re.IGNORECASE)
    if project_coords:
        print(f"Found project coordinates: {project_coords.groups()}", file=log_file)
        return project_coords.groups()

    # Capture the hemisphere letters themselves. Testing for "N"/"E" anywhere in
    # the text is always true (the word "coordinates" contains both letters), so
    # southern and western values would never be negated.
    gps_coords_directional = re.search(
        r"gps coordinates:\s*([\d\.\-]+)\s*([nNsS]),\s*([\d\.\-]+)\s*([eEwW])", text, re.IGNORECASE)
    if gps_coords_directional:
        lat, lat_dir, lon, lon_dir = gps_coords_directional.groups()
        # lstrip('-') avoids producing "--12.3" if the value already carries a sign.
        latitude = f"-{lat.lstrip('-')}" if lat_dir.upper() == "S" else lat
        longitude = f"-{lon.lstrip('-')}" if lon_dir.upper() == "W" else lon
        print(f"Found directional GPS coordinates: {(latitude, longitude)}", file=log_file)
        return (latitude, longitude)

    print("GPS coordinates not found.", file=log_file)
    return (None, None)

def _collect_poi_parts_from_neighbors(tab, label_row_idx, value_col_idx, poi_pattern, log_file):
    """Gather POI text fragments from the value column of the rows around the label row.

    Scans up to two rows above and two rows below ``label_row_idx`` (0-based),
    skipping the label row itself, empty/None cells and cells that are
    themselves a POI label. Returns the fragments in row order.
    """
    start_scan = max(0, label_row_idx - 2)
    end_scan = min(len(tab), label_row_idx + 3)  # Exclusive

    print(f"Scanning rows {start_scan + 1} to {end_scan} for POI value parts.", file=log_file)

    parts = []
    for scan_row_index in range(start_scan, end_scan):
        # Skip the row where the label was found
        if scan_row_index == label_row_idx:
            continue

        scan_row = tab[scan_row_index]
        # The value column may not exist in shorter rows
        if value_col_idx >= len(scan_row):
            continue

        scan_cell = clean_string_cell(scan_row[value_col_idx])
        if not scan_cell:
            # Empty cells are extracted as None or ""; running the regex on None
            # would raise TypeError and abort the whole Table 1 extraction.
            continue

        if poi_pattern.search(scan_cell):
            # Another POI label, not a value
            print(f"Encountered another POI label in row {scan_row_index + 1}. Skipping this row.", file=log_file)
            continue

        parts.append(scan_cell)
        print(f"Found POI part in row {scan_row_index + 1}: '{scan_cell}'", file=log_file)

    return parts


def extract_table1(pdf_path, log_file):
    """
    Extracts the Point of Interconnection (POI) from Table 1 in the provided PDF.

    Pages whose text mentions "Table 1" are located first; tables on those pages
    (plus the following page) are then extracted with each table-settings
    variant in turn. The POI value is taken from the cell to the right of the
    "Point of Interconnection" label; if that cell is empty, the same column in
    the two rows above and below the label is scanned and the non-empty
    fragments are joined with spaces.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.

    Returns:
        str: Extracted Point of Interconnection value, or
        None if Table 1 or a POI value could not be found, or if any error
        occurred while reading the PDF.

    NOTE(review): a "Value Missing" return for "label found but no value" was
    documented previously, but that branch could never execute; callers have
    always received None in that case and still do.
    """
    print(f"\nProcessing {pdf_path} for Table 1 extraction...", file=log_file)

    # Regex for the 'Point of Interconnection' label (case-insensitive)
    poi_pattern = re.compile(r"Point\s+of\s+Interconnection", re.IGNORECASE)

    # Table extraction settings to try, in order
    table_settings_list = [
        {
            "horizontal_strategy": "text",
            "vertical_strategy": "lines",
            "snap_tolerance": 1,
        },
        {
            "horizontal_strategy": "lines",
            "vertical_strategy": "lines",
            "snap_tolerance": 2,  # Increased tolerance for retry
        }
    ]

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify all pages that contain "Table 1"
            table1_pages = [
                i for i, page in enumerate(pdf.pages)
                if re.search(r"Table\s*1\b", page.extract_text() or "", re.IGNORECASE)
            ]

            if not table1_pages:
                print("No Table 1 found in the PDF.", file=log_file)
                return None  # Return None if no Table 1 found

            scrape_start = table1_pages[0]
            scrape_end = table1_pages[-1] + 1  # Plus one to include the next page if needed

            print(f"Table 1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)

            # Iterate through the specified page range (clamped to the document length)
            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table 1...", file=log_file)

                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"\nAttempt {attempt} with table settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1} with current settings.", file=log_file)

                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty. Skipping.", file=log_file)
                            continue  # Skip empty tables

                        # Dump the raw table to the log for debugging
                        print(f"\n--- Table {table_index} on Page {page_number + 1} ---", file=log_file)
                        for row_num, row in enumerate(tab, start=1):
                            print(f"Row {row_num}: {row}", file=log_file)

                        for row_index, row in enumerate(tab, start=1):
                            for col_idx, cell in enumerate(row):
                                if not (cell and poi_pattern.search(cell)):
                                    continue

                                # The value is assumed to sit in the next column
                                value_col_idx = col_idx + 1
                                if value_col_idx >= len(row):
                                    print(f"\nPoint of Interconnection label found but no adjacent column "
                                          f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                    continue  # Keep looking; later cells/tables/settings may succeed

                                poi_value = clean_string_cell(row[value_col_idx])
                                if poi_value:
                                    print(f"\nFound Point of Interconnection: '{poi_value}' "
                                          f"(Page {page_number + 1}, Table {table_index}, Row {row_index})", file=log_file)
                                    return poi_value

                                print(f"\nPoint of Interconnection label found but adjacent value is empty "
                                      f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)

                                # The value may have been split across the surrounding rows
                                poi_value_parts = _collect_poi_parts_from_neighbors(
                                    tab, row_index - 1, value_col_idx, poi_pattern, log_file)
                                if poi_value_parts:
                                    point_of_interconnection = " ".join(poi_value_parts)
                                    print(f"\nConcatenated Point of Interconnection: '{point_of_interconnection}' "
                                          f"(Page {page_number + 1}, Table {table_index})", file=log_file)
                                    return point_of_interconnection

                                print(f"\nNo POI value found in the surrounding rows "
                                      f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                # Do not return; proceed to the remaining cells / retries

    except Exception as e:
        print(f"Error processing Table 1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None

    # Every page, settings variant and table was tried without success
    print("Point of Interconnection not found in Table 1.", file=log_file)
    return None



def extract_base_data(pdf_path, project_id, log_file):
    """Build the one-row base-data frame for a single study PDF.

    The document text is read with PyPDF2, passed through
    ``clean_string_cell`` and searched with regular expressions for the
    queue id, cluster number, deliverability status and capacity.  The point
    of interconnection is obtained from ``extract_table1`` and the
    coordinates from ``search_gps_coordinates``.

    Args:
        pdf_path: Path of the PDF to read.
        project_id: Used (as a string) for ``q_id`` when the text contains no
            queue id.
        log_file: Open text stream that receives all progress messages.

    Returns:
        A one-row ``pd.DataFrame`` with the columns ``q_id``, ``cluster``,
        ``req_deliverability``, ``latitude``, ``longitude``, ``capacity`` and
        ``point_of_interconnection``.  If anything raises, the error and
        traceback are logged and an empty DataFrame is returned.
    """
    print("Extracting base data from PDF...", file=log_file)
    try:
        with open(pdf_path, 'rb') as handle:
            page_texts = [sheet.extract_text() for sheet in PyPDF2.PdfReader(handle).pages]
        # Pages that yield no text are skipped before joining.
        text = clean_string_cell("".join(chunk for chunk in page_texts if chunk))

        id_match = re.search(r"q[\s_-]*(\d+)", text, re.IGNORECASE)
        # No queue id in the text: fall back to the caller's project id.
        queue_id = str(project_id) if id_match is None else id_match.group(1)
        print(f"Extracted Queue ID: {queue_id}", file=log_file)

        cluster_match = re.search(r"queue[\s_-]*cluster[\s_-]*(\d+)", text, re.IGNORECASE)
        cluster_number = None if cluster_match is None else cluster_match.group(1)
        print(f"Extracted Cluster Number: {cluster_number}", file=log_file)

        status_match = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = None if status_match is None else status_match.group(1)
        print(f"Extracted Deliverability Status: {deliverability_status}", file=log_file)

        # Capacity: prefer the explicit "total rated output" sentence.
        rated_match = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if rated_match is None:
            # NOTE(review): this fallback has no IGNORECASE flag, so it only
            # matches a lower-case "mw" - confirm that is intended.
            rated_match = re.search(r"(\d+)\s*mw", text)
        capacity = None if rated_match is None else int(rated_match.group(1))
        print(f"Extracted Capacity: {capacity}", file=log_file)

        # Both helpers write their own messages to the log.
        point_of_interconnection = extract_table1(pdf_path, log_file)
        latitude, longitude = search_gps_coordinates(text, log_file)

        # Single-element lists so the dict maps straight onto a one-row frame.
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "latitude": [latitude],
            "longitude": [longitude],
            "capacity": [capacity],
            "point_of_interconnection": [point_of_interconnection]
        }

        print("Base data extracted:", file=log_file)
        print(base_data, file=log_file)
        return pd.DataFrame(base_data)

    except Exception as e:
        print(f"Error extracting base data from {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()  # Callers test ``.empty`` to detect failure


 
 
def adjust_rows_length(data_rows, headers):
    """Force every row of ``data_rows`` to hold exactly ``len(headers)`` cells.

    ``data_rows`` is modified in place and nothing is returned: rows that are
    too long are replaced by a truncated copy, rows that are too short are
    extended with empty strings.
    """
    width = len(headers)
    for position, cells in enumerate(data_rows):
        shortfall = width - len(cells)
        if shortfall < 0:
            # Too many cells: keep only the leading ``width`` of them.
            data_rows[position] = cells[:width]
        elif shortfall > 0:
            # Too few cells: pad the existing row object on the right.
            cells.extend([""] * shortfall)

def extract_table7(pdf_path, log_file, is_addendum=False):
    """
    Extracts Table 7 data from the provided PDF.

    Pages whose text matches the page-selection pattern bound the range of
    pages that is scanned. Every ruled table pdfplumber finds in that range
    is treated as either

    * a new Table 7 section, when a "Table 7..." title line is found in the
      page region above the table: the first row becomes the cleaned header
      and the phrase derived from the title is written to "type of upgrade"
      (added as a column, or filled into a blank first row); or
    * a continuation of the previous section: its rows are widened by
      prepending the current phrase, or truncated, to match the previous
      section's column count and are then appended to that section.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.
        is_addendum (bool): Whether the PDF is an addendum. Accepted for the
            callers' benefit; it is not read inside this function.

    Returns:
        pd.DataFrame: Extracted Table 7 data holding the union of all section
        columns in first-seen order. Empty when no page matches, when no
        table is extracted, or when any exception is raised (the error and
        traceback are written to ``log_file``).
    """
    print(f"\nProcessing {pdf_path} for Table 7 extraction...", file=log_file)
    extracted_tables = []
    # Phrase from the most recent Table 7 title; reused by continuation tables.
    specific_phrase = None

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify pages that contain "Table 7"
            # NOTE(review): the "7" is optional in this pattern ("7?"), so any
            # page containing the word "Table" qualifies, whereas
            # check_has_table7 uses the stricter "7[-.]?\d*". Possibly
            # deliberate, to widen the scanned page range - confirm.
            table7_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"(Modification\s+of\s+)?Table\s*7?\d*", text, re.IGNORECASE):
                    table7_pages.append(i)

            if not table7_pages:
                print("No Table 7 found in the PDF.", file=log_file)
                return pd.DataFrame()

            first_page = table7_pages[0]
            last_page = table7_pages[-1]
            scrape_start = first_page
            # Exclusive upper bound: one past the last matching page.
            scrape_end = last_page + 1

            print(f"Table 7 starts on page {scrape_start + 1} and ends on page {scrape_end}", file=log_file)

            for page_number in range(scrape_start, min(scrape_end, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1}...", file=log_file)
                tables = page.find_tables(table_settings={
                    "horizontal_strategy": "lines",
                    "vertical_strategy": "lines",
                })

                for table_index, table in enumerate(tables):
                    tab = table.extract()
                    if not tab:
                        print(f"Table {table_index + 1} on page {page_number + 1} is empty. Skipping.", file=log_file)
                        continue

                    # Look for a title in the page region above this table.
                    table_bbox = table.bbox
                    title_bbox = (0, 0, page.width, table_bbox[1])
                    title_text = page.within_bbox(title_bbox).extract_text() or ""
                    table_title = None

                    if title_text:
                        # Scan bottom-up so the line closest to the table wins.
                        title_lines = title_text.split('\n')[::-1]
                        for line in title_lines:
                            line = line.strip()
                            match = re.search(r"(Modification\s+of\s+)?Table\s*7[-.]?\d*[:\-\s]*(.*)", line, re.IGNORECASE)
                            if match:
                                table_title = match.group(2).strip()
                                break

                    if table_title:
                        # New Table 7 detected
                        specific_phrase = extract_specific_phrase(table_title)
                        print(f"New Table 7 detected: '{specific_phrase}' on page {page_number + 1}, table {table_index + 1}", file=log_file)

                        headers = clean_column_headers(tab[0])
                        data_rows = tab[1:]

                        # Create DataFrame for new table
                        try:
                            df_new = pd.DataFrame(data_rows, columns=headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for new table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue

                        if df_new.empty:
                            print(f"The extracted DataFrame for 'Area Delivery Network Upgrade' is empty. Skipping this table.", file=log_file)
                            continue

                        # Handle new ADNU tables (grouping logic)
                        if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                            print("Detected 'Area Delivery Network Upgrade' table (new).", file=log_file)
                            if "adnu" in df_new.columns:
                                if "type of upgrade" not in df_new.columns:
                                    # Group all adnu rows into one 'upgrade' row
                                    adnu_values = df_new["adnu"].dropna().astype(str).tolist()
                                    grouped_adnu = " ".join(adnu_values)
                                    # Remaining columns take their value from the first row only.
                                    other_columns = df_new.drop(columns=["adnu"]).iloc[0].to_dict()

                                    df_grouped = pd.DataFrame({
                                        "upgrade": [grouped_adnu],
                                        "type of upgrade": [specific_phrase]
                                    })

                                    for col, value in other_columns.items():
                                        df_grouped[col] = value

                                    print("Grouped all 'adnu' rows into a single 'upgrade' row for new ADNU table.", file=log_file)
                                    df_new = df_grouped
                                else:
                                    # If 'type of upgrade' exists, just rename adnu if needed
                                    if "upgrade" in df_new.columns:
                                        df_new.drop(columns=['adnu'], inplace=True)
                                        print("Dropped 'adnu' column to avoid duplicate 'upgrade'.", file=log_file)
                                    else:
                                        df_new.rename(columns={'adnu': 'upgrade'}, inplace=True)
                                        print("Renamed 'adnu' to 'upgrade' in new ADNU table.", file=log_file)
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new ADNU table.", file=log_file)
                            else:
                                # Column already present: only fill the first row when it is blank.
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    df_new.at[0, "type of upgrade"] = specific_phrase
                                    print("Replaced None in 'type of upgrade' first row for new ADNU table.", file=log_file)
                        else:
                            # Non-ADNU new tables
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new non-ADNU table.", file=log_file)
                            else:
                                # Column already present: only fill the first row when it is blank.
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print("Replacing None in 'type of upgrade' for first row in new non-ADNU table.", file=log_file)
                                    df_new.at[0, "type of upgrade"] = specific_phrase

                        if df_new.columns.duplicated().any():
                            print("Duplicate columns detected in new table. Dropping duplicates.", file=log_file)
                            df_new = df_new.loc[:, ~df_new.columns.duplicated()]

                        extracted_tables.append(df_new)
                    else:
                        # Continuation Table
                        if specific_phrase is None:
                            print(f"No previous Table 7 title found for continuation on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue

                        print(f"Continuation Table detected on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        data_rows = tab

                        # Check if the number of columns matches
                        expected_columns = len(extracted_tables[-1].columns) if extracted_tables else None
                        if expected_columns is None:
                            # Logged like every other diagnostic (previously went to stdout).
                            print(f"No existing table to continue with on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue  # No table to continue with

                        # Define expected columns based on the last extracted table
                        expected_headers = extracted_tables[-1].columns.tolist()

                        # Detect header row in continuation
                        header_keywords = ["type of upgrade", "adnu"]
                        first_continuation_row = data_rows[0] if data_rows else []
                        is_header_row = any(
                            re.search(rf"\b{kw}\b", str(cell), re.IGNORECASE) for kw in header_keywords for cell in first_continuation_row
                        )

                        if is_header_row:
                            print(f"Detected header row in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

                            # Go back to previous page bounding box to update table title if needed
                            if page_number > 0:
                                previous_page = pdf.pages[page_number - 1]
                                # NOTE(review): the top of this region is the current
                                # table's top edge, applied to the previous page - confirm.
                                bbox_lower_region = (0, table_bbox[1], previous_page.width, previous_page.height)
                                title_text_previous = previous_page.within_bbox(bbox_lower_region).extract_text() or ""
                                new_table_title = None
                                if title_text_previous:
                                    title_lines_prev = title_text_previous.split('\n')[::-1]
                                    for line in title_lines_prev:
                                        line = line.strip()
                                        match_prev = re.search(r"(Modification\s+of\s+)?Table\s*7[-.]?\d*[:\-\s]*(.*)", line, re.IGNORECASE)
                                        if match_prev:
                                            new_table_title = match_prev.group(2).strip()
                                            break
                                if new_table_title:
                                    specific_phrase = extract_specific_phrase(new_table_title)
                                    print(f"Updated table title from previous page: '{specific_phrase}' for continuation table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                                else:
                                    print("No table title found in previous page region. Using existing specific_phrase.", file=log_file)
                            else:
                                print("No previous page available for title extraction for continuation table. Using existing specific_phrase.", file=log_file)

                            # Handle continuation ADNU or non-ADNU
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                print("Continuation ADNU table detected. No grouping, just rename and type of upgrade handling.", file=log_file)

                                if "type of upgrade" not in data_rows[0]:
                                    # Insert 'type of upgrade' column at the beginning
                                    print("Inserting 'type of upgrade' column with specific phrase in continuation table.",file=log_file)
                                    data_rows = [ [specific_phrase] + row for row in data_rows ]

                                    if "ADNU" in data_rows[0]:
                                        print("Handling continuation for 'Area Delivery Network Upgrade' table",file=log_file)
                                        print("Renaming 'ADNU' to 'upgrade' in continuation table.",file=log_file)
                                        # Only the header cell is renamed; the header row is dropped below.
                                        adnu_idx = data_rows[0].index("ADNU")
                                        data_rows[0][adnu_idx] = "upgrade"

                        # Handle missing or extra columns
                        if len(data_rows[0]) < expected_columns:
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                # For ADNU tables, assume missing "upgrade" column
                                print(f"Detected missing 'upgrade' column in continuation table on page {page_number + 1}, table {table_index + 1}. Inserting 'upgrade' column.",file=log_file)
                                data_rows = [[specific_phrase] + row for row in data_rows]
                            else:
                                # For other tables, assume missing "Type of Upgrade" column
                                print(f"Detected missing 'Type of Upgrade' column in continuation table on page {page_number + 1}, table {table_index + 1}. Inserting 'Type of Upgrade' column.",file=log_file)
                                data_rows = [[specific_phrase] + row for row in data_rows]
                        elif len(data_rows[0]) > expected_columns:
                            # Extra columns detected; adjust accordingly
                            print(f"Detected extra columns in continuation table on page {page_number + 1}, table {table_index + 1}. Dropping extra columns.",file=log_file)
                            data_rows = [row[:expected_columns] for row in data_rows]

                        # Create DataFrame for the continuation table
                        if is_header_row:
                            data_rows = data_rows[1:]
                            print(f"Dropped header row from data_rows after modifications for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

                        try:
                            df_continuation = pd.DataFrame(data_rows, columns=expected_headers)
                        except ValueError as ve:
                            # Logged like every other diagnostic (previously went to stdout).
                            print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue  # Skip this table due to error

                        if df_continuation.empty:
                            print(f"The continuation DataFrame on page {page_number + 1}, table {table_index + 1} is empty. Skipping this table.", file=log_file)
                            continue

                        # Special Handling for "Area Delivery Network Upgrade" Tables in Continuation
                        if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                            if "type of upgrade" in df_continuation.columns:
                                first_row = df_continuation.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print(f"Replacing 'None' in 'type of upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                # If "type of upgrade" column does not exist, add it
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                        else:
                            # General Handling for other tables
                            if "type of upgrade" in df_continuation.columns:
                                first_row = df_continuation.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print(f"Replacing 'None' in 'Type of Upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                # If "Type of Upgrade" column does not exist, add it
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'Type of Upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)

                        # Ensure no duplicate columns
                        if df_continuation.columns.duplicated().any():
                            print(f"Duplicate columns detected in continuation table on page {page_number + 1}, table {table_index + 1}. Dropping duplicates.",file=log_file)
                            df_continuation = df_continuation.loc[:, ~df_continuation.columns.duplicated()]

                        # Merge with the last extracted table
                        extracted_tables[-1] = pd.concat([extracted_tables[-1], df_continuation], ignore_index=True, sort=False)

    except Exception as e:
        print(f"Error processing Table 7 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

    # After processing all tables, concatenate them
    if extracted_tables:
        # Union of column names in first-seen order. An ordered list (rather
        # than a set) keeps the output column order the same on every run.
        all_columns = list(dict.fromkeys(
            column for df in extracted_tables for column in df.columns
        ))

        # Give every section the same columns; missing ones become NaN.
        standardized_tables = [df.reindex(columns=all_columns) for df in extracted_tables]

        print("\nConcatenating all extracted Table 7 data...", file=log_file)
        try:
            table7_data = pd.concat(standardized_tables, ignore_index=True, sort=False)
            print(f"Successfully concatenated {len(standardized_tables)} tables.", file=log_file)
        except Exception as e:
            print(f"Error concatenating tables: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            table7_data = pd.DataFrame()
    else:
        print("No Table 7 data extracted.", file=log_file)
        table7_data = pd.DataFrame()

    return table7_data


 

 

 




def extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False):
    """Return the PDF's base data repeated alongside each Table 7 row.

    Args:
        pdf_path: Path of the PDF to read.
        project_id: Passed to ``extract_base_data``.
        log_file: Open text stream that receives progress messages.
        is_addendum: Forwarded unchanged to ``extract_table7``.

    Returns:
        The base-data frame alone when Table 7 yields nothing or when the
        merge raises; otherwise a frame with the base-data columns followed
        by the Table 7 columns, one row per Table 7 row.
    """
    base_data = extract_base_data(pdf_path, project_id, log_file)
    table7_data = extract_table7(pdf_path, log_file, is_addendum)

    # Nothing to attach: hand the base data back on its own.
    if table7_data.empty:
        return base_data

    poi_column = 'point_of_interconnection'

    # Columns present on both sides are taken from the base data, except the
    # point of interconnection, which is left in place on the Table 7 side.
    shared_columns = base_data.columns.intersection(table7_data.columns).difference([poi_column])
    upgrade_rows = table7_data.drop(columns=shared_columns, errors='ignore')

    # One copy of the base-data row per Table 7 row.
    repeated_base = pd.concat([base_data] * len(upgrade_rows), ignore_index=True)

    try:
        # Side-by-side join on the shared positional index.
        merged_df = pd.concat([repeated_base, upgrade_rows], axis=1, sort=False)

        if poi_column not in merged_df.columns:
            merged_df[poi_column] = base_data[poi_column].iloc[0]
            print(f"Added 'point_of_interconnection' to merged data for {pdf_path}.", file=log_file)

        print(f"Merged base data with Table 7 data for {pdf_path}.", file=log_file)
    except Exception as e:
        print(f"Error merging base data with Table 7 data for {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return base_data  # Fallback to base data only

    return merged_df



            

def check_has_table7(pdf_path):
    """Report whether the text of any page in the PDF mentions Table 7.

    Returns ``True`` as soon as one page matches. Returns ``False`` when no
    page matches or when opening or reading the PDF raises.
    """
    table7_pattern = r"(Modification\s+of\s+)?Table\s*7[-.]?\d*"
    try:
        with pdfplumber.open(pdf_path) as document:
            # Lazy scan: stops extracting text at the first matching page.
            return any(
                re.search(table7_pattern, sheet.extract_text() or "", re.IGNORECASE)
                for sheet in document.pages
            )
    except Exception:
        # An unreadable PDF is treated the same as one without Table 7.
        return False

def is_addendum(pdf_path):
    """Tell whether the PDF's first page contains the word 'Addendum'.

    The check is a case-sensitive substring test on the first page's text.
    Returns ``False`` for a PDF with no pages and whenever opening or reading
    the PDF raises.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            pages = document.pages
            # Short-circuits on a page-less PDF before touching pages[0].
            return len(pages) != 0 and "Addendum" in (pages[0].extract_text() or "")
    except Exception:
        # An unreadable PDF is classified as "not an addendum".
        return False

def process_pdfs_in_folder():
    """Processes all PDFs in the specified project range and directory.

    For every project id in ``PROJECT_RANGE`` the folder
    ``BASE_DIRECTORY/<id>/02_phase_1_study`` is scanned for ``.pdf`` files.
    Each PDF is classified with ``is_addendum``, skipped when
    ``check_has_table7`` is false, and otherwise scraped; rows from originals
    are appended to ``core_originals`` and rows from addendums to
    ``core_addendums``.  Detailed messages go to the log file at
    ``LOG_FILE_PATH`` (overwritten on each run); skip/scrape notices and the
    final summary are printed to stdout.

    Side effects: rebinds the module-level frames and counters named in the
    ``global`` statement, mutates the module-level tracking lists/sets, and
    writes both CSVs through ``save_to_csv``.  Takes no arguments and
    returns nothing.
    """
    # Only names that are *rebound* need ``global``; the tracking lists and
    # sets are mutated in place.
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped

    # Ensure the log file directory exists
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)

    with open(LOG_FILE_PATH, 'w') as log_file:
        for project_id in PROJECT_RANGE:
            project_path = os.path.join(BASE_DIRECTORY, str(project_id), "02_phase_1_study")
            if not os.path.exists(project_path):
                missing_projects.add(project_id)
                print(f"Project path does not exist: {project_path}", file=log_file)
                continue

            project_scraped = False  # Flag to track if any PDF in the project was scraped
            # Base data is taken from the first original PDF seen in this
            # project and reused for addendums processed after it.
            base_data_extracted = False
            base_data = pd.DataFrame()

            # NOTE(review): os.listdir returns entries in arbitrary order, so
            # an addendum can be visited before the project's original; it
            # then falls into the ``else`` branch below and gets base data
            # from its own text. The ".pdf" suffix test is case-sensitive.
            for pdf_name in os.listdir(project_path):
                if pdf_name.endswith(".pdf"):
                    pdf_path = os.path.join(project_path, pdf_name)
                    total_pdfs_accessed += 1
                    is_add = is_addendum(pdf_path)

                    # Record the PDF's classification and log which kind is being accessed
                    if is_add:
                        addendum_pdfs.append(pdf_name)
                        print(f"Accessing Addendum PDF: {pdf_name} from Project {project_id}", file=log_file)
                    else:
                        original_pdfs.append(pdf_name)
                        print(f"Accessing Original PDF: {pdf_name} from Project {project_id}", file=log_file)

                    try:
                        has_table7 = check_has_table7(pdf_path)
                        if not has_table7:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 7)", file=log_file)
                            # Print to ipynb output
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 7)")
                            total_pdfs_skipped += 1
                            continue

                        if not is_add and not base_data_extracted:
                            # Extract base data from original PDF
                            base_data = extract_base_data(pdf_path, project_id, log_file)
                            base_data_extracted = True
                            print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)

                        if is_add and base_data_extracted:
                            # For addendums, use the extracted base data
                            table7_data = extract_table7(pdf_path, log_file, is_addendum=is_add)
                            if not table7_data.empty:
                                # Merge base data with Table 7 data: repeat the
                                # base row once per Table 7 row, then join side by side
                                merged_df = pd.concat([base_data] * len(table7_data), ignore_index=True)
                                merged_df = pd.concat([merged_df, table7_data], axis=1, sort=False)
                                core_addendums = pd.concat([core_addendums, merged_df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped Addendum PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                # Optionally, print to ipynb
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                        else:
                            # Originals, plus addendums seen before any original.
                            # extract_table7_and_replace_none calls
                            # extract_base_data itself, so for originals the
                            # base data is extracted a second time here.
                            df = extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=is_add)
                            if not df.empty:
                                if is_add:
                                    core_addendums = pd.concat([core_addendums, df], ignore_index=True)
                                else:
                                    core_originals = pd.concat([core_originals, df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                # Optionally, print to ipynb
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1

                    except Exception as e:
                        # One bad PDF must not stop the run: record it as skipped and move on.
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                        print(traceback.format_exc(), file=log_file)
                        # Optionally, print to ipynb
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                        total_pdfs_skipped += 1

            # After processing all PDFs for this project, check if any PDF was scraped
            if not project_scraped and os.path.exists(project_path):
                skipped_projects.add(project_id)


    # After processing all PDFs, save to CSV (the log file is closed by now)
    save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
    save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")

    # Calculate total projects processed (missing projects are not counted)
    total_projects_processed = len(scraped_projects) + len(skipped_projects)

    # Print summary to ipynb
    print("\n=== Scraping Summary ===")
    print(f"Total Projects Processed: {total_projects_processed}")
    print(f"Total Projects Scraped: {len(scraped_projects)}")
    print(f"Total Projects Skipped: {len(skipped_projects)}")
    print(f"Total Projects Missing: {len(missing_projects)}")
    print(f"Total PDFs Accessed: {total_pdfs_accessed}")
    print(f"Total PDFs Scraped: {total_pdfs_scraped}")
    print(f"Total PDFs Skipped: {total_pdfs_skipped}")

    print("\nList of Scraped Projects:")
    print(sorted(scraped_projects))

    print("\nList of Skipped Projects:")
    print(sorted(skipped_projects))

    print("\nList of Missing Projects:")
    print(sorted(missing_projects))

    print("\nList of Scraped PDFs:")
    print(scraped_pdfs)

    print("\nList of Skipped PDFs:")
    print(skipped_pdfs)

    print("\nList of Addendum PDFs:")
    print(addendum_pdfs)

    print("\nList of Original PDFs:")
    print(original_pdfs)

    # These counts match on file name only, so identical names in different
    # project folders are not told apart.
    print("\nNumber of Original PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in original_pdfs]))
    print("Number of Addendum PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs]))

def save_to_csv(df, output_csv_path, data_type):
    """Tidy a scraped table and write it to disk as CSV.

    Args:
        df (pd.DataFrame): Scraped rows to persist.
        output_csv_path (str): Destination of the CSV file.
        data_type (str): Label used in the progress messages.

    Nothing is written when ``df`` is empty. A failure while writing is
    printed together with its traceback and is not re-raised.
    """
    # Nothing collected for this category: say so and stop.
    if df.empty:
        print(f"No data to save for {data_type}.")
        return

    # Run every cell through clean_string_cell.
    cleaned = df.map(clean_string_cell)

    # Discard the rows that contains_phrase flags for "Type of Upgrade".
    flagged_rows = cleaned.apply(
        lambda row: contains_phrase(row, "Type of Upgrade"), axis=1
    )
    cleaned = cleaned[~flagged_rows]

    # Put the columns into the order defined by reorder_columns.
    cleaned = reorder_columns(cleaned)
    print(f"\nColumns reordered for {data_type} as per specification.")

    # Coerce q_id to numbers; unparseable values become NaN.
    if 'q_id' in cleaned.columns:
        cleaned['q_id'] = pd.to_numeric(cleaned['q_id'], errors='coerce')

    # Best-effort write: report problems instead of aborting the run.
    try:
        cleaned.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

def main():
    """Main function to execute the PDF scraping process.

    Delegates all of the work to ``process_pdfs_in_folder``.
    """
    process_pdfs_in_folder()

# Start the scrape only when this file is executed directly.
if __name__ == "__main__":
    main()


Scraped PDF: Appendix A - Q945 LECU.pdf from Project 945
Scraped PDF: 13AS837386-Q946_Northern_Orchard_Solar_C6Ph_I_Appendix_AIndividual_ReportRev1.pdf from Project 946

Columns reordered for originals as per specification.

Data successfully saved to /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 6/03_raw/rawdata_cluster6_style_G_945.csv
No data to save for addendums.

=== Scraping Summary ===
Total Projects Processed: 2
Total Projects Scraped: 2
Total Projects Skipped: 0
Total Projects Missing: 0
Total PDFs Accessed: 2
Total PDFs Scraped: 2
Total PDFs Skipped: 0

List of Scraped Projects:
[945, 946]

List of Skipped Projects:
[]

List of Missing Projects:
[]

List of Scraped PDFs:
['Appendix A - Q945 LECU.pdf', '13AS837386-Q946_Northern_Orchard_Solar_C6Ph_I_Appendix_AIndividual_ReportRev1.pdf']

List of Skipped PDFs:
[]

List of Addendum PDFs:
[]

List of Original PDFs:
['Appendix A - Q945 LECU.pdf', '13AS837386-Q946_Northern_Orchard_Solar_C6Ph_I_Appendix_

# Creating Itemized and total datasets

# Originals

In [8]:
import pandas as pd
import re
import unicodedata

# Read the raw scraped originals; the construction-time column stays as text.
df = pd.read_csv(
    '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/03_raw/rawdata_cluster6_style_G_originals.csv',
    dtype={'estimated_time_to_construct': str},
)

# Identifier columns become nullable integers.
df = df.astype({'q_id': 'Int64', 'cluster': 'Int64'})

def merge_columns(df):
    """Coalesce differently spelled source columns into canonical columns.

    For every canonical name the first non-null value of each row is taken,
    looking at the canonical column itself (when it already exists) and then
    at its known aliases in the order listed below. Alias columns are dropped
    afterwards. Columns whose header is missing, blank or starts with
    ``Unnamed`` are treated as additional aliases of ``description``.

    Args:
        df (pd.DataFrame): Table to normalise. It is modified in place.

    Returns:
        pd.DataFrame: The same DataFrame object with the merged columns.
    """
    merge_columns_dict = {
        "escalated_cost_x_1000": [
            "escalated costs x 1000",
            "estimated cost x 1000 escalated",
            "estimated cost x 1000 escalated without itcca"
        ],
        "estimated_cost_x_1000": [
            "estimated cost x 1000",
            "allocated_cost",
        ],
        "estimated_time_to_construct": [
            "estimated time to construct",
            "estimated time  to construct"
        ],
        "total_estimated_cost_x_1000_escalated": [
            "total estimated cost x 1000 escalalted",
            "total estimated cost x 1000 escalated"
        ],
        "total_estimated_cost_x_1000": [
            "total nu cost"
        ],
        "adnu_cost_rate_x_1000": [
            "adnu cost rate x 1000",
            "cost rate x 1000",
            "cost rate"
        ],
        "description": ["description"],
        "capacity": [
            "capacity",
            "project size",
            "project mw"
        ],
        "cost_allocation_factor": [
            "cost allocation factor",
            "cost allocatio n factor"
        ],
    }

    # Header-less columns are folded into "description". str() keeps the scan
    # from failing on labels that are not strings.
    unnamed_columns = [
        col for col in df.columns
        if pd.isna(col) or str(col).strip() == "" or str(col).startswith("Unnamed")
    ]
    merge_columns_dict["description"].extend(unnamed_columns)

    for new_col, old_cols in merge_columns_dict.items():
        # The canonical column goes first so that values it already holds win
        # instead of being overwritten by the aliases.
        candidates = [new_col] + [col for col in old_cols if col != new_col]
        existing_cols = [col for col in candidates if col in df.columns]
        if not existing_cols or existing_cols == [new_col]:
            continue  # nothing to merge for this canonical name

        # Row-wise back-fill moves the first non-null value into column 0.
        df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
        cols_to_drop = [col for col in existing_cols if col != new_col]
        if cols_to_drop:
            df.drop(columns=cols_to_drop, inplace=True)

    return df

# Collapse the alias columns into their canonical names before any further processing.
df = merge_columns(df)
#df.drop('incremental mw', axis=1, inplace=True)

def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Columns named in the preferred order come first (those that are absent
    are skipped); every other column follows in its current relative order.
    The cost allocation factor is accepted under both its raw header
    ("cost allocation factor") and its merged name ("cost_allocation_factor").

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type of upgrade",
        "upgrade",
        "description",
        "cost allocation factor",
        # Name produced by merge_columns; without it the column would fall
        # through to the trailing "remaining" group.
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Preferred columns that are actually present, in preferred order.
    existing_desired = [col for col in desired_order if col in df.columns]

    # Everything else keeps its current relative order.
    placed = set(existing_desired)
    remaining = [col for col in df.columns if col not in placed]

    return df[existing_desired + remaining]

# Apply the preferred column order to the merged table.
df = reorder_columns(df)
def convert_to_snake_case(column_name):
    """Return ``column_name`` as a lower-case, underscore-separated identifier.

    Surrounding whitespace is trimmed, runs of whitespace or hyphens become a
    single underscore, and any remaining non-word character is removed.
    """
    lowered = column_name.strip().lower()
    underscored = re.sub(r'[\s\-]+', '_', lowered)
    return re.sub(r'[^\w]', '', underscored)

def clean_string_cell(value):
    """Normalise a single cell; non-string values are returned untouched.

    Strings are decomposed (NFKD) and stripped of every non-ASCII character,
    newlines are turned into spaces, and surrounding whitespace is trimmed.
    """
    if not isinstance(value, str):
        return value
    decomposed = unicodedata.normalize('NFKD', value)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    return ascii_only.replace('\n', ' ').strip()

# Clean every cell, then switch all headers to snake_case.
df = df.map(clean_string_cell)
df = df.rename(columns=convert_to_snake_case)

# The steps below rely on these columns; create empty ones when absent.
required_columns = ['type_of_upgrade', 'cost_allocation_factor']
for col in required_columns:
    if col in df.columns:
        continue
    df[col] = None


'''
# ## ADNU GROUPING LOGIC START
# If type_of_upgrade == 'ADNU' and multiple rows have the same 'upgrade' with no estimated_cost_x_1000,
# we group them into a single row by keeping the first and dropping the rest.
if 'type_of_upgrade' in df.columns and 'upgrade' in df.columns and 'estimated_cost_x_1000' in df.columns:
    to_drop = []
    for (qid, toup, upg), grp in df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], sort=False):
        if isinstance(toup, str) and toup.lower() == 'adnu':
            # Check if all estimated_cost_x_1000 are empty/None/'None'
            is_empty_est = grp['estimated_cost_x_1000'].apply(lambda x: pd.isna(x) or x == '' or x == 'None')
            if len(grp) > 1 and is_empty_est.all():
                # Keep the first row, drop the rest
                idxs = grp.index.tolist()
                drop_idxs = idxs[1:]
                to_drop.extend(drop_idxs)
    if to_drop:
        df.drop(to_drop, inplace=True)
        df.reset_index(drop=True, inplace=True)
# ## ADNU GROUPING LOGIC END
'''

# Step 1: Create the 'item' column
# A row is a summary row ('no') when the word "Total" appears in either
# type_of_upgrade or cost_allocation_factor; every other row is a line item ('yes').
df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)

# Step 2: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    # Take 'item' out first, then look up 'type_of_upgrade' in the shortened
    # list: the insert position is then correct whether 'item' originally sat
    # before or after 'type_of_upgrade'.
    cols.remove('item')
    cols.insert(cols.index('type_of_upgrade') + 1, 'item')
    df = df[cols]

# Step 3: Remove "Total" values from cost_allocation_factor if they appear in type_of_upgrade
# (rows whose type_of_upgrade contains "Total" get an empty cost_allocation_factor)
if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if (
            pd.notna(row['type_of_upgrade']) and 'Total' in str(row['type_of_upgrade'])
        ) else row.get('cost_allocation_factor'),
        axis=1
    )

# Step 4: For each q_id and type_of_upgrade, if only one row and no total present, create new Total row
# new_rows collects (insert position, row) pairs; the frame itself is only
# changed in the insertion loop further down.
new_rows = []
for q_id, group in df.groupby('q_id'):
    unique_upgrades = group['type_of_upgrade'].dropna().unique()

    # Projects that still carry a "Total" marker in cost_allocation_factor are skipped entirely.
    if any('Total' in str(x) for x in group.get('cost_allocation_factor', [])):
        continue

    for upgrade in unique_upgrades:
        if pd.isna(upgrade) or 'Total' in str(upgrade):
            continue

        rows = group[group['type_of_upgrade'] == upgrade]
        if len(rows) == 1:
            # The synthetic Total row is a copy of the single line item,
            # relabelled as a non-item 'Total' row.
            original_row = rows.iloc[0].copy()
            total_row = original_row.copy()

            total_row['type_of_upgrade'] = 'Total'
            total_row['item'] = 'no'

            # Queue the copy for insertion directly after its source row.
            original_index = df[(df['q_id'] == q_id) & (df['type_of_upgrade'] == upgrade)].index[0]
            new_rows.append((original_index + 1, total_row))

# Insert from the highest position downwards so earlier positions stay valid.
# NOTE(review): idx is derived from an index label but used positionally via
# iloc - this assumes df still has a default 0..n-1 index; confirm.
for idx, row in sorted(new_rows, reverse=True):
    if idx < 0 or idx > len(df):
        continue
    df = pd.concat([df.iloc[:idx], pd.DataFrame([row]), df.iloc[idx:]]).reset_index(drop=True)

# Final pass: blank any cost_allocation_factor value that still contains "Total".
if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if 'Total' in str(row.get('cost_allocation_factor', '')) else row.get('cost_allocation_factor'),
        axis=1
    )

def clean_estimated_time(value):
    """Reduce a construction-time string to its leading numeric part.

    For strings, the first number (optionally followed by hyphenated parts,
    e.g. ``18-24``) is kept and the words after it are dropped; the result is
    stripped. Strings without such a pattern are only stripped of nothing and
    returned as they are. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    shortened = re.sub(r'(\d+(?:-\w+)*)\s+\w+.*$', r'\1', value, flags=re.IGNORECASE)
    return shortened.strip()

# Keep only the numeric part of the construction time (see clean_estimated_time).
if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)

# Strip footnote markers such as "(Note 3)" from the upgrade type.
if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

# Long upgrade-type names -> short codes. Lookups are exact and case-sensitive.
# The lower-case "upgrade" keys exist because the plural "Upgrades" is rewritten
# to lower-case "upgrade" just before the lookup.
# NOTE(review): only the RNU names have such a lower-case key; a plural
# "Local/Area Delivery Network Upgrades" would presumably stay unmapped - confirm.
mappings = {
    'Reliability Network Upgrade': 'RNU',
    'Reliability Network upgrade': 'RNU',
    'Local Delivery Network Upgrade': 'LDNU',
    'Local Delivery Network': 'LDNU',
    "PTOs Interconnection Facilities": 'PTO_IF',
    'Area Delivery Network Upgrade': 'ADNU',
    'Reliability Network upgrade to Physically Interconnect': 'RNU',
    'PTO': 'PTO_IF',
}

if 'type_of_upgrade' in df.columns:
    # Singularise "upgrades" (any case) to "upgrade".
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\bupgrades\b', 'upgrade', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )
    # Replace known names by their code; unknown names pass through unchanged.
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )
    # Rows without a type inherit the type of the row above.
    df['type_of_upgrade'] = df['type_of_upgrade'].ffill()

# Within one project and upgrade type, empty 'upgrade' cells inherit the value above.
if 'upgrade' in df.columns and 'type_of_upgrade' in df.columns and 'q_id' in df.columns:
    df['upgrade'] = df.groupby(['q_id', 'type_of_upgrade'])['upgrade'].ffill()

# Relabel rows typed exactly 'Total' with the most recent non-Total type above them.
# df.at[i, ...] addresses rows by label, so this relies on labels 0..len(df)-1.
if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

# Columns whose missing values (and '-' placeholders) are replaced by 0 below.
numeric_columns = [
    'cost_allocation_factor',
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
    'total_estimated_cost_x_1000_escalated',
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
    'estimated_cost_x_1000_escalated_without_itcca',
    'adnu_cost_rate_x_1000_escalated'
]
# Text columns whose missing values are replaced by the string 'None' below.
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        # A lone '-' is treated as missing, then every missing value becomes 0.
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

# Sort by q_id while keeping the current row order inside each project.
# NOTE(review): this only runs when df already has an 'original_order' column;
# nothing visible in this cell creates one, so the sort is presumably skipped - confirm.
if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])

# Split the table into line items and summary rows and write each to its own CSV.
if 'item' in df.columns:
    itemized_path = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/02_intermediate/costs_phase_1_cluster_6_style_G_itemized.csv'
    totals_path = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/02_intermediate/costs_phase_1_cluster_6_style_G_total.csv'

    # Line items keep every column.
    itemized_df = df[df['item'] == 'yes']
    itemized_df.to_csv(itemized_path, index=False)

    # Summary rows drop the columns that only describe individual items.
    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    existing_totals_columns = [col for col in totals_columns if col in df.columns]
    totals_df = df[df['item'] == 'no'].drop(columns=existing_totals_columns, errors='ignore')
    totals_df.to_csv(totals_path, index=False)

    # Report the files that were actually written, and only when they were written.
    print(f"Itemized rows saved to '{itemized_path}'.")
    print(f"Filtered Total rows saved to '{totals_path}'.")

# Quick sanity check: distinct upgrade types, project ids and clusters.
for col in ('type_of_upgrade', 'q_id', 'cluster'):
    if col in df.columns:
        print(df[col].unique())


Itemized rows saved to 'costs_phase_1_cluster_8_itemized.csv'.
Filtered Total rows saved to 'costs_phase_1_cluster_8_total.csv'.
['PTO_IF' 'RNU' 'LDNU' 'ADNU']
[ 943  945  946  947  951  954  955  956  962  963  964  966  972  974
  975  976  987  988  989  992  997 1000 1001]
[6]


# Checking scraped data

In [9]:
import pandas as pd
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ---------------------- Configuration ---------------------- #

# Paths to the CSV files
# (inputs: the itemized and total files written by the itemized/total split step)
ITEMIZED_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6//02_intermediate/costs_phase_1_cluster_6_style_G_itemized.csv'
TOTALS_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6//02_intermediate/costs_phase_1_cluster_6_style_G_total.csv'

# Columns in totals_df that hold the reported total costs
TOTALS_ESTIMATED_COLUMN = 'estimated_cost_x_1000'
TOTALS_ESCALATED_COLUMN = 'escalated_cost_x_1000'

# Upgrade types to check
# (written in upper case because clean_text upper-cases type_of_upgrade before the check)
REQUIRED_UPGRADES = ['PTO_IF', 'RNU', 'LDNU', 'ADNU']

# Output paths
MISMATCHES_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/mismatches.csv'
MATCHED_QIDS_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 6/matched_qids.csv'

# ---------------------- Load Data ---------------------- #

def load_csv(path, dataset_name):
    """
    Loads a CSV file into a pandas DataFrame.

    Args:
        path (str): Location of the CSV file.
        dataset_name (str): Human-readable name used in the messages.

    Returns:
        pd.DataFrame: The parsed file.

    Raises:
        SystemExit: With exit code 1 when the file is missing or cannot be
            read; an explanatory message is printed first.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"Error: File not found at {path}")
        # SystemExit is raised directly rather than through exit(): exit is an
        # interactive helper and is not guaranteed to stop execution here.
        raise SystemExit(1)
    except Exception as e:
        print(f"Error loading {dataset_name}: {e}")
        raise SystemExit(1)

    print(f"Loaded {dataset_name} from {path}")
    return df

# Load datasets
# (itemized rows are summed below and compared against the reported totals)
itemized_df = load_csv(ITEMIZED_CSV_PATH, "itemized data")
totals_df = load_csv(TOTALS_CSV_PATH, "totals data")

# ---------------------- Data Cleaning ---------------------- #

def clean_text(df, column):
    """
    Normalises one text column in place and returns the same DataFrame.

    An existing column is cast to string, trimmed and upper-cased. A missing
    column is reported with a warning and created with the value 'UNKNOWN'.
    """
    if column not in df.columns:
        print(f"Warning: '{column}' column is missing in the dataset. Filling with 'UNKNOWN'.")
        df[column] = 'UNKNOWN'
        return df

    as_text = df[column].astype(str)
    df[column] = as_text.str.strip().str.upper()
    return df

# Clean 'type_of_upgrade' and 'point_of_interconnection' in both datasets
itemized_df = clean_text(itemized_df, 'type_of_upgrade')
itemized_df = clean_text(itemized_df, 'point_of_interconnection')

totals_df = clean_text(totals_df, 'type_of_upgrade')
totals_df = clean_text(totals_df, 'point_of_interconnection')

# ---------------------- Data Preparation ---------------------- #

# Ensure necessary columns exist in itemized_df
# (missing key columns are filled with 'UNKNOWN', missing cost columns with 0)
required_itemized_columns = ['q_id', 'type_of_upgrade', 'point_of_interconnection', 'estimated_cost_x_1000', 'escalated_cost_x_1000']
for col in required_itemized_columns:
    if col not in itemized_df.columns:
        print(f"Warning: '{col}' column is missing in the itemized dataset.")
        if col in ['q_id', 'type_of_upgrade', 'point_of_interconnection']:
            itemized_df[col] = 'UNKNOWN'
        else:
            itemized_df[col] = 0

# Ensure necessary columns exist in totals_df
# (the totals cannot be reconstructed, so a missing column aborts the check)
required_totals_columns = ['q_id', 'type_of_upgrade', 'point_of_interconnection', TOTALS_ESTIMATED_COLUMN, TOTALS_ESCALATED_COLUMN]
for col in required_totals_columns:
    if col not in totals_df.columns:
        print(f"Error: '{col}' column is missing in the totals dataset. Cannot proceed.")
        # SystemExit is raised directly rather than through exit(): exit is an
        # interactive helper and is not guaranteed to stop execution here.
        raise SystemExit(1)

# Convert cost columns to numeric, coercing errors to NaN and filling with 0
cost_columns_itemized = ['estimated_cost_x_1000', 'escalated_cost_x_1000']
for col in cost_columns_itemized:
    itemized_df[col] = pd.to_numeric(itemized_df[col], errors='coerce').fillna(0)

cost_columns_totals = [TOTALS_ESTIMATED_COLUMN, TOTALS_ESCALATED_COLUMN]
for col in cost_columns_totals:
    totals_df[col] = pd.to_numeric(totals_df[col], errors='coerce').fillna(0)

# ---------------------- Calculate Manual Totals ---------------------- #

# Group itemized data by q_id and type_of_upgrade and calculate sums
itemized_grouped = itemized_df.groupby(['q_id', 'type_of_upgrade']).agg({
    'estimated_cost_x_1000': 'sum',
    'escalated_cost_x_1000': 'sum'
}).reset_index()

# Apply preference: Use estimated_cost_x_1000 if sum > 0, else use escalated_cost_x_1000
# Series.where keeps the estimated sum where the condition holds and takes the
# escalated sum elsewhere. Unlike a row-wise apply it still yields a Series
# when the frame has no rows, so the assignment also works for empty input.
itemized_grouped['manual_total'] = itemized_grouped['estimated_cost_x_1000'].where(
    itemized_grouped['estimated_cost_x_1000'] > 0,
    itemized_grouped['escalated_cost_x_1000'],
)

# ---------------------- Prepare Totals Data ---------------------- #

# Group totals data by q_id and type_of_upgrade and calculate sums
totals_grouped = totals_df.groupby(['q_id', 'type_of_upgrade']).agg({
    TOTALS_ESTIMATED_COLUMN: 'sum',
    TOTALS_ESCALATED_COLUMN: 'sum'
}).reset_index()

# Apply preference: Use estimated_cost_x_1000 if sum > 0, else use escalated_cost_x_1000
totals_grouped['reported_total'] = totals_grouped[TOTALS_ESTIMATED_COLUMN].where(
    totals_grouped[TOTALS_ESTIMATED_COLUMN] > 0,
    totals_grouped[TOTALS_ESCALATED_COLUMN],
)

# ---------------------- Merge Data ---------------------- #

# Merge the itemized and totals data on q_id and type_of_upgrade
# (left join: every itemized group is kept; groups without a reported total
# get NaN in 'reported_total')
comparison_df = pd.merge(
    itemized_grouped,
    totals_grouped[['q_id', 'type_of_upgrade', 'reported_total']],
    on=['q_id', 'type_of_upgrade'],
    how='left'
)

# ---------------------- Check for Missing Upgrades ---------------------- #

# For every project, list the required upgrade types that have no itemized rows.
missing_upgrades_report = []
for q_id in comparison_df['q_id'].unique():
    same_project = comparison_df['q_id'] == q_id
    upgrades_present = set(comparison_df.loc[same_project, 'type_of_upgrade'].unique())
    missing_upgrades = [
        upgrade for upgrade in REQUIRED_UPGRADES if upgrade not in upgrades_present
    ]
    if missing_upgrades:
        missing_upgrades_report.append((q_id, missing_upgrades))

# Print either the per-project gaps or a single all-clear line.
if not missing_upgrades_report:
    print("\nAll q_ids have all required upgrades.")
else:
    print("\nQ_ids with missing upgrades:")
    for q_id, missing in missing_upgrades_report:
        print(f"  Q_id {q_id} is missing upgrades: {', '.join(missing)}")

# ---------------------- Compare Totals and Identify Mismatches ---------------------- #

# Initialize list to store mismatches
# (one dict per mismatching q_id / upgrade type pair)
mismatches = []

# Iterate through each row to compare manual_total with reported_total
# The branches are evaluated in order; the first one that applies decides the outcome.
for index, row in comparison_df.iterrows():
    q_id = row['q_id']
    upgrade = row['type_of_upgrade']
    manual_total = row['manual_total']
    reported_total = row['reported_total']
    
    # Determine if both manual_total and reported_total are zero
    if manual_total == 0.0 and reported_total == 0.0:
        continue  # No mismatch
    # Determine if manual_total is zero and reported_total is missing or zero
    elif manual_total == 0.0 and (pd.isna(row['reported_total']) or reported_total == 0.0):
        continue  # No mismatch
    # If reported_total is missing (NaN) and manual_total is not zero
    # (recorded with the text 'Missing' in place of a number)
    elif pd.isna(row['reported_total']) and manual_total != 0.0:
        print(f"Mismatch: Q_id {q_id}, Upgrade '{upgrade}' - Manual Total: {manual_total}, Reported Total: Missing")
        mismatches.append({
            'q_id': q_id,
            'type_of_upgrade': upgrade,
            'manual_total': manual_total,
            'reported_total': 'Missing'
        })
    # If manual_total is not zero and reported_total is zero
    # (no tolerance is applied in this case)
    elif manual_total != 0.0 and reported_total == 0.0:
        print(f"Mismatch: Q_id {q_id}, Upgrade '{upgrade}' - Manual Total: {manual_total}, Reported Total: 0.0")
        mismatches.append({
            'q_id': q_id,
            'type_of_upgrade': upgrade,
            'manual_total': manual_total,
            'reported_total': reported_total
        })
    # If both totals are non-zero but differ beyond tolerance
    # (absolute tolerance of 0.01; this branch is also reached when
    # manual_total is zero and reported_total is not)
    elif abs(manual_total - reported_total) > 1e-2:
        print(f"Mismatch: Q_id {q_id}, Upgrade '{upgrade}' - Manual Total: {manual_total}, Reported Total: {reported_total}")
        mismatches.append({
            'q_id': q_id,
            'type_of_upgrade': upgrade,
            'manual_total': manual_total,
            'reported_total': reported_total
        })
    # Else, totals match; do nothing

# Create a DataFrame for mismatches
# (explicit columns keep the header stable even when no mismatch was found)
mismatches_df = pd.DataFrame(mismatches, columns=['q_id', 'type_of_upgrade', 'manual_total', 'reported_total'])

# Save mismatches to a CSV file
# (a write failure is reported but does not stop the remaining checks)
try:
    mismatches_df.to_csv(MISMATCHES_CSV_PATH, index=False)
    print(f"\nMismatches saved to '{MISMATCHES_CSV_PATH}'.")
except Exception as e:
    print(f"Error saving mismatches CSV: {e}")

# ---------------------- Point of Interconnection Matching ---------------------- #

# Extract unique q_id and point_of_interconnection from itemized dataset
itemized_poi = itemized_df[['q_id', 'point_of_interconnection']].drop_duplicates()

# Extract unique q_id and point_of_interconnection from totals dataset
totals_poi = totals_df[['q_id', 'point_of_interconnection']].drop_duplicates()

# Merge both to have a complete list of q_id and point_of_interconnection
# (the two lists are stacked row-wise and de-duplicated again)
all_poi = pd.concat([itemized_poi, totals_poi]).drop_duplicates().reset_index(drop=True)

# ---------------------- Direct Match Identification ---------------------- #

# Group by point_of_interconnection to find q_ids sharing the same point_of_interconnection
# NOTE(review): clean_text casts the column with astype(str), so a missing
# point_of_interconnection presumably arrives here as the text 'NAN' and
# would be grouped like a real shared name - confirm.
direct_matches = all_poi.groupby('point_of_interconnection')['q_id'].apply(list).reset_index()

# Filter groups with more than one q_id (i.e., shared points_of_interconnection)
direct_matches = direct_matches[direct_matches['q_id'].apply(len) > 1]

print("\nDirect Matches (Exact Point of Interconnection Names):")
if not direct_matches.empty:
    print(direct_matches)
else:
    print("No direct matches found.")

# ---------------------- Fuzzy Match Identification ---------------------- #

# Prepare list of points_of_interconnection for fuzzy matching
poi_list = all_poi['point_of_interconnection'].unique().tolist()

# Initialize list to store fuzzy matches
fuzzy_matches = []

# Iterate through each point_of_interconnection to find similar ones
for i, poi in enumerate(poi_list):
    # Compare with the rest of the points to avoid redundant comparisons.
    # limit=None returns a score for every candidate; the default limit of 5
    # would silently drop matches above the threshold beyond the best five.
    similar_pois = process.extract(
        poi, poi_list[i+1:], scorer=fuzz.token_set_ratio, limit=None
    )

    # Filter matches with similarity >= 80%
    for match_poi, score in similar_pois:
        if score >= 80:
            # Retrieve q_ids for both points_of_interconnection
            qids_poi1 = all_poi[all_poi['point_of_interconnection'] == poi]['q_id'].tolist()
            qids_poi2 = all_poi[all_poi['point_of_interconnection'] == match_poi]['q_id'].tolist()

            # Append the matched pairs with their points_of_interconnection and similarity score
            fuzzy_matches.append({
                'point_of_interconnection_1': poi,
                'q_ids_1': qids_poi1,
                'point_of_interconnection_2': match_poi,
                'q_ids_2': qids_poi2,
                'similarity_score': score
            })

# Convert fuzzy matches to DataFrame
fuzzy_matches_df = pd.DataFrame(fuzzy_matches)

print("\nFuzzy Matches (>=80% Similarity in Point of Interconnection):")
if not fuzzy_matches_df.empty:
    print(fuzzy_matches_df)
else:
    print("No fuzzy matches found.")

# ---------------------- Save Matched Q_ids to CSV ---------------------- #

# Direct matches: one record per unordered pair of q_ids that share a
# point_of_interconnection.
direct_matches_expanded = [
    {
        'match_type': 'Direct',
        'point_of_interconnection_1': shared_poi,
        'q_id_1': shared_qids[first],
        'point_of_interconnection_2': shared_poi,
        'q_id_2': shared_qids[second],
        'similarity_score': 100
    }
    for shared_poi, shared_qids in zip(
        direct_matches['point_of_interconnection'], direct_matches['q_id']
    )
    for first in range(len(shared_qids))
    for second in range(first + 1, len(shared_qids))
]

# Fuzzy matches: already stored pair-wise; the q_id fields hold the q_id lists.
fuzzy_matches_expanded = [
    {
        'match_type': 'Fuzzy',
        'point_of_interconnection_1': match['point_of_interconnection_1'],
        'q_id_1': match['q_ids_1'],
        'point_of_interconnection_2': match['point_of_interconnection_2'],
        'q_id_2': match['q_ids_2'],
        'similarity_score': match['similarity_score']
    }
    for _, match in fuzzy_matches_df.iterrows()
]

# Both kinds of match end up in one table, direct matches first.
matched_qids_df = pd.DataFrame(direct_matches_expanded + fuzzy_matches_expanded)

# Best-effort write of the combined table.
try:
    matched_qids_df.to_csv(MATCHED_QIDS_CSV_PATH, index=False)
    print(f"Matched Q_ids saved to '{MATCHED_QIDS_CSV_PATH}'.")
except Exception as e:
    print(f"Error saving matched Q_ids CSV: {e}")

# ---------------------- Summary ---------------------- #

# Number of (q_id, upgrade type) comparisons and how many of them disagreed.
total_checked = len(comparison_df)
total_mismatches = len(mismatches_df)
print(f"\nTotal checks performed: {total_checked}")
print(f"Total mismatches found: {total_mismatches}")


Loaded itemized data from /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 6//02_intermediate/costs_phase_1_cluster_6_style_G_itemized.csv
Loaded totals data from /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 6//02_intermediate/costs_phase_1_cluster_6_style_G_total.csv

All q_ids have all required upgrades.

Mismatches saved to '/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 6/mismatches.csv'.

Direct Matches (Exact Point of Interconnection Names):
No direct matches found.

Fuzzy Matches (>=80% Similarity in Point of Interconnection):
             point_of_interconnection_1 q_ids_1  \
0               LERDO 115 KV SUBSTATION   [943]   
1  MIDWAY  WHEELER RIDGE 230 KV LINE #2   [946]   
2                 VASCO 60KV SUBSTATION   [951]   
3                 VASCO 60KV SUBSTATION   [951]   
4                HALSEY 60KV SUBSTATION   [962]   
5              MIDWAY-MIDSUN 115KV LINE   [966]   

         point_of_