# In [1]:
import os
import pdfplumber
import pandas as pd
import re
import PyPDF2
import traceback
import inflect

# Filesystem layout: raw study PDFs live under the "raw" root, scraper outputs
# (CSVs and the log) under the Cluster 5 "03_raw" folder.
_IC_STUDIES_RAW_ROOT = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw"
_CLUSTER5_OUTPUT_DIR = (
    f"{_IC_STUDIES_RAW_ROOT}/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw"
)

BASE_DIRECTORY = f"{_IC_STUDIES_RAW_ROOT}/03_data"
OUTPUT_CSV_PATH_ORIGINAL = f"{_CLUSTER5_OUTPUT_DIR}/rawdata_cluster5_style_E_originals.csv"
OUTPUT_CSV_PATH_ADDENDUM = f"{_CLUSTER5_OUTPUT_DIR}/rawdata_cluster5_style_E_addendums.csv"
LOG_FILE_PATH = f"{_CLUSTER5_OUTPUT_DIR}/scraping_cluster5_style_E_log.txt"

# q_ids to scan (end-exclusive): 869 through 942.
PROJECT_RANGE = range(869, 943)

# Accumulators for the scraped rows, one per document kind.
core_originals = pd.DataFrame()
core_addendums = pd.DataFrame()

# Per-project bookkeeping.
scraped_projects = set()
skipped_projects = set()
missing_projects = set()

# Per-PDF bookkeeping (each list is a distinct object).
scraped_pdfs = []
skipped_pdfs = []
addendum_pdfs = []
original_pdfs = []

# Run counters.
total_pdfs_accessed = total_pdfs_scraped = total_pdfs_skipped = 0

def clean_column_headers(headers):
    """Return normalized copies of raw table headers.

    String headers are lowercased, stripped of parenthesised text and of
    anything that is not a letter, digit or whitespace, and each word is
    reduced to its singular form. ``None`` becomes ``""``; any other
    non-string value is passed through unchanged.
    """
    engine = inflect.engine()

    def _normalize(text):
        text = re.sub(r'\s+', ' ', text.lower())
        text = re.sub(r'\(.*?\)', '', text)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text).strip()
        # Repair "type of upgrade" when the last word was split in two.
        text = re.sub(r'\btype of upgr\s*ade\b', 'type of upgrade', text)
        # singular_noun() yields a falsy value for words it does not treat as
        # plural, in which case the word is kept as-is.
        return " ".join(engine.singular_noun(token) or token for token in text.split())

    normalized = []
    for raw in headers:
        if raw is None:
            normalized.append("")
        elif isinstance(raw, str):
            normalized.append(_normalize(raw))
        else:
            normalized.append(raw)
    return normalized

def clean_string_cell(value):
    """Flatten newlines to spaces and trim a string cell; return non-strings untouched."""
    if not isinstance(value, str):
        return value
    flattened = value.replace('\n', ' ')
    return flattened.strip()

def contains_phrase(row, phrase):
    """Tell whether any cell of *row* contains *phrase*, ignoring case.

    Every whitespace run in *phrase* is turned into ``\\s*``, so the words may
    be separated by any amount of whitespace (including none) in the cell.
    Cells are compared by their string form.
    """
    flexible = re.compile(re.sub(r"\s+", r"\\s*", phrase), flags=re.IGNORECASE)

    def _cell_matches(cell):
        return bool(flexible.search(cell))

    as_text = row.astype(str)
    return as_text.apply(_cell_matches).any()

def extract_specific_phrase(title):
    """
    Map a table title to the first known upgrade keyword it contains.

    Keywords are tried in the order listed, matched as whole words and
    case-insensitively; the keyword is returned exactly as spelled here.
    When no keyword occurs in the title, the title itself is returned.
    """
    keywords = (
        "PTO",
        "Reliability Network Upgrade",
        "Area Delivery Network Upgrade",
        "Local Delivery Network",
        "ADNU",
        "LDNU",
        "RNU",
    )
    hits = (
        keyword
        for keyword in keywords
        if re.search(rf"\b{re.escape(keyword)}\b", title, re.IGNORECASE)
    )
    return next(hits, title)

def reorder_columns(df):
    """
    Return *df* with the well-known columns moved to the front.

    Columns named in the priority list come first, in that order (those
    absent from *df* are ignored); every other column follows in its
    existing relative order.
    """
    priority = (
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type of upgrade",
        "upgrade",
        "description",
        "cost allocation factor",
    )
    leading = [name for name in priority if name in df.columns]
    trailing = [name for name in df.columns if name not in leading]
    return df[leading + trailing]

def _apply_hemisphere(value, letter, negative_letter):
    """Prefix *value* with '-' when *letter* is the negative hemisphere and it is not already signed."""
    if letter.upper() == negative_letter and not value.startswith("-"):
        return f"-{value}"
    return value


def search_gps_coordinates(text, log_file):
    """Search *text* for GPS coordinates using several patterns, in order.

    Patterns tried:
      1. ``gps coordinates: <lat>, <lon>``
      2. ``latitude <lat> ... longitude <lon>``
      3. ``gps coordinates: <lat> N|S, <lon> E|W`` (S and W become negative)

    Args:
        text: Text to search.
        log_file: Writable text stream that receives the outcome message.

    Returns:
        ``(latitude, longitude)`` as strings exactly as they appear in the
        text (with a leading '-' added for S/W in the directional form), or
        ``(None, None)`` when nothing matches.
    """
    gps_coords = re.search(r"gps coordinates:\s*([\d\.\-]+),\s*([\d\.\-]+)", text, re.IGNORECASE)
    if gps_coords:
        print(f"Found GPS coordinates: {gps_coords.groups()}", file=log_file)
        return gps_coords.groups()

    project_coords = re.search(r"latitude[:\s]*([\d\.\-]+)[^\d]+longitude[:\s]*([\d\.\-]+)", text, re.IGNORECASE)
    if project_coords:
        print(f"Found project coordinates: {project_coords.groups()}", file=log_file)
        return project_coords.groups()

    # The hemisphere letters are captured from the match itself. Testing for
    # "N"/"E" anywhere in the text is always true (the word "coordinates"
    # contains both), which would leave S/W values with the wrong sign.
    gps_coords_directional = re.search(
        r"gps coordinates:\s*([\d\.\-]+)\s*([NS]),\s*([\d\.\-]+)\s*([EW])", text, re.IGNORECASE)
    if gps_coords_directional:
        lat, lat_dir, lon, lon_dir = gps_coords_directional.groups()
        latitude = _apply_hemisphere(lat, lat_dir, "S")
        longitude = _apply_hemisphere(lon, lon_dir, "W")
        print(f"Found directional GPS coordinates: {(latitude, longitude)}", file=log_file)
        return (latitude, longitude)

    print("GPS coordinates not found.", file=log_file)
    return (None, None)

def adjust_rows_length(data_rows, headers):
    """
    Force every row of data_rows to the width of headers, modifying data_rows in place.

    Rows that are too long are replaced by a truncated copy; rows that are
    too short are extended with empty strings. Nothing is returned.
    """
    width = len(headers)
    for position, row in enumerate(data_rows):
        shortfall = width - len(row)
        if shortfall < 0:
            data_rows[position] = row[:width]
        elif shortfall > 0:
            row.extend([""] * shortfall)

def _find_poi_in_table(tab, poi_pattern, page_label, table_index, log_file):
    """Return the Point of Interconnection value found in one extracted table, or None.

    For each cell matching *poi_pattern*, the value is taken from the cell
    immediately to its right. If that cell is empty, the same column is
    scanned in up to two rows above and below the label row and the
    non-empty pieces are joined with spaces.
    """
    for row_index, row in enumerate(tab, start=1):
        for cell_index, cell in enumerate(row, start=1):
            if not (cell and poi_pattern.search(cell)):
                continue
            # cell_index is 1-based, so it doubles as the 0-based index of
            # the cell to the right of the label.
            value_col = cell_index
            if value_col >= len(row):
                print(f"\nPoint of Interconnection label found but no adjacent column (Page {page_label}, Table {table_index}, Row {row_index}).", file=log_file)
                continue
            poi_value = clean_string_cell(row[value_col])
            if poi_value:
                print(f"\nFound Point of Interconnection: '{poi_value}' (Page {page_label}, Table {table_index}, Row {row_index})", file=log_file)
                return poi_value
            print(f"\nPoint of Interconnection label found but adjacent value is empty (Page {page_label}, Table {table_index}, Row {row_index}).", file=log_file)
            label_row = row_index - 1
            start_scan = max(0, label_row - 2)
            end_scan = min(len(tab), label_row + 3)
            print(f"Scanning rows {start_scan + 1} to {end_scan} for POI value parts.", file=log_file)
            poi_value_parts = []
            for scan_row_index in range(start_scan, end_scan):
                if scan_row_index == label_row:
                    continue
                scan_row = tab[scan_row_index]
                if value_col >= len(scan_row):
                    continue
                scan_cell = clean_string_cell(scan_row[value_col])
                if not scan_cell:
                    # Empty or None cell: nothing to collect. None must not
                    # reach the regex, which would raise TypeError.
                    continue
                if poi_pattern.search(scan_cell):
                    print(f"Encountered another POI label in row {scan_row_index + 1}. Skipping this row.", file=log_file)
                    continue
                poi_value_parts.append(scan_cell)
                print(f"Found POI part in row {scan_row_index + 1}: '{scan_cell}'", file=log_file)
            if poi_value_parts:
                joined = " ".join(poi_value_parts)
                print(f"\nConcatenated Point of Interconnection: '{joined}' (Page {page_label}, Table {table_index})", file=log_file)
                return joined
            print(f"\nNo POI value found in the surrounding rows (Page {page_label}, Table {table_index}, Row {row_index}).", file=log_file)
    return None


def extract_table2(pdf_path, log_file):
    """
    Extracts the Point of Interconnection from Table B.1 in the provided PDF.
    (Note: This function searches for "Table B.1" rather than Table 2.)

    Pages from the first page mentioning "Table B.1" through one page past
    the last such page are scanned. On each page two pdfplumber table
    settings are tried in turn, and the first table that yields a value wins.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        log_file: Writable text stream that receives progress messages.

    Returns:
        The Point of Interconnection string, or None when Table B.1 is
        absent, no value can be recovered, or any error occurs (the error
        and its traceback are written to log_file).
    """
    print(f"\nProcessing {pdf_path} for Table B.1 extraction...", file=log_file)
    poi_pattern = re.compile(r"Point\s+of\s+Interconnection", re.IGNORECASE)
    table_settings_list = [
        {"horizontal_strategy": "text", "vertical_strategy": "lines", "snap_tolerance": 1},
        {"horizontal_strategy": "lines", "vertical_strategy": "lines", "snap_tolerance": 2},
    ]
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Look only for "Table B.1"
            table1_pages = [
                i for i, page in enumerate(pdf.pages)
                if re.search(r"Table\s*B\.1\b", page.extract_text() or "", re.IGNORECASE)
            ]
            if not table1_pages:
                print("No Table B.1 found in the PDF.", file=log_file)
                return None
            scrape_start = table1_pages[0]
            scrape_end = table1_pages[-1] + 1
            print(f"Table B.1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)
            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table B.1...", file=log_file)
                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"\nAttempt {attempt} with table settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1} with current settings.", file=log_file)
                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty. Skipping.", file=log_file)
                            continue
                        print(f"\n--- Table {table_index} on Page {page_number + 1} ---", file=log_file)
                        for row_num, row in enumerate(tab, start=1):
                            print(f"Row {row_num}: {row}", file=log_file)
                        point_of_interconnection = _find_poi_in_table(
                            tab, poi_pattern, page_number + 1, table_index, log_file
                        )
                        if point_of_interconnection:
                            return point_of_interconnection
    except Exception as e:
        print(f"Error processing Table B.1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None
    # NOTE: a label whose value could not be recovered is reported the same
    # way as a missing label (None); no separate sentinel value is returned.
    print("Point of Interconnection not found in Table B.1.", file=log_file)
    return None

def fix_column_names(columns):
    """
    Renames duplicate and empty column names so that every result is unique.

    Empty, None or whitespace-only names are replaced with unnamed_1,
    unnamed_2, etc. Repeated names are suffixed with _1, _2, etc.; a suffix
    number is skipped when the suffixed name is already taken, so a
    generated name never collides with another column (e.g.
    ["a", "a_1", "a"] -> ["a", "a_1", "a_2"]).

    Args:
        columns: Iterable of column names (non-string names are converted
            with str()).

    Returns:
        A new list of unique, stripped column names in the original order.
    """
    new_cols = []
    next_suffix = {}   # base name -> next suffix number to try
    used = set()       # every name emitted so far
    unnamed_count = 1
    for col in columns:
        text = "" if col is None else str(col).strip()
        # Treat empty or whitespace-only names as unnamed.
        if not col or text == "":
            base = f"unnamed_{unnamed_count}"
            unnamed_count += 1
        else:
            base = text
        if base in next_suffix or base in used:
            number = next_suffix.get(base, 1)
            candidate = f"{base}_{number}"
            while candidate in used:
                number += 1
                candidate = f"{base}_{number}"
            next_suffix[base] = number + 1
        else:
            next_suffix[base] = 1
            candidate = base
        used.add(candidate)
        new_cols.append(candidate)
    return new_cols

def post_process_columns(df, log_file):
    """
    Tidy up column names of *df* in place and return it.

      1. A column named 'unnamed_#' (or blank) is renamed after its first
         non-empty cell when that cell is not a dollar amount and consists of
         2 or 3 words. If the resulting name already exists, blank cells of
         the existing column are filled from the unnamed one, which is then
         dropped.
      2. A "Needed For" column is renamed to "description", or merged into an
         existing "description" column in the same fill-the-blanks manner.

    Each rename or merge is reported through log_file.write().
    """
    dollar_amount = re.compile(r"^\$\s*\d+(?:,\d{3})*(?:\.\d+)?$")

    def _cell_text(val):
        # Trimmed string form of a cell; None counts as empty.
        if isinstance(val, str):
            return val.strip()
        if val is not None:
            return str(val).strip()
        return ""

    def _fill_blanks(target, source):
        # Copy source -> target row by row, only where target is blank and
        # source is not.
        for idx in df.index:
            current = df.at[idx, target]
            if not (pd.isna(current) or current == ""):
                continue
            incoming = df.at[idx, source]
            if pd.isna(incoming) or incoming == "":
                continue
            df.at[idx, target] = incoming

    # Step 1: unnamed / blank columns.
    for col in list(df.columns):
        if not (col.lower().startswith("unnamed_") or col.strip() == ""):
            continue
        first_non_empty = next((text for text in map(_cell_text, df[col]) if text), None)
        if not first_non_empty:
            continue
        if dollar_amount.match(first_non_empty):
            continue
        if not 2 <= len(first_non_empty.split()) <= 3:
            continue
        new_name = clean_column_headers([first_non_empty])[0]
        log_file.write(f"Renaming column '{col}' to '{new_name}' based on first non-empty value '{first_non_empty}'.\n")
        if new_name in df.columns and new_name != col:
            _fill_blanks(new_name, col)
            df.drop(columns=[col], inplace=True)
        else:
            df.rename(columns={col: new_name}, inplace=True)

    # Step 2: "Needed For" column.
    if "Needed For" not in df.columns:
        return df
    if "description" in df.columns:
        log_file.write("Merging 'Needed For' column into existing 'description' column.\n")
        _fill_blanks("description", "Needed For")
        df.drop(columns=["Needed For"], inplace=True)
    else:
        log_file.write("Renaming 'Needed For' column to 'description'.\n")
        df.rename(columns={"Needed For": "description"}, inplace=True)
    return df

def _first_cell_is_blank(df, column):
    """Return True when *df* has a first row whose *column* value is NaN/None or ''.

    An empty frame yields False instead of raising IndexError.
    """
    if df.empty:
        return False
    value = df.iloc[0][column]
    return pd.isna(value) or value == ""


def extract_table3(pdf_path, log_file, is_addendum=False):
    """
    Extracts the Attachment table data from the provided PDF.

    Pages from the first to the last page whose text mentions "Attachment 1"
    or "Attachment 2" are scanned. For each table on a page, the text between
    the attachment heading (or the previous table) and the table's top edge
    is read; its last non-empty line is the table's subheading. A table with
    a subheading starts a new logical table whose upgrade type comes from
    extract_specific_phrase(); a table without one is appended to the most
    recent logical table as a continuation.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        log_file: Writable text stream; progress goes through print() and,
            from post_process_columns, through write().
        is_addendum: Accepted for interface compatibility; not used here.

    Returns:
        A DataFrame with the rows of all logical tables concatenated, with
        columns in first-seen order. An empty DataFrame is returned when no
        attachment page or table is found, or when an error occurs (the
        error and traceback are written to log_file).
    """
    print(f"\nProcessing {pdf_path} for Attachment table extraction...", file=log_file)
    extracted_tables = []
    specific_phrase = None
    attachment_pattern = re.compile(r"Attachment\s*[12]\b", re.IGNORECASE)
    adnu_title_pattern = re.compile(r"Area\s*Delivery\s*Upgrades", re.IGNORECASE)
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify pages that contain either Attachment 1 or Attachment 2.
            attachment_pages = [
                i for i, page in enumerate(pdf.pages)
                if attachment_pattern.search(page.extract_text() or "")
            ]
            if not attachment_pages:
                print("No Attachment data found in the PDF.", file=log_file)
                return pd.DataFrame()
            scrape_start = attachment_pages[0]
            scrape_end = attachment_pages[-1] + 1
            print(f"Candidate pages start on {scrape_start + 1} and end on {scrape_end}", file=log_file)
            # Process each page that might contain attachment table data.
            for page_number in range(scrape_start, min(scrape_end, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1}...", file=log_file)
                # Bottom y-coordinate of the previous table on the page.
                previous_table_bottom = None
                # Find all tables on the page.
                tables = page.find_tables(table_settings={
                    "horizontal_strategy": "lines",
                    "vertical_strategy": "lines",
                })
                # The attachment heading position is the same for every table
                # on the page, so it is computed once per page.
                # NOTE(review): the pattern is applied to each word on its
                # own; if extract_words() yields "Attachment" and "1" as
                # separate words this never matches and the title region
                # starts at the top of the page - confirm.
                attachment_words = [
                    w for w in page.extract_words() if attachment_pattern.search(w['text'])
                ]
                attachment_bottom = max((float(w['bottom']) for w in attachment_words), default=0)
                for table_index, table in enumerate(tables):
                    tab = table.extract()
                    if not tab:
                        print(f"Table {table_index + 1} on page {page_number + 1} is empty. Skipping.", file=log_file)
                        continue
                    table_bbox = table.bbox  # (x0, top, x1, bottom)
                    # For the first table, use the attachment heading as the top boundary.
                    # For subsequent tables, use the bottom of the previous table.
                    start_y = attachment_bottom if previous_table_bottom is None else previous_table_bottom
                    title_text = ""
                    # Only query the region when it has positive height; a
                    # table that starts at or above start_y has no title
                    # region and is treated as having no subheading.
                    if table_bbox[1] > start_y:
                        title_bbox = (0, start_y, page.width, table_bbox[1])
                        title_text = page.within_bbox(title_bbox).extract_text() or ""
                    # Use the last non-empty line in the region as the table subheading.
                    title_lines = [line.strip() for line in title_text.splitlines() if line.strip()]
                    table_title = title_lines[-1] if title_lines else None
                    # If a subheading is found, treat this as a new table.
                    if table_title:
                        specific_phrase = extract_specific_phrase(table_title)
                        print(f"New table detected: '{specific_phrase}' on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        headers = clean_column_headers(tab[0])
                        # Rename header 'type' to 'type of upgrade' if needed.
                        if "type" in headers and "type of upgrade" not in headers:
                            headers = [("type of upgrade" if h == "type" else h) for h in headers]
                        if "need for" in headers:
                            headers = [("description" if h == "need for" else h) for h in headers]

                        headers = fix_column_names(headers)
                        data_rows = tab[1:]
                        try:
                            df_new = pd.DataFrame(data_rows, columns=headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for new table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue

                        # Columns that are never kept in the output.
                        for unwanted in ("allocated", "cost rate x", "escalated"):
                            if unwanted in df_new.columns:
                                df_new.drop(columns=[unwanted], inplace=True)
                                print(f"Dropped '{unwanted}' column in table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

                        if 'type' in df_new.columns and 'type of upgrade' not in df_new.columns:
                            df_new.rename(columns={'type': 'type of upgrade'}, inplace=True)
                        # Special handling for ADNU tables if needed.
                        if adnu_title_pattern.search(specific_phrase):
                            print("Detected 'Area Delivery Network Upgrade' table (new).", file=log_file)
                            if "adnu" in df_new.columns:
                                if "type of upgrade" in df_new.columns:
                                    if "upgrade" in df_new.columns:
                                        df_new.drop(columns=['adnu'], inplace=True)
                                        print("Dropped 'adnu' column to avoid duplicate 'upgrade'.", file=log_file)
                                    else:
                                        df_new.rename(columns={'adnu': 'upgrade'}, inplace=True)
                                        print("Renamed 'adnu' to 'upgrade' in new ADNU table.", file=log_file)
                                elif not df_new.empty:
                                    # Collapse all 'adnu' cells into one 'upgrade'
                                    # row; the remaining columns take their values
                                    # from the first data row.
                                    adnu_values = df_new["adnu"].dropna().astype(str).tolist()
                                    grouped_adnu = " ".join(adnu_values)
                                    other_columns = df_new.drop(columns=["adnu"]).iloc[0].to_dict()
                                    df_grouped = pd.DataFrame({
                                        "upgrade": [grouped_adnu],
                                        "type of upgrade": [specific_phrase]
                                    })
                                    for col, value in other_columns.items():
                                        df_grouped[col] = value
                                    print("Grouped all 'adnu' rows into a single 'upgrade' row for new ADNU table.", file=log_file)
                                    df_new = df_grouped
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new ADNU table.", file=log_file)
                            elif _first_cell_is_blank(df_new, "type of upgrade"):
                                df_new.at[0, "type of upgrade"] = specific_phrase
                                print("Replaced None in 'type of upgrade' for new ADNU table.", file=log_file)
                        else:
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new non-ADNU table.", file=log_file)
                            elif _first_cell_is_blank(df_new, "type of upgrade"):
                                print("Replacing None in 'type of upgrade' for new non-ADNU table.", file=log_file)
                                df_new.at[0, "type of upgrade"] = specific_phrase
                        df_new.columns = fix_column_names(df_new.columns.tolist())
                        df_new = post_process_columns(df_new, log_file)
                        extracted_tables.append(df_new)
                    else:
                        # If no new subheading is found, treat this table as a continuation.
                        if specific_phrase is None:
                            print(f"No previous table title found for continuation on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue
                        print(f"Continuation table detected on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        data_rows = tab
                        if not extracted_tables:
                            print(f"No existing table to continue with on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue
                        expected_headers = extracted_tables[-1].columns.tolist()
                        # A continuation may repeat the header row; drop it.
                        header_keywords = ["type of upgrade", "adnu"]
                        first_continuation_row = data_rows[0] if data_rows else []
                        is_header_row = any(
                            re.search(rf"\b{kw}\b", str(cell), re.IGNORECASE) for kw in header_keywords for cell in first_continuation_row
                        )
                        if is_header_row:
                            print(f"Detected header row in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            data_rows = data_rows[1:]
                        adjust_rows_length(data_rows, expected_headers)
                        try:
                            df_continuation = pd.DataFrame(data_rows, columns=expected_headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue
                        if 'type' in df_continuation.columns and 'type of upgrade' not in df_continuation.columns:
                            df_continuation.rename(columns={'type': 'type of upgrade'}, inplace=True)
                        if "need for" in df_continuation.columns:
                            df_continuation.rename(columns={"need for": "description"}, inplace=True)
                        if adnu_title_pattern.search(specific_phrase):
                            if "type of upgrade" in df_continuation.columns:
                                if _first_cell_is_blank(df_continuation, "type of upgrade"):
                                    print(f"Replacing None in 'type of upgrade' for continuation ADNU table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation ADNU table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        else:
                            if "type of upgrade" in df_continuation.columns:
                                if _first_cell_is_blank(df_continuation, "type of upgrade"):
                                    print(f"Replacing None in 'type of upgrade' for continuation table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        df_continuation.columns = fix_column_names(df_continuation.columns.tolist())
                        df_continuation = post_process_columns(df_continuation, log_file)
                        extracted_tables[-1] = pd.concat([extracted_tables[-1], df_continuation], ignore_index=True, sort=False)
                    # Update previous_table_bottom for use in the next table.
                    previous_table_bottom = table_bbox[3]
    except Exception as e:
        print(f"Error processing Attachment tables in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()
    if extracted_tables:
        # Union of all column names in first-seen order. A set would make the
        # output column order vary from run to run.
        all_columns = list(dict.fromkeys(
            col for df in extracted_tables for col in df.columns.tolist()
        ))
        standardized_tables = [df.reindex(columns=all_columns) for df in extracted_tables]
        print("\nConcatenating all extracted Attachment data...", file=log_file)
        try:
            table3_data = pd.concat(standardized_tables, ignore_index=True, sort=False)
            print(f"Successfully concatenated {len(standardized_tables)} tables.", file=log_file)
        except Exception as e:
            print(f"Error concatenating tables: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            table3_data = pd.DataFrame()
    else:
        print("No Attachment data extracted.", file=log_file)
        table3_data = pd.DataFrame()
    return table3_data

def _drop_duplicate_upgrades(merged_df):
    """Drop repeated (q_id, type of upgrade, upgrade) rows, keeping the first one and the row order.

    Rows whose 'type of upgrade' or 'upgrade' cell is missing or blank are never dropped.
    """
    def _is_blank(column):
        # astype(str) lets .str.strip() work even if a column holds non-string objects.
        return column.isna() | (column.astype(str).str.strip() == "")

    blank_key = _is_blank(merged_df["type of upgrade"]) | _is_blank(merged_df["upgrade"])
    repeated = merged_df.duplicated(subset=["q_id", "type of upgrade", "upgrade"], keep="first")
    return merged_df[blank_key | ~repeated].reset_index(drop=True)


def extract_table3_and_replace_none(pdf_path, project_id, log_file, is_addendum=False):
    """Extract the Attachment table rows of a PDF and attach the PDF's base data to every row.

    Args:
        pdf_path: Path of the PDF to scrape.
        project_id: Queue id passed through to ``extract_base_data``.
        log_file: Open text stream that receives progress and error messages.
        is_addendum: Forwarded to ``extract_table3``.

    Returns:
        A DataFrame with the base-data columns repeated for each Attachment row.
        If no Attachment rows were extracted, or if merging fails, only the base
        data is returned.
    """
    base_data = extract_base_data(pdf_path, project_id, log_file)
    table3_data = extract_table3(pdf_path, log_file, is_addendum)
    if table3_data.empty:
        return base_data

    # Base data wins for every shared column except 'point_of_interconnection',
    # which is kept from the Attachment table when it is present there.
    overlapping_columns = base_data.columns.intersection(table3_data.columns).difference(['point_of_interconnection'])
    table3_data = table3_data.drop(columns=overlapping_columns, errors='ignore')
    base_data_repeated = pd.concat([base_data] * len(table3_data), ignore_index=True)
    try:
        merged_df = pd.concat([base_data_repeated, table3_data], axis=1, sort=False)
        if "q_id" in merged_df.columns and "type of upgrade" in merged_df.columns and "upgrade" in merged_df.columns:
            # Previously the de-duplicated frame was overwritten by a second concat,
            # so duplicates were never removed even though the log said they were.
            merged_df = _drop_duplicate_upgrades(merged_df)
            print(f"Removed duplicate rows based on 'q_id', 'type of upgrade', and 'upgrade', excluding empty rows while preserving order.", file=log_file)
        if 'point_of_interconnection' not in merged_df.columns:
            merged_df['point_of_interconnection'] = base_data['point_of_interconnection'].iloc[0]
            print(f"Added 'point_of_interconnection' to merged data for {pdf_path}.", file=log_file)
        print(f"Merged base data with Attachment data for {pdf_path}.", file=log_file)
        return merged_df
    except Exception as e:
        # Best effort: fall back to the base data alone rather than aborting the run.
        print(f"Error merging base data with Attachment data for {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return base_data

def check_has_table3(pdf_path):
    """Return True if any page's text mentions 'Attachment 1' or 'Attachment 2'.

    Any error while opening or reading the PDF is treated as "not found".
    """
    attachment_marker = r"Attachment\s*[12]\b"
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # any() stops at the first matching page, like the original early return.
            return any(
                re.search(attachment_marker, page.extract_text() or "", re.IGNORECASE)
                for page in pdf.pages
            )
    except Exception:
        return False

def is_addendum(pdf_path):
    """Return True if the text of the PDF's first page contains 'Addendum'.

    A PDF with no pages, or one that cannot be read, is reported as not an addendum.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            if not pages:
                return False
            cover_text = pages[0].extract_text() or ""
            return "Addendum" in cover_text
    except Exception:
        return False

def extract_base_data(pdf_path, project_id, log_file):
    """Read project-level fields from a PDF and return them as a one-row DataFrame.

    The row holds q_id, cluster, req_deliverability, latitude, longitude, capacity
    and point_of_interconnection. On any error the problem is written to
    ``log_file`` and an empty DataFrame is returned.
    """
    print("Extracting base data from PDF...", file=log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pages = PyPDF2.PdfReader(pdf_file).pages
            # Pages without extractable text are skipped.
            text = "".join(filter(None, (page.extract_text() for page in pages)))
        text = clean_string_cell(text)

        def first_group(pattern, flags=re.IGNORECASE):
            hit = re.search(pattern, text, flags)
            return hit.group(1) if hit else None

        queue_id = first_group(r"q[\s_-]*(\d+)")
        if queue_id is None:
            # No queue number in the text: fall back to the folder's project id.
            queue_id = str(project_id)
        print(f"Extracted Queue ID: {queue_id}", file=log_file)

        cluster_number = first_group(r"queue[\s_-]*cluster[\s_-]*(\d+)")
        print(f"Extracted Cluster Number: {cluster_number}", file=log_file)

        deliverability_status = first_group(r"(\w+)\s*capacity deliverability status")
        print(f"Extracted Deliverability Status: {deliverability_status}", file=log_file)

        capacity = first_group(r"total rated output of (\d+)\s*mw")
        if capacity is None:
            # The fallback pattern is case-sensitive, exactly as before.
            capacity = first_group(r"(\d+)\s*mw", flags=0)
        if capacity is not None:
            capacity = int(capacity)
        print(f"Extracted Capacity: {capacity}", file=log_file)

        point_of_interconnection = extract_table2(pdf_path, log_file)
        latitude, longitude = search_gps_coordinates(text, log_file)

        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "latitude": [latitude],
            "longitude": [longitude],
            "capacity": [capacity],
            "point_of_interconnection": [point_of_interconnection]
        }
        print("Base data extracted:", file=log_file)
        print(base_data, file=log_file)
        return pd.DataFrame(base_data)
    except Exception as e:
        print(f"Error extracting base data from {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

def save_to_csv(df, output_csv_path, data_type):
    """Clean a scraped DataFrame and write it to ``output_csv_path``.

    Empty input is reported on stdout and nothing is written. Otherwise every
    cell is passed through ``clean_string_cell``, rows that ``contains_phrase``
    flags for "Type of Upgrade" are removed, columns are reordered, and 'q_id'
    is coerced to numeric before saving. Write errors are printed, not raised.
    """
    if df.empty:
        print(f"No data to save for {data_type}.")
        return

    cleaned = df.map(clean_string_cell)
    header_rows = cleaned.apply(lambda row: contains_phrase(row, "Type of Upgrade"), axis=1)
    cleaned = reorder_columns(cleaned[~header_rows])
    print(f"\nColumns reordered for {data_type} as per specification.")

    if 'q_id' in cleaned.columns:
        cleaned['q_id'] = pd.to_numeric(cleaned['q_id'], errors='coerce')

    try:
        cleaned.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

def process_pdfs_in_folder():
    """Processes all PDFs in the specified project range and directory.

    For every project id in PROJECT_RANGE, scans the project's
    "02_phase_1_study" folder, classifies each PDF as original or addendum,
    scrapes Attachment data from PDFs that mention Attachment 1/2, and
    accumulates the rows in the module-level ``core_originals`` and
    ``core_addendums`` DataFrames. Finally writes both DataFrames to CSV and
    prints a summary to stdout. Detailed messages go to LOG_FILE_PATH.
    """
    # These module-level names are rebound (not just mutated) below.
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)
    # Mode 'w': the log file is overwritten on every run.
    with open(LOG_FILE_PATH, 'w') as log_file:
        for project_id in PROJECT_RANGE:
            project_path = os.path.join(BASE_DIRECTORY, str(project_id), "02_phase_1_study")
            if not os.path.exists(project_path):
                missing_projects.add(project_id)
                print(f"Project path does not exist: {project_path}", file=log_file)
                continue
            # Per-project state: reset for every project folder.
            project_scraped = False
            base_data_extracted = False
            base_data = pd.DataFrame()
            # NOTE(review): the listing is not sorted here, so whether an original
            # is seen before its addendum depends on the order os.listdir returns.
            for pdf_name in os.listdir(project_path):
                if pdf_name.endswith(".pdf"):
                    pdf_path = os.path.join(project_path, pdf_name)
                    total_pdfs_accessed += 1
                    is_add = is_addendum(pdf_path)
                    if is_add:
                        addendum_pdfs.append(pdf_name)
                        print(f"Accessing Addendum PDF: {pdf_name} from Project {project_id}", file=log_file)
                    else:
                        original_pdfs.append(pdf_name)
                        print(f"Accessing Original PDF: {pdf_name} from Project {project_id}", file=log_file)
                    try:
                        # PDFs that never mention Attachment 1/2 are skipped outright.
                        has_table3 = check_has_table3(pdf_path)
                        if not has_table3:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Attachment data)", file=log_file)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Attachment data)")
                            total_pdfs_skipped += 1
                            continue
                        # Remember base data from the first original PDF so later
                        # addendums can reuse it. The flag is set even if the
                        # extraction returned an empty DataFrame.
                        if not is_add and not base_data_extracted:
                            base_data = extract_base_data(pdf_path, project_id, log_file)
                            base_data_extracted = True
                            print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)
                        if is_add and base_data_extracted:
                            # Addendum seen after an original: pair its Attachment rows
                            # with the original's base data (repeated once per row).
                            table3_data = extract_table3(pdf_path, log_file, is_addendum=is_add)
                            if not table3_data.empty:
                                merged_df = pd.concat([base_data] * len(table3_data), ignore_index=True)
                                merged_df = pd.concat([merged_df, table3_data], axis=1, sort=False)
                                core_addendums = pd.concat([core_addendums, merged_df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped Addendum PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                        else:
                            # Originals, and addendums seen before any original, extract
                            # their own base data inside extract_table3_and_replace_none.
                            df = extract_table3_and_replace_none(pdf_path, project_id, log_file, is_addendum=is_add)
                            if not df.empty:
                                if is_add:
                                    core_addendums = pd.concat([core_addendums, df], ignore_index=True)
                                else:
                                    core_originals = pd.concat([core_originals, df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                    except Exception as e:
                        # One bad PDF must not stop the run: record it and move on.
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                        print(traceback.format_exc(), file=log_file)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                        total_pdfs_skipped += 1
            # A project whose folder exists but yielded no scraped PDF counts as skipped.
            if not project_scraped and os.path.exists(project_path):
                skipped_projects.add(project_id)
    save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
    save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")
    # "Processed" excludes missing projects; they are reported separately below.
    total_projects_processed = len(scraped_projects) + len(skipped_projects)
    print("\n=== Scraping Summary ===")
    print(f"Total Projects Processed: {total_projects_processed}")
    print(f"Total Projects Scraped: {len(scraped_projects)}")
    print(f"Total Projects Skipped: {len(skipped_projects)}")
    print(f"Total Projects Missing: {len(missing_projects)}")
    print(f"Total PDFs Accessed: {total_pdfs_accessed}")
    print(f"Total PDFs Scraped: {total_pdfs_scraped}")
    print(f"Total PDFs Skipped: {total_pdfs_skipped}")
    print("\nList of Scraped Projects:")
    print(sorted(scraped_projects))
    print("\nList of Skipped Projects:")
    print(sorted(skipped_projects))
    print("\nList of Missing Projects:")
    print(sorted(missing_projects))
    print("\nList of Scraped PDFs:")
    print(scraped_pdfs)
    print("\nList of Skipped PDFs:")
    print(skipped_pdfs)
    print("\nList of Addendum PDFs:")
    print(addendum_pdfs)
    print("\nList of Original PDFs:")
    print(original_pdfs)
    # Matching is by bare file name only (the lists do not store the project folder).
    print("\nNumber of Original PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in original_pdfs]))
    print("Number of Addendum PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs]))

def main():
    """Script entry point: run the full scrape over the configured project range."""
    process_pdfs_in_folder()

if __name__ == "__main__":
    main()


Scraped PDF: 11AS763732-C5PIEastQ871AppendixA.pdf from Project 871
Scraped PDF: 11AS763774-C5PIMetroQ893AppA.pdf from Project 873
Scraped PDF: 12AS779216-Appendix_AQ877_California_Flats_Solar_C5_Ph_I_Study_Report.pdf from Project 877
Scraped PDF: 12AS779571-Appendix_AQ885_SKIC_Solar_C5_Ph_I_Final_Study_Report_CMB_31JAN2013.pdf from Project 885
Scraped PDF: 12AS780495-QC5PIEOPQ887AppendixA.pdf from Project 887
Scraped PDF: 12AS780604-QC5PINOLQ888Appendix_A.pdf from Project 888
Scraped PDF: 12AS778778-QC5PINorthernQ890Backus_Solar_EnergyAppendix_A.pdf from Project 890
Scraped PDF: 12AS782071-QC5PIEOPQ891AppendixA.pdf from Project 891
Scraped PDF: 12AS780639-QC5PIEOPQ892AppendixA.pdf from Project 892
Scraped PDF: 12AS780344-C5PIMetroQ893AppA.pdf from Project 893
Scraped PDF: 12AS779610-Appendix_AQ894_Prospect_Energy.pdf from Project 894
Scraped PDF: 12AS781853-Appendix_A__Q895_1312013.pdf from Project 895
Scraped PDF: 12AS781741-Appendix_A__Q896_1312013.pdf from Project 896
Scraped PDF: 1

In [6]:
import os
import re
import traceback
import pdfplumber
import PyPDF2
import pandas as pd

# ------------------- Configuration -------------------
# Root folder holding one sub-folder per project id, the output CSV, and the run log.
BASE_DIRECTORY = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_COST = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw/costdata_cluster_5_style_E.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw/scraping_cluster_5_style_E_log.txt"
PROJECT_RANGE = range(869, 943)  # Original range (q_ids 869 through 942; the end is exclusive)

# Zero-based index of the first page scanned for the "Attachment 1"/"Attachment 2" headers.
# With the value 12, pages 1-12 of the PDF (as a reader counts them) are ignored and the
# scan begins at page 13.
ATTACHMENT_SEARCH_START_PAGE = 12

# Read the CSV file containing processed projects (with q_id column).
# This runs when the cell/module is executed, so the file must exist at that point.
processed_csv_path = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/all_clusters/costs_phase_1_all_clusters_total.csv"
processed_df = pd.read_csv(processed_csv_path)
# Non-numeric q_id values are coerced to NaN and dropped before the integer cast.
processed_q_ids = pd.to_numeric(processed_df['q_id'], errors='coerce').dropna().astype(int).unique()
# Only projects in PROJECT_RANGE that are absent from the processed CSV are scraped.
projects_to_process = sorted([q_id for q_id in PROJECT_RANGE if q_id not in processed_q_ids])

# ------------------- Global Tracking Variables -------------------
# Sets of project ids and a PDF counter, filled in during the run and reported in the summary.
scraped_projects = set()
skipped_projects = set()
multi_page_skipped_projects = set()
missing_projects = set()
total_pdfs_accessed = 0

# ------------------- Helper Function for Logging -------------------
def log_msg(msg, log_file):
    """Write ``msg`` to ``log_file`` and then echo it to standard output."""
    # print(file=None) falls back to sys.stdout, so the second pass is the console echo.
    for stream in (log_file, None):
        print(msg, file=stream)

# ------------------- Other Helper Functions -------------------
def clean_column_headers(headers):
    """Normalize table header cells.

    ``None`` becomes an empty string. Strings are lower-cased, runs of whitespace
    are collapsed to one space, parenthesised parts are removed, every character
    that is not a letter, digit or whitespace is dropped, and the ends are
    stripped. Values of any other type are passed through unchanged.
    """
    def _tidy(header):
        if header is None:
            return ""
        if not isinstance(header, str):
            return header
        text = re.sub(r'\s+', ' ', header.lower())
        text = re.sub(r'\(.*?\)', '', text)
        return re.sub(r'[^a-zA-Z0-9\s]', '', text).strip()

    return [_tidy(header) for header in headers]

def clean_string_cell(value):
    """Return ``value`` as text with newlines turned into spaces and the ends stripped.

    ``None`` maps to an empty string; non-string values are converted with ``str``.
    """
    if value is None:
        return ""
    text = value if isinstance(value, str) else str(value)
    return text.replace('\n', ' ').strip()

def make_unique_headers(headers):
    """Return ``headers`` with repeated entries renamed so every result is distinct.

    The first occurrence of a header is kept as is. Later occurrences become
    ``"<header>_2"``, ``"<header>_3"``, and so on. If such a generated name is
    already taken (for example the input itself contains ``"a_2"``), the counter
    keeps increasing until a free name is found, so the output never contains
    duplicates.

    Args:
        headers: Iterable of hashable header values (strings or ``None``).

    Returns:
        A list of the same length as ``headers``.
    """
    next_suffix = {}   # header -> last numeric suffix tried for it
    taken = set()      # every name already emitted
    unique_headers = []
    for header in headers:
        candidate = header
        if candidate in taken:
            suffix = next_suffix.get(header, 1)
            # Skip suffixes whose name is already in use, whether it came
            # from the input or from an earlier rename.
            while candidate in taken:
                suffix += 1
                candidate = f"{header}_{suffix}"
            next_suffix[header] = suffix
        taken.add(candidate)
        unique_headers.append(candidate)
    return unique_headers

# ------------------- Appendix PDF Check & Base Data Extraction -------------------
def is_appendix_pdf(pdf_path):
    """Return True if the first page's text contains 'Appendix A'.

    A PDF without pages gives False. Read errors are printed to stdout and also
    give False.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            if pages:
                cover_text = pages[0].extract_text() or ""
                return "Appendix A" in cover_text
            return False
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

def is_addendum_pdf(pdf_path):
    """Return True if the first page's text contains 'Addendum' or 'Revision'.

    A PDF without pages gives False. Read errors are printed to stdout and also
    give False.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            pages = pdf.pages
            if not pages:
                return False
            cover_text = pages[0].extract_text() or ""
            return any(marker in cover_text for marker in ("Addendum", "Revision"))
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

def extract_table1(pdf_path, log_file):
    """Find the Point of Interconnection (POI) value in the PDF's "Table A.1/B.1/C.1".

    Pages whose text mentions ``Table [ABC].1`` are located first. Those pages,
    plus up to two pages after the last of them, are then searched table by
    table. A cell matching "Point of Interconnection" or "POI" is treated as the
    label, and the cell immediately to its right as the value. Two pdfplumber
    table-detection settings are tried per page.

    Args:
        pdf_path: Path of the PDF to read.
        log_file: Open text stream that receives progress and error messages.

    Returns:
        The cleaned POI string when found. ``"Value Missing"`` when a POI label
        was seen but no non-empty neighbouring cell was found for it. ``None``
        when no Table 1 page or POI label exists, or when reading the PDF fails.
    """
    poi_pattern = re.compile(r"(Point\s+of\s+Interconnection|POI)", re.IGNORECASE)
    table_settings_list = [
        {"horizontal_strategy": "text", "vertical_strategy": "lines", "snap_tolerance": 1},
        {"horizontal_strategy": "lines", "vertical_strategy": "lines", "snap_tolerance": 2}
    ]
    # Set once any POI label is matched. This makes the "Value Missing" result
    # reachable; the original test on the POI value could never be true there.
    label_seen = False
    try:
        with pdfplumber.open(pdf_path) as pdf:
            table1_pages = [
                i for i, page in enumerate(pdf.pages)
                if re.search(r"Table\s*[ABC]\.1\b", page.extract_text() or "", re.IGNORECASE)
            ]
            if not table1_pages:
                print("No Table 1 found in the PDF.", file=log_file)
                return None
            scrape_start = table1_pages[0]
            # Look two pages past the last mention in case the table continues.
            scrape_end = table1_pages[-1] + 2
            print(f"Table 1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)
            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table 1...", file=log_file)
                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"Attempt {attempt} with settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1}", file=log_file)
                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty.", file=log_file)
                            continue
                        for row_index, row in enumerate(tab, start=1):
                            where = f"(Page {page_number + 1}, Table {table_index}, Row {row_index})"
                            for cell_pos, cell in enumerate(row):
                                if not (cell and poi_pattern.search(cell)):
                                    continue
                                label_seen = True
                                if cell_pos + 1 >= len(row):
                                    print(f"POI label found but no adjacent column {where}.", file=log_file)
                                    continue
                                poi_value = clean_string_cell(row[cell_pos + 1])
                                if poi_value:
                                    print(f"Found POI: '{poi_value}' {where}", file=log_file)
                                    # First hit wins; returning replaces the nested break cascade.
                                    return poi_value
                                print(f"POI label found but adjacent value empty {where}.", file=log_file)
    except Exception as e:
        print(f"Error processing Table 1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None
    if label_seen:
        print("POI label found but no value.", file=log_file)
        return "Value Missing"
    print("POI not found in Table 1.", file=log_file)
    return None

def extract_base_data(pdf_path, project_id, log_file):
    """Build the one-row base-data DataFrame for an "Appendix A" PDF.

    Args:
        pdf_path: Path of the PDF to read.
        project_id: Queue id of the project; stored as the string ``q_id``.
        log_file: Open text stream for the run log (messages are also echoed
            to stdout through ``log_msg``).

    Returns:
        A DataFrame with the columns q_id, cluster, req_deliverability, capacity
        and point_of_interconnection. An empty DataFrame is returned when the
        PDF is not an Appendix A document or when extraction fails.
    """
    if not is_appendix_pdf(pdf_path):
        log_msg(f"Skipping base extraction because {pdf_path} is not an Appendix A PDF.", log_file)
        return pd.DataFrame()
    log_msg("Extracting base data from Appendix A PDF...", log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Pages without extractable text contribute nothing.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        text = clean_string_cell(text)
        # The queue id comes from the folder name, not from the PDF text.
        queue_id = str(project_id)
        log_msg(f"Extracted Queue ID: {queue_id}", log_file)
        # The cluster is fixed for this script; it is not parsed from the text
        # (the previous findall result was never used).
        cluster_number = '5'
        log_msg(f"Extracted Cluster Number: {cluster_number}", log_file)
        deliverability_status = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = deliverability_status.group(1) if deliverability_status else None
        log_msg(f"Extracted Deliverability Status: {deliverability_status}", log_file)
        capacity = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if capacity:
            capacity = int(capacity.group(1))
        else:
            # Fallback: first "<digits> mw" anywhere in the text (case-sensitive).
            capacity2 = re.search(r"(\d+)\s*mw", text)
            capacity = int(capacity2.group(1)) if capacity2 else None
        log_msg(f"Extracted Capacity: {capacity}", log_file)
        poi_value = extract_table1(pdf_path, log_file)
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "capacity": [capacity],
            "point_of_interconnection": [poi_value]
        }
        log_msg("Base data extracted:", log_file)
        log_msg(str(base_data), log_file)
        return pd.DataFrame(base_data)
    except Exception as e:
        log_msg(f"Error extracting base data from {pdf_path}: {e}", log_file)
        # The traceback goes to the log file only, to keep the console readable.
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

# ------------------- New: Cost Data Extraction from Appendix A -------------------
def extract_cost_data_from_appendixA(pdf_path, project_id, log_file):
    """
    Collect the cost tables of an Appendix A PDF and attach the PDF's base data.

    Pages from index ATTACHMENT_SEARCH_START_PAGE onward are scanned for the
    first page containing "Attachment 1" (start, inclusive) and the first page
    containing "Attachment 2" (end, exclusive; end of document if absent).
    Every non-blank table in that page span is turned into a DataFrame whose
    first row supplies the column names.

    If the same header row shows up on two different pages the table is assumed
    to continue across pages; the project id is then recorded in
    ``multi_page_skipped_projects`` and an empty DataFrame is returned.

    Returns the cost rows with the base-data columns repeated in front of them,
    or an empty DataFrame when base data, the Attachment 1 header, or tables are
    missing, or when reading the PDF fails.
    """
    log_msg(f"Extracting cost data tables from {pdf_path} for project {project_id}...", log_file)
    base_data_df = extract_base_data(pdf_path, project_id, log_file)
    if base_data_df.empty:
        return pd.DataFrame()

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)
            start_page = end_page = None
            # Locate the Attachment 1 / Attachment 2 boundaries.
            for idx in range(ATTACHMENT_SEARCH_START_PAGE, page_count):
                page_text = pdf.pages[idx].extract_text() or ""
                if start_page is None and "Attachment 1" in page_text:
                    start_page = idx
                    log_msg(f"Found 'Attachment 1' header on page {idx+1}", log_file)
                if "Attachment 2" in page_text:
                    end_page = idx
                    log_msg(f"Found 'Attachment 2' header on page {idx+1}", log_file)
                    break
            if start_page is None:
                log_msg("Attachment 1 header not found in PDF.", log_file)
                return pd.DataFrame()
            if end_page is None:
                end_page = page_count
                log_msg("Attachment 2 header not found; processing until end of document.", log_file)

            cost_tables = []
            first_page_of_header = {}   # header tuple -> page index where it was first seen
            spans_pages = False

            for idx in range(start_page, end_page):
                for raw_table in pdf.pages[idx].extract_tables():
                    if not raw_table:
                        continue
                    if not any(cell and cell.strip() for row in raw_table for cell in row):
                        continue
                    # The first row is taken as the header.
                    signature = tuple(make_unique_headers(raw_table[0]))
                    if first_page_of_header.setdefault(signature, idx) != idx:
                        spans_pages = True
                    frame = pd.DataFrame(raw_table)
                    if frame.empty:
                        continue
                    frame.columns = make_unique_headers(frame.iloc[0].tolist())
                    frame = frame.iloc[1:].reset_index(drop=True)
                    frame = frame.loc[:, ~frame.columns.duplicated()]
                    cost_tables.append(frame)

            if spans_pages:
                log_msg("Detected a table that spans multiple pages. Skipping cost data for this project.", log_file)
                multi_page_skipped_projects.add(project_id)
                return pd.DataFrame()

            if not cost_tables:
                log_msg("No tables found in the specified page range.", log_file)
                cost_data_df = pd.DataFrame()
            else:
                cost_data_df = pd.concat(cost_tables, ignore_index=True)
                log_msg(f"Extracted {len(cost_tables)} table(s) from pages {start_page+1} to {end_page}.", log_file)
    except Exception as e:
        log_msg(f"Error extracting cost tables from {pdf_path}: {e}", log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

    # Nothing to merge: hand back the (empty) cost frame unchanged.
    if cost_data_df.empty or base_data_df.empty:
        return cost_data_df
    # Repeat the single base row once per cost row, then join side by side.
    repeated_base = pd.concat([base_data_df] * len(cost_data_df), ignore_index=True)
    return pd.concat([repeated_base, cost_data_df], axis=1)

# ------------------- New: Process Cost Data for a Project -------------------
def process_cost_data_for_project(project_id, log_file):
    """Pick one Appendix A PDF for a project and extract its cost tables.

    The project's "02_phase_1_study" folder is scanned for PDFs whose first page
    mentions "Appendix A". An original (non-addendum) PDF is preferred; an
    addendum is used only when no original exists. Files are visited in sorted
    name order so the choice is the same on every run and every file system.

    Side effects: increments ``total_pdfs_accessed`` and adds ``project_id`` to
    ``scraped_projects`` or ``skipped_projects``.

    Args:
        project_id: Queue id; also the name of the project's folder.
        log_file: Open text stream for the run log.

    Returns:
        The merged cost DataFrame, or an empty DataFrame when the folder or a
        suitable PDF is missing or nothing could be extracted.
    """
    global total_pdfs_accessed
    project_folder = os.path.join(BASE_DIRECTORY, str(project_id), "02_phase_1_study")
    if not os.path.exists(project_folder):
        log_msg(f"Project folder not found: {project_folder}", log_file)
        return pd.DataFrame()

    # Gather Appendix A PDFs (original vs. addendum)
    original_appendix_pdfs = []
    addendum_appendix_pdf = None
    # os.listdir returns names in arbitrary order; sort so "first" is reproducible.
    for f in sorted(os.listdir(project_folder)):
        if not f.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(project_folder, f)
        if not is_appendix_pdf(pdf_path):
            continue
        if is_addendum_pdf(pdf_path):
            # Only the first addendum is remembered.
            if not addendum_appendix_pdf:
                addendum_appendix_pdf = f
        else:
            original_appendix_pdfs.append(f)

    if original_appendix_pdfs:
        base_pdf = original_appendix_pdfs[0]
        log_msg(f"Using Appendix A PDF: {os.path.join(project_folder, base_pdf)}", log_file)
    elif addendum_appendix_pdf:
        log_msg(f"No original Appendix A PDF found for project {project_id}. Using addendum PDF for cost data extraction.", log_file)
        base_pdf = addendum_appendix_pdf
    else:
        log_msg(f"No Appendix A PDF (original or addendum) found for project {project_id}.", log_file)
        return pd.DataFrame()

    pdf_path = os.path.join(project_folder, base_pdf)
    total_pdfs_accessed += 1
    cost_data_df = extract_cost_data_from_appendixA(pdf_path, project_id, log_file)
    if cost_data_df.empty:
        log_msg(f"No cost tables extracted for project {project_id}.", log_file)
        skipped_projects.add(project_id)
    else:
        log_msg(f"Extracted cost tables for project {project_id}.", log_file)
        scraped_projects.add(project_id)
    return cost_data_df

# ------------------- CSV Saving & Summary Functions -------------------
def save_to_csv(df, output_csv_path, data_type):
    """Clean every cell of ``df`` with ``clean_string_cell`` and write it to CSV.

    Args:
        df: DataFrame to save. If it is empty, a notice is printed and nothing is written.
        output_csv_path: Destination file path.
        data_type: Short label used in the console messages.

    Returns:
        None. Write errors are printed with a traceback instead of being raised.
    """
    if df.empty:
        print(f"No data to save for {data_type}.")
        return
    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
    # The lookup is on the class so a column that happens to be named "map"
    # cannot be picked up by attribute access on the instance.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    df = elementwise(df, clean_string_cell)
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

# ------------------- Main Processing Function -------------------
def process_pdfs_in_folder():
    """Scrape cost data for every project and write the CSV plus a run log.

    For each id in ``projects_to_process`` the project folder under
    ``BASE_DIRECTORY`` is looked up; missing folders are recorded in
    ``missing_projects``, the others are handed to
    ``process_cost_data_for_project``. All non-empty results are concatenated
    and saved to ``OUTPUT_CSV_PATH_COST``. Progress and a closing summary are
    written through ``log_msg`` to the file at ``LOG_FILE_PATH``, which is
    overwritten on each run.
    """
    global total_pdfs_accessed
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)
    per_project_frames = []
    with open(LOG_FILE_PATH, 'w') as log_file:
        log_msg(f"Projects to process: {projects_to_process}", log_file)

        for project_id in projects_to_process:
            project_folder = os.path.join(BASE_DIRECTORY, str(project_id))
            if os.path.exists(project_folder):
                log_msg(f"\n--- Processing project {project_id} ---", log_file)
                cost_data_df = process_cost_data_for_project(project_id, log_file)
                if not cost_data_df.empty:
                    per_project_frames.append(cost_data_df)
            else:
                missing_projects.add(project_id)
                log_msg(f"Project folder not found: {project_folder}", log_file)

        if not per_project_frames:
            log_msg("\nNo cost data extracted from any project.", log_file)
        else:
            combined_cost_data = pd.concat(per_project_frames, ignore_index=True)
            save_to_csv(combined_cost_data, OUTPUT_CSV_PATH_COST, "cost data")

        # Every project ends up in exactly one of these three sets.
        total_projects_processed = (
            len(scraped_projects) + len(skipped_projects) + len(missing_projects)
        )
        summary_lines = [
            "\n=== Scraping Summary ===",
            f"Total Projects Processed: {total_projects_processed}",
            f"Total Projects Scraped (with cost data): {len(scraped_projects)}",
            f"Total Projects Skipped: {len(skipped_projects)}",
            f"Total Projects with Multi-Page Table Skipped: {len(multi_page_skipped_projects)}",
            f"Total Projects Missing: {len(missing_projects)}",
            f"Total PDFs Accessed: {total_pdfs_accessed}",
            "\nList of Scraped Projects: " + str(sorted(scraped_projects)),
            "\nList of Skipped Projects: " + str(sorted(skipped_projects)),
            "\nList of Projects with Multi-Page Tables Skipped: " + str(sorted(multi_page_skipped_projects)),
            "\nList of Missing Projects: " + str(sorted(missing_projects)),
        ]
        for line in summary_lines:
            log_msg(line, log_file)

# ------------------- Main -------------------
def main():
    """Entry point: run the whole scrape via ``process_pdfs_in_folder``."""
    process_pdfs_in_folder()

# Start the scrape only when this code is run directly, not when imported.
if __name__ == "__main__":
    main()


Projects to process: [869, 870, 871, 872, 873, 874, 875, 876, 878, 879, 880, 881, 882, 883, 884, 886, 887, 888, 889, 890, 891, 892, 893, 895, 896, 897, 902, 903, 904, 905, 906, 907, 908, 909, 911, 912, 913, 914, 915, 918, 919, 920, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942]

--- Processing project 869 ---
No Appendix A PDF (original or addendum) found for project 869.

--- Processing project 870 ---
No Appendix A PDF (original or addendum) found for project 870.

--- Processing project 871 ---
Using Appendix A PDF: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data/871/02_phase_1_study/11AS763732-C5PIEastQ871AppendixA.pdf
Extracting cost data tables from /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data/871/02_phase_1_study/11AS763732-C5PIEastQ871AppendixA.pdf for project 871...
Extracting base data from Appendix A PDF...
Extracted Queue ID: 871
Extracted Cluster Number: 5
Extracted D

  df = df.applymap(clean_string_cell)


# Itemized and Total Datasets

# Column names

In [7]:
import pandas as pd
import re
import unicodedata

# Read the raw style-E scrape; the construct-time column, if present, is kept as text.
_raw_originals_csv = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw/rawdata_cluster_5_style_E_originals.csv'
df = pd.read_csv(_raw_originals_csv, dtype={'estimated_time_to_construct': str})

# Cast both identifier columns to the nullable integer dtype.
for _id_column in ('q_id', 'cluster'):
    df[_id_column] = df[_id_column].astype('Int64')


def clean_column_headers(headers):
    """Return a list with one tidied entry per input header.

    ``None`` becomes an empty string. String headers have each run of
    whitespace collapsed to a single space, every character other than ASCII
    letters, digits, whitespace and ``( ) / + = _`` removed, and surrounding
    whitespace stripped. Case and parenthesised text are kept. Any other value
    is passed through unchanged.
    """
    def _tidy(header):
        if header is None:
            return ""
        if not isinstance(header, str):
            return header
        collapsed = re.sub(r'\s+', ' ', header)
        kept = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', collapsed)
        return kept.strip()

    return [_tidy(header) for header in headers]



# Replace the DataFrame's column labels with their cleaned versions.
df.columns = clean_column_headers(df.columns)

def convert_to_snake_case(column_name):
    """Return ``column_name`` as a lower-case, underscore-separated identifier.

    The name is stripped and lower-cased, runs of whitespace or hyphens become
    a single underscore, and any remaining non-word character is removed.
    """
    lowered = column_name.strip().lower()
    underscored = re.sub(r'[\s\-]+', '_', lowered)
    return re.sub(r'[^\w]', '', underscored)

def clean_string_cell(value):
    """Return ``value`` reduced to plain ASCII text on a single line.

    Strings are NFKD-normalised, characters that cannot be encoded as ASCII
    are dropped, newlines become spaces and surrounding whitespace is
    stripped. Non-string values are returned untouched.
    """
    if not isinstance(value, str):
        return value
    ascii_only = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    return ascii_only.replace('\n', ' ').strip()

# Normalise every cell, switch the headers to snake_case, then show the result.
df = df.map(clean_string_cell)
df.columns = list(map(convert_to_snake_case, df.columns))
print("After cleaning:", df.columns.tolist())


 


After cleaning: ['q_id', 'cluster', 'req_deliverability', 'capacity', 'point_of_interconnection', 'type_of_upgr_ade', 'upgrade', 'description', 'estimat_ed_cost_x_1000_note', 'estimated_cost_x_1000_constant_dollar_od_year_note', 'unnamed_10', 'cost_share', 'allocated_cost_od_year_1000', 'estimated_cost_x_1000_constant_dollars_2012_note_4', 'estimated_cost_x_1000_constant_dollar_od_year_note_4', 'phase_i_incremental_mw', 'adnu_cost_rate_od_year_1000mw', 'estimated_time_to_construct_in_months_note_1_note_3', 'allocated_cost_od_year_1000', 'type', 'need_for', 'total_upgrade_cost_estimated_1000_2012_constant_dollars', 'cost_share', 'allocated_cost_responsibility_1000', 'total_upgrade_cost_estimated_1000', 'adnu_kw_rate', 'allocated_adnu_cost_1000', 'unnamed_27', 'none_2', '_2', 'none_3', 'none_4', '_3', 'none_5', 'none_6', '_4', 'none_7', 'none_8', '_5', 'none_9', 'none_10', '_6', 'allocated_cost', '_7', 'cost_rate', '_8', 'upgrade_the_pisgah_substation_to_500_kv_with_one_500230_kv_transfo

In [22]:
import pandas as pd
import re
import unicodedata
import numpy as np
# Read the raw style-E scrape; the construct-time column, if present, is kept as text.
_raw_originals_csv = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw/rawdata_cluster_5_style_E_originals.csv'
df = pd.read_csv(_raw_originals_csv, dtype={'estimated_time_to_construct': str})

# Only 'cluster' is cast to the nullable integer dtype; the q_id cast stays disabled.
#df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')




def clean_column_headers(headers):
    """Return a list with one tidied entry per input header.

    ``None`` becomes an empty string. For string headers, whitespace runs are
    collapsed to one space, characters other than ASCII letters, digits,
    whitespace and ``( ) / + = _`` are dropped, and the ends are stripped.
    Case and parenthesised text are kept; non-string values pass through.
    """
    tidy = []
    for raw in headers:
        if raw is None:
            tidy.append("")
        elif isinstance(raw, str):
            single_spaced = re.sub(r'\s+', ' ', raw)
            tidy.append(re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', single_spaced).strip())
        else:
            tidy.append(raw)
    return tidy

# Replace the DataFrame's column labels with their cleaned versions.
df.columns = clean_column_headers(df.columns)

 



#STEP 2: NAMING CONVENTION
def convert_to_snake_case(column_name):
    """Return ``column_name`` as a lower-case, underscore-separated identifier.

    After stripping and lower-casing, each run of whitespace or hyphens is
    replaced by one underscore and all remaining non-word characters are
    deleted.
    """
    return re.sub(
        r'[^\w]',
        '',
        re.sub(r'[\s\-]+', '_', column_name.strip().lower()),
    )

def clean_string_cell(value):
    """Return ``value`` reduced to plain ASCII text on a single line.

    Only strings are changed: they are NFKD-normalised, stripped of anything
    that cannot be encoded as ASCII, have newlines replaced by spaces and are
    trimmed. Every other value is returned as received.
    """
    if not isinstance(value, str):
        return value
    decomposed = unicodedata.normalize('NFKD', value)
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii')
    return ascii_text.replace('\n', ' ').strip()

# Normalise every cell, then switch the headers to snake_case.
df = df.map(clean_string_cell)
df.columns = list(map(convert_to_snake_case, df.columns))




def move_dollar_values(df, source_column, target_column):
    """Relocate '$'-prefixed entries from one column to another, in place.

    Every row whose ``source_column`` value, read as text, begins with '$' has
    that value copied into ``target_column`` and its source cell replaced by
    an empty string. ``target_column`` is created, filled with empty strings,
    when it does not exist yet.

    Args:
        df (pd.DataFrame): Frame to modify.
        source_column (str): Column scanned for values starting with '$'.
        target_column (str): Column that receives those values.

    Returns:
        pd.DataFrame: The same ``df`` object, after modification.
    """
    if target_column not in df.columns:
        df[target_column] = ""

    starts_with_dollar = df[source_column].astype(str).str.startswith('$', na=False)

    # Copy first, then blank the source, so the value is never lost.
    moved_values = df.loc[starts_with_dollar, source_column]
    df.loc[starts_with_dollar, target_column] = moved_values
    df.loc[starts_with_dollar, source_column] = ""

    return df

# Example uses of move_dollar_values (currently disabled):
#df = move_dollar_values(df, 'none_5', 'total_estimated_costs_x_1000_escalated_constant_dollars_od_year')


#df = move_dollar_values(df, 'none_3','total_estimated_costs_x_1000_constant_dollar_2020')

def remove_dollar_values_and_fill_nan(df, column):
    """Blank out '$'-prefixed and empty entries of ``column``, in place.

    Values that start with '$', empty strings, and cells that were already
    missing all end up as NaN. Every other value is kept in its text form.

    Parameters:
      df (pd.DataFrame): The input DataFrame (modified in place).
      column (str): The column to check and clean.

    Returns:
      pd.DataFrame: The same ``df`` object, after modification.
    """
    original = df[column]

    # Record the missing cells before the text conversion: astype(str) turns
    # NaN/None into the literal strings "nan"/"None", which the empty-string
    # check below would never catch, leaving fake text where NaN is expected.
    was_missing = original.isna()
    as_text = original.astype(str)

    to_blank = (
        was_missing
        | as_text.str.startswith('$', na=False)
        | (as_text == "")
    )
    df[column] = as_text.mask(to_blank, np.nan)

    return df

# Blank out the '$'-prefixed entries of the '_5' column.
df = remove_dollar_values_and_fill_nan(df, '_5')


def remove_non_percent_values_and_fill_nan(df, column):
    """Keep only the entries of ``column`` that contain '%', in place.

    The column is converted to text; every value without a '%' character is
    replaced by NaN, and any empty string that remains is replaced by NaN too.

    Parameters:
      df (pd.DataFrame): The input DataFrame (modified in place).
      column (str): The column to check and clean.

    Returns:
      pd.DataFrame: The same ``df`` object, after modification.
    """
    as_text = df[column].astype(str)
    has_percent = as_text.str.contains('%', na=False)

    # where() keeps the percent entries and puts NaN everywhere else.
    percent_only = as_text.where(has_percent, np.nan)
    df[column] = percent_only.replace("", np.nan)

    return df

# Keep only the entries of '_5' that contain '%'; everything else becomes NaN.
df = remove_non_percent_values_and_fill_nan(df, '_5')


 


def merge_columns(df):
    """Collapse groups of scraped columns into one canonical column each.

    For every canonical name, the listed source columns that exist in ``df``
    are combined row by row, taking the first non-missing value in list order.
    The result is stored under the canonical name and the source columns
    (other than the canonical one itself) are dropped. Groups are processed in
    the order written below. Columns whose label is missing, blank, or starts
    with "Unnamed" are added to the ``type_of_upgrade`` group.

    Args:
        df (pd.DataFrame): Frame to modify in place.

    Returns:
        pd.DataFrame: The same ``df`` object, after merging.
    """
    merge_plan = {
        "type_of_upgrade": ['type_of_upgr_ade', 'type', '_2'],
        "description": ["description", 'need_for'],
        "upgrade": ["upgrade", 'unnamed_27', '_3'],
        "cost_allocation_factor": ["cost_share", '_5', 'cost_share'],
        "escalated_cost_x_1000": [
            'allocated_cost_od_year_1000',
            'allocated_cost_od_year_1000',
            'allocated_adnu_cost_1000',
        ],
        "estimated_cost_x_1000": [
            'allocated_cost_responsibility_1000',
            '_6',
            '_7',
        ],
        "total_estimated_cost_x_1000": [
            'estimat_ed_cost_x_1000_note',
            'estimated_cost_x_1000_constant_dollars_2012_note_4',
            'total_upgrade_cost_estimated_1000_2012_constant_dollars',
            'total_upgrade_cost_estimated_1000',
            '_4',
            'total_upgrade_cost_estimated_1000',
        ],
        "total_estimated_cost_x_1000_escalated": [
            'estimated_cost_x_1000_constant_dollar_od_year_note',
            'estimated_cost_x_1000_constant_dollar_od_year_note_4',
        ],
        "estimated_time_to_construct": [
            'estimated_time_to_construct_months_note_345_9_10',
            'estimated_time_to_construct_in_months_note_1_note_3',
        ],
        "capacity": [
            "capacity",
            "project size",
            "project mw",
            "mw at poi",
            'phase_i_incremental_mw',
        ],
        "max_time_to_construct": [
            'od_dollar_escalation_duration_months_note_345_9_10',
            'cod_dollar_escalation_duration_months_note_345_9_10',
        ],
    }

    # Label-less columns are folded into the upgrade-type group.
    blank_or_unnamed = [
        col for col in df.columns
        if pd.isna(col) or col.strip() == "" or col.startswith("Unnamed")
    ]
    if blank_or_unnamed:
        merge_plan["type_of_upgrade"].extend(blank_or_unnamed)

    for target, candidates in merge_plan.items():
        present = [name for name in candidates if name in df.columns]
        if not present:
            continue
        # Back-filling across the row moves the first non-missing value into
        # the left-most position, which is then taken as the merged value.
        df[target] = df[present].bfill(axis=1).iloc[:, 0]
        obsolete = [name for name in present if name != target]
        if obsolete:
            df.drop(columns=obsolete, inplace=True)

    return df

df = merge_columns(df)

# Drop the remaining scraped columns that are not carried into the cleaned data.
_leftover_columns = [
    'unnamed_10', 'adnu_cost_rate_od_year_1000mw', 'adnu_kw_rate',
    'none_2', 'none_3', 'none_4', 'none_5', 'none_6', 'none_7', 'none_8',
    'none_9', 'none_10', 'allocated_cost', 'cost_rate', '_8',
    'upgrade_the_pisgah_substation_to_500_kv_with_one_500230_kv_transformer_loop_the_eldorado_lugo_500_kv_line_into_the_pisgah_500_kv_substation',
    'adnu', '100', '206', '20572', '200', '41144', 'cost', 'allocated',
]
df.drop(columns=_leftover_columns, inplace=True)








 


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Columns named in the preferred order come first (only those present in
    ``df``); every other column follows in its current relative order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_time_to_construct",
        "max_time_to_construct",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        # These are the total-cost column names that merge_columns actually
        # produces. The names previously listed here never matched them, so
        # the total columns were left among the unordered remainder.
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        # Earlier spellings, kept so a frame that carries them keeps its order.
        "total_estimated_cost",
        "total_escalted_cost",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Reorder the DataFrame
    return df[existing_desired + remaining]


df = reorder_columns(df)

# Drop rows in which any cell equals one of these header/caption texts.
_caption_cells = [
    "Total Estimated Costs w ITCC x 1,000 Escalated Constant Dollar (OD Year)",
    "Allocated Cost (x1000) 2014 Dollars",
    "Eastern",
    "Note (h)",
    "Element",
]
_is_caption_row = df.apply(lambda row: row.astype(str).isin(_caption_cells).any(), axis=1)
df = df[~_is_caption_row]

# Drop rows in which any cell starts with the "Project #:" banner.
_is_project_banner = df.apply(
    lambda row: any(str(cell).startswith("Project #:") for cell in row),
    axis=1,
)
df = df[~_is_project_banner]

# Keep only rows that have a non-blank upgrade type.
_upgrade_type = df['type_of_upgrade']
df = df[_upgrade_type.notna() & (_upgrade_type.astype(str).str.strip() != "")]

 
def process_upgrade_columns(df):
    """Turn group-header rows in "type_of_upgrade" into a per-row category.

    The "type_of_upgrade" column is expected to mix group headers with the
    upgrade names listed beneath them. The function:

      1. Inserts an "upgrade" column, a copy of "type_of_upgrade", right
         after it (the input frame is modified in place by this step).
      2. Treats a row as a header when its "type_of_upgrade" text contains one
         of the known phrases, and renames it to a short code: "PTO_IF",
         "RNU", "LDNU" and "ADNU" get " Total" appended when the text contains
         "Total"; "CANU" and "Non-Allocated IRNU" get "Total " prepended;
         "Distribution Upgrades" and "One Time Cost" headers keep their text.
      3. Forward-fills the header code onto the rows below each header.
      4. Blanks the category of every row whose "upgrade" text contains
         "Total" or "Subtotal".
      5. Drops the header rows, and then every row left without a category
         (which includes the total rows blanked in step 4).

    Returns the filtered DataFrame.
    """
    # (phrase to look for, short code, where "Total" is attached).
    # A short code of None means the header text is kept as it is.
    header_rules = [
        ("Interconnection Facilities", "PTO_IF", "suffix"),
        ("Reliability Network Upgrade", "RNU", "suffix"),
        ("Local Delivery Network Upgrades", "LDNU", "suffix"),
        ("Area Deliverability Network Upgrades", "ADNU", "suffix"),
        ("Distribution Upgrades", None, None),
        ("Conditional Assigned Network Upgrades", "CANU", "prefix"),
        ("Non-Allocated IRNU", "Non-Allocated IRNU", "prefix"),
        ("Area Delivery Network Upgrades", "ADNU", "suffix"),
        ("One Time Cost", None, None),
    ]

    def rename_header(val):
        # The first matching rule wins, mirroring the order of the table.
        for phrase, code, total_position in header_rules:
            if phrase not in val:
                continue
            if code is None:
                return val
            if "Total" not in val:
                return code
            if total_position == "prefix":
                return "Total " + code
            return code + " Total"
        return val

    def is_header(val):
        return any(phrase in val for phrase, _code, _position in header_rules)

    def is_total_indicator(val):
        return any(marker in val for marker in ("Total", "Subtotal", "Total cost assigned"))

    # Step 1: duplicate the column so the upgrade names survive the renaming.
    insert_at = df.columns.get_loc("type_of_upgrade") + 1
    df.insert(insert_at, "upgrade", df["type_of_upgrade"])

    # Step 2: shorten the header rows.
    header_mask = df["type_of_upgrade"].apply(is_header)
    df.loc[header_mask, "type_of_upgrade"] = (
        df.loc[header_mask, "type_of_upgrade"].apply(rename_header)
    )

    # Step 3: carry each header down to the rows that follow it.
    df["header_temp"] = df["type_of_upgrade"].where(header_mask).ffill()

    # Step 4: total rows get no category, so the final filter removes them.
    total_rows = df["upgrade"].apply(is_total_indicator)
    df.loc[total_rows, "header_temp"] = None

    df["type_of_upgrade"] = df["header_temp"]
    df.drop("header_temp", axis=1, inplace=True)

    # Step 5: remove the header rows, then anything without a category.
    df = df[~header_mask].reset_index(drop=True)
    has_category = df["type_of_upgrade"].notna() & (df["type_of_upgrade"].str.strip() != "")
    df = df[has_category]

    return df

 

#df = process_upgrade_columns(df)


 




 
# Short codes for the upgrade-type labels; labels not listed here are kept as they are.
mappings = {
    "PTO": 'PTO_IF',
    "IF": 'PTO_IF',
    "PNU": "OPNU",
    "PTO's Interconnection Facilities": "PTO_IF",
    "RNUs, Estimated Costs, and Estimated Time to Construct Summary": "RNU",
    "Reliability Network Upgrades": "RNU",
    "Non-Allocated IRNU": "RNU",
    "Total Non-Allocated IRNU": "Total RNU",
    'Local Delivery Network Upgrades': 'LDNU',
    'Distribution': 'Distribution Upgrades',
}


def _short_upgrade_type(label):
    # Only text labels are looked up; missing values pass through untouched.
    if isinstance(label, str):
        return mappings.get(label, label)
    return label


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(_short_upgrade_type)


def _mentions_total(row):
    # A row is a total when either field is present and contains "Total".
    for field in ('type_of_upgrade', 'cost_allocation_factor'):
        if pd.notna(row.get(field)) and 'Total' in str(row[field]):
            return True
    return False


# 'item' is 'yes' for individual upgrades and 'no' for total rows.
df['item'] = df.apply(lambda row: 'no' if _mentions_total(row) else 'yes', axis=1)
   


 
   
 
 


 

 

    
# Step 7: Remove $ signs and convert to numeric
import re

def clean_currency(value):
    """Convert a currency-like cell to a number.

    For strings, '$', '*', "(Note <digits>)" markers and thousands commas are
    removed and the text is trimmed before conversion. Non-string values are
    converted as they are. Returns ``pd.NA`` when ``pd.to_numeric`` rejects
    the value with a ``ValueError``.
    """
    if isinstance(value, str):
        without_symbols = value.replace('$', '').replace('*', '')
        without_notes = re.sub(r'\(Note \d+\)', '', without_symbols)
        value = without_notes.replace(',', '').strip()
    try:
        number = pd.to_numeric(value)
    except ValueError:
        return pd.NA
    return number


# Convert the cost columns that are present from currency text to numbers.
_cost_columns = [
    'estimated_cost_x_1000',
    'escalated_cost_x_1000',
    'total_estimated_cost_x_1000',
    'total_estimated_cost_x_1000_escalated',
]
for _cost_column in _cost_columns:
    if _cost_column in df.columns:
        df[_cost_column] = df[_cost_column].apply(clean_currency)

# Keep one row per (project, upgrade type, upgrade).
df = df.drop_duplicates(subset=['q_id','type_of_upgrade','upgrade'])

# Remove rows whose 'upgrade' cell is one of these texts.
_excluded_upgrades = [
    "CANUIRNU",
    "CANUGRNU",
    "SCD",
    "CANU-LDNU",
    "IRNUs",
    "GRNUs",
    "Maximum Cost Responsibility (Network Upgrades)",
    "Network Upgrade",
    "One Time Costs (Note 1)",
    "(Note 1)",
    "(Note 2)",
]
df = df[~df['upgrade'].astype(str).isin(_excluded_upgrades)]

# Snapshot of the cleaned rows, written before any total rows are generated.
df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw/rawdata_cluster_5_style_E.csv', index=False)

# Build one "Total <type>" row per (q_id, type_of_upgrade) that has no total yet.
new_rows = []
columns_to_sum = ['estimated_cost_x_1000', 'escalated_cost_x_1000', 'total_estimated_cost_x_1000', 'total_estimated_cost_x_1000_escalated']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    for upgrade in group['type_of_upgrade'].dropna().unique():
        rows = group[group['type_of_upgrade'] == upgrade]

        # Skip the type when one of its own rows is already flagged as a total.
        # NOTE(review): this only looks at rows carrying exactly this label, so
        # an existing row labelled f"Total {upgrade}" is not detected here.
        # Confirm that is intended.
        if (rows['item'] == 'no').any():
            continue

        # Start from a row of empty strings covering every column.
        total_row = dict.fromkeys(df.columns, '')
        total_row['q_id'] = q_id
        total_row['type_of_upgrade'] = f"Total {upgrade}"
        total_row['item'] = 'no'

        # Project-level attributes are copied from the first row of the type.
        first_row = rows.iloc[0]
        for col in columns_to_populate:
            if col in df.columns:
                total_row[col] = first_row[col]

        # Cost columns are summed over the type; a missing column counts as 0.
        for col in columns_to_sum:
            total_row[col] = rows[col].sum() if col in rows.columns else 0

        new_rows.append(total_row)

if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    # Make sure the generated rows have exactly the columns of df, in order.
    for col in (c for c in df.columns if c not in total_rows_df.columns):
        total_rows_df[col] = None
    total_rows_df = total_rows_df[df.columns]
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)

# Recompute 'item': 'no' when either label field contains "Total", otherwise 'yes'.
def _item_flag(row):
    labels = (
        str(row.get('type_of_upgrade', '')),
        str(row.get('cost_allocation_factor', '')),
    )
    return 'no' if any('Total' in label for label in labels) else 'yes'


df['item'] = df.apply(_item_flag, axis=1)


# Step 8: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    # Take 'item' out of the list first and only then look up the anchor.
    # The previous code computed the anchor index before popping 'item' (both
    # branches of its if/else were identical), so whenever 'item' sat to the
    # left of 'type_of_upgrade' it ended up one column too far to the right.
    cols = [col for col in df.columns if col != 'item']
    cols.insert(cols.index('type_of_upgrade') + 1, 'item')
    df = df[cols]

 



def clean_estimated_time(value):
    """Trim a duration cell down to its leading number or range.

    For strings, the first number (optionally followed by hyphenated parts,
    e.g. "12-18") that is followed by whitespace and a word has everything
    after it removed; the result is stripped. Strings without such a match
    are only stripped. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    trimmed = re.sub(r'(\d+(?:-\w+)*)\s+\w+.*$', r'\1', value, flags=re.IGNORECASE)
    return trimmed.strip()

# Reduce the construct-time cells to their leading number or range.
if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)


def _strip_note_marker(label):
    # Remove "(note N)" markers, in any letter case, from text labels only.
    if isinstance(label, str):
        return re.sub(r'\s*\(note \d+\)', '', label, flags=re.IGNORECASE).strip()
    return label


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(_strip_note_marker)

 
# Second mapping pass: "Total ..." labels are folded back onto the plain type
# code, so item and total rows of one type share the same label.
mappings = {
    "PTO": 'PTO_IF',
    "PNU": "OPNU",
    "PTO's Interconnection Facilities": "PTO_IF",
    "RNUs, Estimated Costs, and Estimated Time to Construct Summary": "RNU",
    'Total PTO_IF': 'PTO_IF',
    'PTO_IF Total': 'PTO_IF',
    'Total RNU': 'RNU',
    'RNU Total': 'RNU',
    'Total LDNU': 'LDNU',
    'Total OPNU': 'OPNU',
    'Total CANU': 'CANU',
    'Total LOPNU': 'LOPNU',
    'Total ADNU': 'ADNU',
    'LDNU Total': 'LDNU',
    'Total Distribution Upgrades': 'Distribution Upgrades',
    'Distribution Upgrades Total': 'Distribution Upgrades',
    'Total Potential Distribution Upgrades': 'Potential Distribution Upgrades',
    'Total One Time Costs': 'One Time Costs',
    'Total Distribution': 'Distribution Upgrades',
}


def _plain_upgrade_type(label):
    # Only text labels are looked up; missing values pass through untouched.
    if isinstance(label, str):
        return mappings.get(label, label)
    return label


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(_plain_upgrade_type)



 

# A bare 'Total' label takes the type of the closest preceding non-'Total' row.
if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        current_label = df.at[i, 'type_of_upgrade']
        if current_label != 'Total':
            previous_type_of_upgrade = current_label
        elif previous_type_of_upgrade is not None:
            df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade

numeric_columns = [
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

# Missing text becomes the literal string 'None'.
for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].where(df[col].notna(), 'None')

# In the numeric columns, '-' placeholders and missing values become 0.
for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA).fillna(0)

# NOTE(review): this sort only runs when an 'original_order' column already
# exists; nothing visible in this cell creates one. Confirm whether the sort is
# meant to run.
if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


def reorder_columns(df):
    """
    Return ``df`` with the preferred columns first.

    The preferred columns that exist in ``df`` are placed first, in the order
    listed below; all other columns follow in their current relative order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    preferred = (
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    )

    leading = [name for name in preferred if name in df.columns]
    trailing = [name for name in df.columns if name not in leading]
    return df[leading + trailing]


df = reorder_columns(df)

# Remove rows whose 'upgrade' cell is one of these texts.
_excluded_upgrade_labels = [
    "CANUIRNU",
    "CANUGRNU",
    "SCD",
    "CANU-LDNU",
    "IRNUs",
    "GRNUs",
    "Maximum Cost Responsibility (Network Upgrades)",
    "Network Upgrade",
    "Plan of Service",
    "Shared Upgrades",
    "Element",
    "(Note 1)",
    "(Note 2)",
]
_is_excluded = df['upgrade'].astype(str).isin(_excluded_upgrade_labels)
df = df[~_is_excluded]

#df = remove_dollar_values_and_fill_nan(df, 'max_time_to_construct')
# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
 
    itemized_df = itemized_df.drop_duplicates(subset=['q_id','type_of_upgrade','upgrade']) 
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/02_intermediate/costs_phase_1_cluster_5_style_E_itemized.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    existing_totals_columns = [col for col in totals_columns if col in df.columns]
    totals_df = df[df['item'] == 'no'].drop(columns=existing_totals_columns, errors='ignore')
    # Define the cost columns.
    cost_cols = ['estimated_cost_x_1000', 'escalated_cost_x_1000']

    # Build an aggregation dictionary:
    # For columns not in grouping or cost_cols, we assume they are identical and take the first value.
    agg_dict = {col: 'first' for col in totals_df.columns 
                if col not in ['q_id', 'type_of_upgrade'] + cost_cols}

    # For the cost columns, we want to sum them.
    agg_dict.update({col: 'sum' for col in cost_cols})

    

    # Group by both q_id and type_of_upgrade using the aggregation dictionary.
    totals_df = totals_df.groupby(['q_id', 'type_of_upgrade'], as_index=False).agg(agg_dict)
    totals_df = reorder_columns(totals_df)
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/02_intermediate/costs_phase_1_cluster_5_style_E_total.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_1_cluster_14_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_1_cluster_14_total.csv'.")



if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())



Itemized rows saved to 'costs_phase_1_cluster_14_itemized.csv'.
Filtered Total rows saved to 'costs_phase_1_cluster_14_total.csv'.
['RNU' 'LDNU' 'PTO_IF' 'ADNU' 'Distribution Upgrades']
[871 887 888 890 891 892 897 902 903 904 908 909 911 913 914 922 923 925
 926 927 942]
[5]


  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[col] = df[col].fillna(0)


In [23]:
# Re-read the intermediate itemized CSV and write it unchanged (apart from the
# CSV round trip) into the 01_clean folder under the "_updated" name.
df1 = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/02_intermediate/costs_phase_1_cluster_5_style_E_itemized.csv')
df1.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/01_clean/costs_phase_1_cluster_5_style_E_itemized_updated.csv', index=False)

# Same for the totals CSV.
df2 = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/02_intermediate/costs_phase_1_cluster_5_style_E_total.csv')
df2.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/01_clean/costs_phase_1_cluster_5_style_E_total_updated.csv', index=False)


# Skipped projects (q_ids in the cluster range not yet present in the processed totals CSV)

In [16]:
import os
import re
import traceback
import pdfplumber
import PyPDF2
import pandas as pd

# ------------------- Configuration -------------------
BASE_DIRECTORY = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_COST = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw/costdata_cluster_5_style_others.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 5/03_raw/scraping_cluster_5_style_others_log.txt"
PROJECT_RANGE = range(869, 943)  # q_ids 869 through 942 inclusive

# Zero-based index of the first page that is scanned for the "Attachment 1" /
# "Attachment 2" markers. Pages before this index are never examined, so a
# value of 18 means the search starts on the 19th page of the PDF.
ATTACHMENT_SEARCH_START_PAGE = 18

# q_ids already present in the combined totals CSV are excluded; only the
# remaining q_ids of PROJECT_RANGE are handled by this cell.
processed_csv_path = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/all_clusters/costs_phase_1_all_clusters_total.csv"
processed_df = pd.read_csv(processed_csv_path)
processed_q_ids = (
    pd.to_numeric(processed_df['q_id'], errors='coerce')  # non-numeric ids become NaN
    .dropna()
    .astype(int)
    .unique()
)
projects_to_process = sorted(q_id for q_id in PROJECT_RANGE if q_id not in processed_q_ids)

# ------------------- Global Tracking Variables -------------------
# Filled in while the projects are processed and reported in the final summary.
scraped_projects = set()
skipped_projects = set()
multi_page_skipped_projects = set()
missing_projects = set()
total_pdfs_accessed = 0

# ------------------- Helper Function for Logging -------------------
def log_msg(msg, log_file):
    """Write ``msg`` to ``log_file`` and then echo it to standard output."""
    # print(..., file=None) falls back to sys.stdout.
    for stream in (log_file, None):
        print(msg, file=stream)

# ------------------- Other Helper Functions -------------------
def clean_column_headers(headers):
    """Normalize table header cells.

    ``None`` becomes an empty string. Strings are lower-cased, whitespace runs
    are collapsed to one space, parenthesised text is removed, every character
    that is not a letter, digit or whitespace is dropped, and the result is
    stripped. Any other value is passed through unchanged.

    Returns:
        list: One cleaned entry per input header, in the same order.
    """
    def _normalize(raw):
        if raw is None:
            return ""
        if not isinstance(raw, str):
            return raw
        text = re.sub(r'\s+', ' ', raw.lower())
        text = re.sub(r'\(.*?\)', '', text)
        return re.sub(r'[^a-zA-Z0-9\s]', '', text).strip()

    return [_normalize(raw) for raw in headers]

def clean_string_cell(value):
    """Return ``value`` as single-line, stripped text.

    ``None`` maps to an empty string; non-string values are converted with
    ``str()`` first. Newlines are replaced by spaces before stripping.
    """
    if value is None:
        return ""
    text = value if isinstance(value, str) else str(value)
    return text.replace('\n', ' ').strip()

def make_unique_headers(headers):
    """Return ``headers`` with repeated names disambiguated by a numeric suffix.

    The first occurrence of a header is kept as is; the n-th occurrence
    becomes ``f"{header}_{n}"``. If that generated name is already taken
    (for example the input also contains a literal ``"a_2"``), the counter is
    advanced until an unused name is found, so the result never contains
    duplicates.

    Args:
        headers: Iterable of hashable header values (strings or ``None``).

    Returns:
        list: Unique header names, one per input header, in input order.
    """
    occurrences = {}   # raw header -> highest counter used for it so far
    taken = set()      # every name already emitted
    unique_headers = []
    for header in headers:
        occurrences[header] = occurrences.get(header, 0) + 1
        count = occurrences[header]
        candidate = header if count == 1 else f"{header}_{count}"
        # A candidate can clash with a name emitted earlier (literal or
        # generated); keep counting until it is free.
        while candidate in taken:
            occurrences[header] += 1
            candidate = f"{header}_{occurrences[header]}"
        taken.add(candidate)
        unique_headers.append(candidate)
    return unique_headers

# ------------------- Appendix PDF Check & Base Data Extraction -------------------
def is_appendix_pdf(pdf_path):
    """Tell whether the first page of the PDF mentions "Appendix A".

    Returns False for a PDF without pages and for any error raised while
    reading it (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if pdf.pages:
                cover_text = pdf.pages[0].extract_text() or ""
                return "Appendix A" in cover_text
            return False
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

def is_addendum_pdf(pdf_path):
    """Tell whether the first page of the PDF mentions "Addendum" or "Revision".

    Returns False for a PDF without pages and for any error raised while
    reading it (the error is printed).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if pdf.pages:
                cover_text = pdf.pages[0].extract_text() or ""
                return any(marker in cover_text for marker in ("Addendum", "Revision"))
            return False
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

def extract_table1(pdf_path, log_file):
    """Extract the Point of Interconnection (POI) value from "Table A.1/B.1/C.1".

    Pages whose text matches ``Table [ABC].1`` are located first. Every page
    from the first match up to two pages past the last match (clamped to the
    end of the document) is then searched, with two table-detection settings,
    for a cell matching "Point of Interconnection" or "POI". The cell
    immediately to the right of that label is taken as the value.

    Args:
        pdf_path: Path of the PDF to read.
        log_file: Open text stream that receives the progress messages.

    Returns:
        The cleaned POI text when found; ``"Value Missing"`` when a POI label
        was seen but never had a non-empty cell to its right; ``None`` when no
        Table x.1 page or no POI label exists, or when reading the PDF fails
        (the error and traceback are written to ``log_file``).
    """
    poi_pattern = re.compile(r"(Point\s+of\s+Interconnection|POI)", re.IGNORECASE)
    table_settings_list = [
        {"horizontal_strategy": "text", "vertical_strategy": "lines", "snap_tolerance": 1},
        {"horizontal_strategy": "lines", "vertical_strategy": "lines", "snap_tolerance": 2}
    ]

    def _scan_page(page, page_number):
        """Search one page; return ``(poi_value_or_None, label_seen)``."""
        label_seen = False
        print(f"\nScraping tables on page {page_number + 1} for Table 1...", file=log_file)
        for attempt, table_settings in enumerate(table_settings_list, start=1):
            print(f"Attempt {attempt} with settings: {table_settings}", file=log_file)
            tables = page.find_tables(table_settings=table_settings)
            print(f"Found {len(tables)} table(s) on page {page_number + 1}", file=log_file)
            for table_index, table in enumerate(tables, start=1):
                tab = table.extract()
                if not tab:
                    print(f"Table {table_index} on page {page_number + 1} is empty.", file=log_file)
                    continue
                for row_index, row in enumerate(tab, start=1):
                    for cell_index, cell in enumerate(row):
                        if not (cell and poi_pattern.search(cell)):
                            continue
                        label_seen = True
                        where = f"(Page {page_number + 1}, Table {table_index}, Row {row_index})"
                        if cell_index + 1 >= len(row):
                            print(f"POI label found but no adjacent column {where}.", file=log_file)
                            continue
                        poi_value = clean_string_cell(row[cell_index + 1])
                        if poi_value:
                            print(f"Found POI: '{poi_value}' {where}", file=log_file)
                            return poi_value, True
                        print(f"POI label found but adjacent value empty {where}.", file=log_file)
        return None, label_seen

    # Remembers whether any POI label was encountered, so that "label without
    # value" can be told apart from "no label at all" after the search.
    label_seen = False
    try:
        with pdfplumber.open(pdf_path) as pdf:
            table1_pages = [
                i for i, page in enumerate(pdf.pages)
                if re.search(r"Table\s*[ABC]\.1\b", page.extract_text() or "", re.IGNORECASE)
            ]
            if not table1_pages:
                print("No Table 1 found in the PDF.", file=log_file)
                return None
            scrape_start = table1_pages[0]
            # Look two pages past the last match, but never beyond the last
            # page, so the logged range reflects pages that really exist.
            scrape_end = min(table1_pages[-1] + 2, len(pdf.pages) - 1)
            print(f"Table 1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)
            for page_number in range(scrape_start, scrape_end + 1):
                poi_value, seen_here = _scan_page(pdf.pages[page_number], page_number)
                label_seen = label_seen or seen_here
                if poi_value:
                    return poi_value
    except Exception as e:
        print(f"Error processing Table 1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None

    if label_seen:
        print("POI label found but no value.", file=log_file)
        return "Value Missing"
    print("POI not found in Table 1.", file=log_file)
    return None

def extract_base_data(pdf_path, project_id, log_file):
    """Extract the project-level attributes from an Appendix A PDF.

    The PDF text is read with PyPDF2 and searched with regular expressions for
    the deliverability status and the capacity; the point of interconnection
    comes from ``extract_table1``. The queue id is taken from ``project_id``
    and the cluster is fixed to ``'5'``.

    Args:
        pdf_path: Path of the PDF to read.
        project_id: Queue id of the project; stored as a string in ``q_id``.
        log_file: Open text stream that receives the progress messages.

    Returns:
        pd.DataFrame: A single-row frame with the columns ``q_id``,
        ``cluster``, ``req_deliverability``, ``capacity`` and
        ``point_of_interconnection``. An empty frame is returned when the PDF
        is not an Appendix A document or when extraction raises.
    """
    if not is_appendix_pdf(pdf_path):
        log_msg(f"Skipping base extraction because {pdf_path} is not an Appendix A PDF.", log_file)
        return pd.DataFrame()
    log_msg("Extracting base data from Appendix A PDF...", log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Pages are concatenated without a separator; pages that yield no
            # text contribute nothing.
            text = "".join(page.extract_text() or "" for page in reader.pages)
        text = clean_string_cell(text)
        queue_id = str(project_id)
        log_msg(f"Extracted Queue ID: {queue_id}", log_file)
        # The cluster is not parsed from the document; this cell only handles
        # cluster 5 projects.
        cluster_number = '5'
        log_msg(f"Extracted Cluster Number: {cluster_number}", log_file)
        deliverability_status = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = deliverability_status.group(1) if deliverability_status else None
        log_msg(f"Extracted Deliverability Status: {deliverability_status}", log_file)
        capacity = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if capacity:
            capacity = int(capacity.group(1))
        else:
            # NOTE(review): unlike the pattern above, this fallback is
            # case-sensitive, so it only matches a lower-case "mw" -- confirm
            # whether re.IGNORECASE was intended.
            capacity2 = re.search(r"(\d+)\s*mw", text)
            capacity = int(capacity2.group(1)) if capacity2 else None
        log_msg(f"Extracted Capacity: {capacity}", log_file)
        poi_value = extract_table1(pdf_path, log_file)
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "capacity": [capacity],
            "point_of_interconnection": [poi_value]
        }
        log_msg("Base data extracted:", log_file)
        log_msg(str(base_data), log_file)
        return pd.DataFrame(base_data)
    except Exception as e:
        log_msg(f"Error extracting base data from {pdf_path}: {e}", log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

# ------------------- New: Cost Data Extraction from Appendix A -------------------
def extract_cost_data_from_appendixA(pdf_path, project_id, log_file):
    """
    Collect the cost tables of an Appendix A PDF and attach the base data.

    Starting at page index ATTACHMENT_SEARCH_START_PAGE, the first page whose
    text contains "Attachment 1" opens the range and the first page containing
    "Attachment 2" closes it (that page itself is excluded; without it the
    range runs to the end of the document). Every non-blank table on those
    pages is turned into a DataFrame using its first row as header.

    If the same header row shows up on two different pages the table is
    treated as spanning several pages: the project is added to
    multi_page_skipped_projects and an empty DataFrame is returned.

    Returns:
        pd.DataFrame: The concatenated tables with the base-data columns
        repeated on every row, or an empty DataFrame when there is no base
        data, no "Attachment 1" page, no table, a multi-page table, or an
        error while reading the PDF.
    """
    log_msg(f"Extracting cost data tables from {pdf_path} for project {project_id}...", log_file)
    base_data_df = extract_base_data(pdf_path, project_id, log_file)
    if base_data_df.empty:
        return pd.DataFrame()

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_total = len(pdf.pages)
            first_cost_page = None
            stop_page = None
            # Pages before ATTACHMENT_SEARCH_START_PAGE are never inspected.
            for page_idx in range(ATTACHMENT_SEARCH_START_PAGE, page_total):
                page_text = pdf.pages[page_idx].extract_text() or ""
                if first_cost_page is None and "Attachment 1" in page_text:
                    first_cost_page = page_idx
                    log_msg(f"Found 'Attachment 1' header on page {page_idx+1}", log_file)
                if "Attachment 2" in page_text:
                    stop_page = page_idx
                    log_msg(f"Found 'Attachment 2' header on page {page_idx+1}", log_file)
                    break

            if first_cost_page is None:
                log_msg("Attachment 1 header not found in PDF.", log_file)
                return pd.DataFrame()
            if stop_page is None:
                stop_page = page_total
                log_msg("Attachment 2 header not found; processing until end of document.", log_file)

            frames = []
            page_of_header = {}   # header tuple -> page index where it was recorded
            spans_pages = False

            for page_idx in range(first_cost_page, stop_page):
                for raw_table in pdf.pages[page_idx].extract_tables():
                    if not raw_table:
                        continue
                    # Ignore tables that contain no non-blank cell at all.
                    if not any(cell and cell.strip() for row in raw_table for cell in row):
                        continue

                    # The first row is assumed to be the header.
                    header_key = tuple(make_unique_headers(raw_table[0]))
                    earlier_page = page_of_header.get(header_key)
                    if earlier_page is not None and earlier_page != page_idx:
                        spans_pages = True
                    else:
                        page_of_header[header_key] = page_idx

                    table_df = pd.DataFrame(raw_table)
                    if table_df.empty:
                        continue
                    table_df.columns = make_unique_headers(table_df.iloc[0].tolist())
                    table_df = table_df.iloc[1:].reset_index(drop=True)
                    frames.append(table_df.loc[:, ~table_df.columns.duplicated()])

            if spans_pages:
                log_msg("Detected a table that spans multiple pages. Skipping cost data for this project.", log_file)
                multi_page_skipped_projects.add(project_id)
                return pd.DataFrame()

            if frames:
                cost_data_df = pd.concat(frames, ignore_index=True)
                log_msg(f"Extracted {len(frames)} table(s) from pages {first_cost_page+1} to {stop_page}.", log_file)
            else:
                log_msg("No tables found in the specified page range.", log_file)
                cost_data_df = pd.DataFrame()
    except Exception as e:
        log_msg(f"Error extracting cost tables from {pdf_path}: {e}", log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

    # Without rows on either side there is nothing to merge.
    if cost_data_df.empty or base_data_df.empty:
        return cost_data_df

    # Repeat the single base-data row once per cost row and place the two
    # frames side by side.
    repeated_base = pd.concat([base_data_df] * len(cost_data_df), ignore_index=True)
    return pd.concat([repeated_base, cost_data_df], axis=1)

# ------------------- New: Process Cost Data for a Project -------------------
def process_cost_data_for_project(project_id, log_file):
    """Pick the Appendix A PDF of a project and extract its cost tables.

    The PDFs in ``<BASE_DIRECTORY>/<project_id>/02_phase_1_study`` are checked
    in sorted file-name order (``os.listdir`` order is arbitrary, so sorting
    makes the choice reproducible). The first original Appendix A PDF is
    used; when there is none, the first addendum/revision Appendix A PDF is
    used instead.

    Side effects: increments ``total_pdfs_accessed`` when a PDF is opened for
    extraction and records the project in ``scraped_projects`` or
    ``skipped_projects``. Projects that exit early (study folder missing, no
    Appendix A PDF) are recorded as skipped too, so they appear in the
    summary.

    Args:
        project_id: Queue id of the project.
        log_file: Open text stream that receives the progress messages.

    Returns:
        pd.DataFrame: The extracted cost data, or an empty DataFrame when
        nothing could be extracted.
    """
    global total_pdfs_accessed

    project_folder = os.path.join(BASE_DIRECTORY, str(project_id), "02_phase_1_study")
    if not os.path.exists(project_folder):
        log_msg(f"Project folder not found: {project_folder}", log_file)
        skipped_projects.add(project_id)
        return pd.DataFrame()

    # Gather Appendix A PDFs (original vs. addendum)
    original_appendix_pdfs = []
    addendum_appendix_pdf = None
    for f in sorted(os.listdir(project_folder)):
        if not f.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(project_folder, f)
        if not is_appendix_pdf(pdf_path):
            continue
        if is_addendum_pdf(pdf_path):
            # Only the first addendum is remembered, as a fallback.
            if not addendum_appendix_pdf:
                addendum_appendix_pdf = f
        else:
            original_appendix_pdfs.append(f)

    if original_appendix_pdfs:
        base_pdf = original_appendix_pdfs[0]
        log_msg(f"Using Appendix A PDF: {os.path.join(project_folder, base_pdf)}", log_file)
    elif addendum_appendix_pdf:
        log_msg(f"No original Appendix A PDF found for project {project_id}. Using addendum PDF for cost data extraction.", log_file)
        base_pdf = addendum_appendix_pdf
    else:
        log_msg(f"No Appendix A PDF (original or addendum) found for project {project_id}.", log_file)
        skipped_projects.add(project_id)
        return pd.DataFrame()

    pdf_path = os.path.join(project_folder, base_pdf)
    total_pdfs_accessed += 1
    cost_data_df = extract_cost_data_from_appendixA(pdf_path, project_id, log_file)
    if cost_data_df.empty:
        log_msg(f"No cost tables extracted for project {project_id}.", log_file)
        skipped_projects.add(project_id)
    else:
        log_msg(f"Extracted cost tables for project {project_id}.", log_file)
        scraped_projects.add(project_id)
    return cost_data_df

# ------------------- CSV Saving & Summary Functions -------------------
def save_to_csv(df, output_csv_path, data_type):
    """Clean every cell with ``clean_string_cell`` and write the frame to CSV.

    Args:
        df (pd.DataFrame): Data to save; nothing is written when it is empty.
        output_csv_path: Destination path of the CSV file (written without
            the index).
        data_type: Short label used in the printed status messages.

    Returns:
        None. Errors raised while writing are printed together with the
        traceback instead of being propagated.
    """
    if df.empty:
        print(f"No data to save for {data_type}.")
        return
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; fall back to applymap on older pandas versions.
    if hasattr(pd.DataFrame, "map"):
        df = pd.DataFrame.map(df, clean_string_cell)
    else:
        df = pd.DataFrame.applymap(df, clean_string_cell)
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

# ------------------- Main Processing Function -------------------
def process_pdfs_in_folder():
    """Run the cost-data extraction for every project in projects_to_process.

    Projects without a folder under BASE_DIRECTORY are recorded in
    missing_projects. The cost data of all other projects is combined and
    saved to OUTPUT_CSV_PATH_COST, and a summary of the module-level tracking
    sets is written to the log file and to standard output.
    """
    collected_frames = []
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)
    with open(LOG_FILE_PATH, 'w') as log_file:
        log_msg(f"Projects to process: {projects_to_process}", log_file)
        for project_id in projects_to_process:
            project_folder = os.path.join(BASE_DIRECTORY, str(project_id))
            if os.path.exists(project_folder):
                log_msg(f"\n--- Processing project {project_id} ---", log_file)
                cost_data_df = process_cost_data_for_project(project_id, log_file)
                if not cost_data_df.empty:
                    collected_frames.append(cost_data_df)
            else:
                missing_projects.add(project_id)
                log_msg(f"Project folder not found: {project_folder}", log_file)

        if collected_frames:
            combined_cost_data = pd.concat(collected_frames, ignore_index=True)
            save_to_csv(combined_cost_data, OUTPUT_CSV_PATH_COST, "cost data")
        else:
            log_msg("\nNo cost data extracted from any project.", log_file)

        # Projects flagged for multi-page tables are reported separately and
        # do not enter this total on their own.
        total_projects_processed = len(scraped_projects) + len(skipped_projects) + len(missing_projects)
        summary_lines = [
            "\n=== Scraping Summary ===",
            f"Total Projects Processed: {total_projects_processed}",
            f"Total Projects Scraped (with cost data): {len(scraped_projects)}",
            f"Total Projects Skipped: {len(skipped_projects)}",
            f"Total Projects with Multi-Page Table Skipped: {len(multi_page_skipped_projects)}",
            f"Total Projects Missing: {len(missing_projects)}",
            f"Total PDFs Accessed: {total_pdfs_accessed}",
            "\nList of Scraped Projects: " + str(sorted(scraped_projects)),
            "\nList of Skipped Projects: " + str(sorted(skipped_projects)),
            "\nList of Projects with Multi-Page Tables Skipped: " + str(sorted(multi_page_skipped_projects)),
            "\nList of Missing Projects: " + str(sorted(missing_projects)),
        ]
        for line in summary_lines:
            log_msg(line, log_file)

# ------------------- Main -------------------
def main():
    """Entry point: run the scraping pass over all projects to process."""
    process_pdfs_in_folder()

# Run the scraper when this code is executed as a script / notebook cell.
if __name__ == "__main__":
    main()


Projects to process: [869, 870, 872, 873, 874, 875, 876, 878, 879, 880, 881, 882, 883, 884, 886, 889, 893, 895, 896, 905, 906, 907, 912, 915, 918, 919, 920, 924, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941]

--- Processing project 869 ---
No Appendix A PDF (original or addendum) found for project 869.

--- Processing project 870 ---
No Appendix A PDF (original or addendum) found for project 870.
Project folder not found: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data/872

--- Processing project 873 ---
Using Appendix A PDF: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data/873/02_phase_1_study/11AS763774-C5PIMetroQ893AppA.pdf
Extracting cost data tables from /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data/873/02_phase_1_study/11AS763774-C5PIMetroQ893AppA.pdf for project 873...
Extracting base data from Appendix A PDF...
Extracted Queue ID: 873
Extracted Cluster Number: 5
Extracted Deliverabilit