# C2 Style R- table 11.1 and/or 11.2

In [None]:
pip install pytesseract

In [None]:
import pytesseract

# Note do not run again, had to punch in Q 558 manually 

In [71]:
import os
import pdfplumber
import pandas as pd
import re
import PyPDF2
import traceback

# Define paths and project range
BASE_DIRECTORY ="/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_ORIGINAL = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/ph2_rawdata_cluster2_style_R_originals.csv"
OUTPUT_CSV_PATH_ADDENDUM = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/ph2_rawdata_cluster2_style_R_addendums.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/ph2_scraping_cluster2_style_R_log.txt"
PROJECT_RANGE = range(552, 609)  # Inclusive range for q_ids in Clusters 2 range(667, 860)


# Read the CSV file containing processed projects (with q_id column)
processed_csv_path = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/all_clusters/costs_phase_2_all_clusters_total.csv"  # UPDATE THIS PATH
processed_df = pd.read_csv(processed_csv_path)
# Convert q_id values to numeric then to int for filtering
processed_q_ids = pd.to_numeric(processed_df['q_id'], errors='coerce').dropna().astype(int).unique()
# Now build the list of folders to process:
projects_to_process = []
for folder in os.listdir(BASE_DIRECTORY):
    full = os.path.join(BASE_DIRECTORY, folder)
    if not os.path.isdir(full):
        continue
    m = re.match(r'^(\d+)', folder)
    if not m:
        continue
    num = int(m.group(1))
    if num in PROJECT_RANGE and num not in processed_q_ids:
        projects_to_process.append(folder)

# Sort by the integer prefix, not lexicographically as strings
projects_to_process = sorted(projects_to_process, key=lambda f: int(re.match(r'^(\d+)', f).group(1)))


print("Will process these project‐folders:")
print(projects_to_process)

# Initialize DataFrames
core_originals = pd.DataFrame()
core_addendums = pd.DataFrame()

# Initialize tracking variables
scraped_projects = set()
skipped_projects = set()
missing_projects = set()
scraped_pdfs = []
skipped_pdfs = []
addendum_pdfs = []
original_pdfs = []
style_n_pdfs = []  # List to track style N PDFs
total_pdfs_accessed = 0
total_pdfs_scraped = 0
total_pdfs_skipped = 0
total_pdfs_skipped_extraction = 0
original_has_table7 = {}  # Dictionary to track if original PDFs have table7

def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters, but keeps parentheses."""
    cleaned_headers = []
    for header in headers:
        if header is None:
            header = ""
        elif isinstance(header, str):
            header = header.lower()
            # collapse internal whitespace
            header = re.sub(r'\s+', ' ', header)
            # strip out everything except letters, digits, spaces, and parentheses
            header = re.sub(r'[^a-z0-9\s\(\)]', '', header)
            header = header.strip()
        cleaned_headers.append(header)
    return cleaned_headers

def clean_string_cell(value):
    """Cleans string cells by removing newlines and trimming spaces."""
    if isinstance(value, str):
        return value.replace('\n', ' ').strip()
    elif value is None:
        return ""
    else:
        return str(value).replace('\n', ' ').strip()
     

def contains_phrase(row, phrase):
    """Checks if any cell in a row contains a specific phrase."""
    regex_pattern = re.sub(r"\s+", r"\\s*", phrase)
    pattern = re.compile(regex_pattern, flags=re.IGNORECASE)
    return row.astype(str).apply(lambda cell: bool(pattern.search(cell))).any()

def extract_specific_phrase(title):
    """
    Extracts a specific phrase from the table title based on predefined keywords.

    Args:
        title (str): The table title string.

    Returns:
        str: The extracted specific phrase if found, else the original title.
    """
    phrases = [
        "PTO",
        "Reliability Network Upgrade",
        "Area Delivery Network Upgrade",
        "Local Delivery Network",
         
        
        "Other Potential Network Upgrade",
        "Area Delivery Network Upgrades",
        "Conditionally Assigned Network Upgrades",
        "Local Off-Peak Network Upgrade",
        "ADNU",
        "LDNU",
        "RNU"
    ]

    for phrase in phrases:
        if  re.search(rf"\b{re.escape(phrase)}\b(?=\d|\W|$)", title, re.IGNORECASE):
        
         #re.search(rf"\b{re.escape(phrase)}\b", title, re.IGNORECASE):
            return phrase
    return title  # Fallback to the entire title if no specific phrase is found

def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type of upgrade",
        "upgrade",
        "description",
        "cost allocation factor"
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df

def search_gps_coordinates(text, log_file):
    """Search for GPS coordinates using multiple patterns."""
    gps_coords = re.search(r"gps coordinates:\s*([\d\.\-]+),\s*([\d\.\-]+)", text, re.IGNORECASE)
    if gps_coords:
        print(f"Found GPS coordinates: {gps_coords.groups()}", file=log_file)
        return gps_coords.groups()

    project_coords = re.search(r"latitude[:\s]*([\d\.\-]+)[^\d]+longitude[:\s]*([\d\.\-]+)", text, re.IGNORECASE)
    if project_coords:
        print(f"Found project coordinates: {project_coords.groups()}", file=log_file)
        return project_coords.groups()

    gps_coords_directional = re.search(
        r"gps coordinates:\s*([\d\.\-]+)\s*[nNsS],\s*([\d\.\-]+)\s*[eEwW]", text, re.IGNORECASE)
    if gps_coords_directional:
        lat, lon = gps_coords_directional.groups()
        latitude = lat if "N" in text.upper() else f"-{lat}"  # Adjust latitude sign
        longitude = lon if "E" in text.upper() else f"-{lon}"  # Adjust longitude sign
        print(f"Found directional GPS coordinates: {(latitude, longitude)}", file=log_file)
        return (latitude, longitude)

    print("GPS coordinates not found.", file=log_file)
    return (None, None)

def extract_table1(pdf_path, log_file):
    """
    Extracts the Point of Interconnection from Table 1 in the provided PDF.
    Implements a retry mechanism with different table extraction settings if initial attempts fail.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.

    Returns:
        str: Extracted Point of Interconnection value,
             "Value Missing" if label found but no value,
             or None if not found.
    """
    print(f"\nProcessing {pdf_path} for Table 1 extraction...", file=log_file)
    point_of_interconnection = None

    # Define the regex pattern for 'Point of Interconnection' (case-insensitive)
    poi_pattern = re.compile(r"Point\s+of\s+Interconnection", re.IGNORECASE)

    # Define different table extraction settings to try
    table_settings_list = [
        {
            "horizontal_strategy": "text",
            "vertical_strategy": "lines",
            "snap_tolerance": 1,
        },
        {
            "horizontal_strategy": "lines",
            "vertical_strategy": "lines",
            "snap_tolerance": 2,  # Increased tolerance for retry
        }
    ]

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify all pages that contain "Table 1"
            table1_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"Table\s*2[\.-]1\b", text, re.IGNORECASE):
                    table1_pages.append(i)

            if not table1_pages:
                print("No Table 1 found in the PDF.", file=log_file)
                return None  # Return None if no Table 1 found

            first_page = table1_pages[0]
            last_page = table1_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 2  # Plus one to include the next page if needed

            print(f"Table 1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)

            # Flag to indicate if extraction was successful
            extraction_successful = False

            # Iterate through the specified page range
            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table 1...", file=log_file)

                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"\nAttempt {attempt} with table settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1} with current settings.", file=log_file)

                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty. Skipping.", file=log_file)
                            continue  # Skip empty tables

                        print(f"\n--- Table {table_index} on Page {page_number + 1} ---", file=log_file)
                        for row_num, row in enumerate(tab, start=1):
                            print(f"Row {row_num}: {row}", file=log_file)

                        # Iterate through each row in the table
                        for row_index, row in enumerate(tab, start=1):
                            # Iterate through each cell in the row
                            for cell_index, cell in enumerate(row, start=1):
                                if cell and poi_pattern.search(cell):
                                    # Assuming the next column contains the value
                                    poi_col_index = cell_index  # 1-based index
                                    adjacent_col_index = poi_col_index + 1  # Next column

                                    if adjacent_col_index <= len(row):
                                        poi_value = clean_string_cell(row[adjacent_col_index - 1])
                                        if poi_value:  # Check if the value is not empty
                                            point_of_interconnection = poi_value
                                            print(f"\nFound Point of Interconnection: '{point_of_interconnection}' "
                                                  f"(Page {page_number + 1}, Table {table_index}, Row {row_index})", file=log_file)
                                            extraction_successful = True
                                            break  # Exit the cell loop
                                        else:
                                            print(f"\nPoint of Interconnection label found but adjacent value is empty "
                                                  f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                            # Proceed to scan surrounding rows for the value
                                            poi_value_parts = []

                                            # Define the range to scan: two rows above and two rows below
                                            # Convert to 0-based index
                                            current_row_idx = row_index - 1
                                            start_scan = max(0, current_row_idx - 2)
                                            end_scan = min(len(tab), current_row_idx + 2)  # Exclusive

                                            print(f"Scanning rows {start_scan + 1} to {end_scan} for POI value parts.", file=log_file)

                                            for scan_row_index in range(start_scan, end_scan):
                                                # Skip the current row where the label was found
                                                if scan_row_index == current_row_idx:
                                                    continue

                                                scan_row = tab[scan_row_index]
                                                # Ensure the adjacent column exists in the scan row
                                                if adjacent_col_index - 1 < len(scan_row):
                                                    scan_cell = clean_string_cell(scan_row[adjacent_col_index - 1])
                                                    if scan_cell and not poi_pattern.search(scan_cell):
                                                        poi_value_parts.append(scan_cell)
                                                        print(f"Found POI part in row {scan_row_index + 1}: '{scan_cell}'", file=log_file)
                                                    elif poi_pattern.search(scan_cell):
                                                        # If another POI label is found, skip it
                                                        print(f"Encountered another POI label in row {scan_row_index + 1}. Skipping this row.", file=log_file)
                                                        continue

                                            if poi_value_parts:
                                                # Concatenate the parts to form the complete POI value
                                                point_of_interconnection = " ".join(poi_value_parts)
                                                print(f"\nConcatenated Point of Interconnection: '{point_of_interconnection}' "
                                                      f"(Page {page_number + 1}, Table {table_index})", file=log_file)
                                                extraction_successful = True
                                                break  # Exit the cell loop
                                            else:
                                                print(f"\nNo POI value found in the surrounding rows "
                                                      f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                                # Do not return immediately; proceed to retry
                                    else:
                                        print(f"\nPoint of Interconnection label found but no adjacent column "
                                              f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                        # Do not return immediately; proceed to retry
                            if extraction_successful:
                                break  # Exit the row loop
                        if extraction_successful:
                            break  # Exit the table loop
                    if extraction_successful:
                        break  # Exit the attempt loop
                if extraction_successful:
                    break  # Exit the page loop

    except Exception as e:
        print(f"Error processing Table 1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None

    if not extraction_successful:
        # After all attempts, determine the appropriate return value
        if point_of_interconnection is not None and point_of_interconnection != "":
            # Label was found but no value
            print("Point of Interconnection label found but no adjacent value.", file=log_file)
            return "Value Missing"
        else:
            # Label not found
            print("Point of Interconnection not found in Table 1.", file=log_file)
            return None

    return point_of_interconnection

def extract_base_data(pdf_path, project_id, log_file):
    """Extract base data from the PDF and return as a DataFrame."""
    print("Extracting base data from PDF...", file=log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text

        text = clean_string_cell(text)

        queue_id = re.search(r"q[\s_-]*(\d+)", text, re.IGNORECASE)
        queue_id =  str(project_id)  # Use project_id if queue_id is not found
        print(f"Extracted Queue ID: {queue_id}", file=log_file)

        # Updated Cluster Extraction
        clusters = re.findall(r"queue[\s_-]*cluster[\s_-]*(\d+)", text, re.IGNORECASE)
        if '2' in clusters:
            cluster_number = '2'
        elif clusters:
            cluster_number = max(clusters, key=lambda x: int(x))  # Choose the highest cluster number found
        else:
            cluster_number = '2'  # Default to 2 if not found
        print(f"Extracted Cluster Number: {cluster_number}", file=log_file)

        deliverability_status = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = deliverability_status.group(1) if deliverability_status else None
        print(f"Extracted Deliverability Status: {deliverability_status}", file=log_file)

        # Extract Capacity
        capacity = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if capacity:
            capacity = int(capacity.group(1))
        else:
            capacity2 = re.search(r"(\d+)\s*mw", text)
            capacity = int(capacity2.group(1)) if capacity2 else None
        print(f"Extracted Capacity: {capacity}", file=log_file)

        # Extract Point of Interconnection
        point_of_interconnection = extract_table1(pdf_path, log_file)

        latitude, longitude = search_gps_coordinates(text, log_file)

        # Initialize base data dictionary
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "latitude": [latitude],
            "longitude": [longitude],
            "capacity": [capacity],
            "point_of_interconnection": [point_of_interconnection]
        }

        print("Base data extracted:", file=log_file)
        print(base_data, file=log_file)
        return pd.DataFrame(base_data)

    except Exception as e:
        print(f"Error extracting base data from {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()  # Return empty DataFrame on error

def adjust_rows_length(data_rows, headers):
    """Ensure each row in data_rows matches the length of headers by truncating or padding."""
    col_count = len(headers)
    for i in range(len(data_rows)):
        row = data_rows[i]
        if len(row) > col_count:
            data_rows[i] = row[:col_count]
        elif len(row) < col_count:
            data_rows[i].extend([""]*(col_count - len(row)))

def extract_table7(pdf_path, log_file, is_addendum=False):
    """
    Extracts Table 11 data from the provided PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.
        is_addendum (bool): Whether the PDF is an addendum.

    Returns:
        pd.DataFrame: Extracted Table 11 data.
    """
    print(f"\nProcessing {pdf_path} for Table 11 extraction...", file=log_file)
    extracted_tables = []
    specific_phrase = None

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify pages that contain "Table 11-1" to "Table 11-2" with hyphen or dot
            table7_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"Table\s*11\s*[\.-]\s*[1-2]\s*:", text, re.IGNORECASE): # the \s* is to match any whitespace between the table number and the colon
                    table7_pages.append(i)

            if not table7_pages:
                print("No Table 11-1 to 11-2 found in the PDF.", file=log_file)
                return pd.DataFrame()

            first_page = table7_pages[0]
            last_page = table7_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 2  # Plus two to include possible continuation

            print(f"Table 11 starts on page {scrape_start} and ends on page {scrape_end}", file=log_file)

            for page_number in range(scrape_start, min(scrape_end, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1}...", file=log_file)
                tables = page.find_tables(table_settings={
                    "horizontal_strategy": "lines",
                    "vertical_strategy": "lines",
                    
                })

                for table_index, table in enumerate(tables):
                    tab = table.extract()
                    if not tab:
                        print(f"Table {table_index + 1} on page {page_number + 1} is empty. Skipping.", file=log_file)
                        continue

                    table_bbox = table.bbox
                    title_bbox = (0, 0, page.width, table_bbox[1])
                    title_text = page.within_bbox(title_bbox).extract_text() or ""
                    table_title = None

                    if title_text:
                        title_lines = title_text.split('\n')[::-1]
                        for line in title_lines:
                            line = line.strip()
                            match = re.search(r"(Modification\s+of\s+)?Table\s*11[\.-]([1-2])[:\-\s]*(.*)", line, re.IGNORECASE)
                            if match:
                                try:
                                    table_title = match.group(3).strip()  # match.group(2) is just the sub-table number ("1" or "2")   # match.group(3) is everything after "Table 11-X:", e.g. "Upgrades, Estimated Costs,
                                except IndexError:
                                    table_title = match.group(0).strip()
                                    print("Fallback to whole match for table title", file=log_file)

 
                


                    if table_title:
                        print(f"Table Title: {table_title}")
                        if re.search(r"PTO Interconnection Facilities Cost Estimate Summary", table_title, re.IGNORECASE):
                            print(f"Skipping Table 11-1 PTO on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            continue

 

                        

                        # New Table 11 detected
                        specific_phrase = extract_specific_phrase(table_title)
                        print(f"New Table 11 detected: '{specific_phrase}' on page {page_number + 1}, table {table_index + 1}", file=log_file)

                        headers = clean_string_cell(tab[0])
                        headers = clean_column_headers(tab[0])
                        data_rows = tab[1:]

                        # Create DataFrame for new table
                        try:
                            df_new = pd.DataFrame(data_rows, columns=headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for new table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue

                            # ← INSERT this block:
                        if df_new.empty:
                            # store an empty DF with the right columns,
                            # so that continuation blocks can append to it
                            extracted_tables.append(pd.DataFrame(columns=headers))
                            print(f"Header-only Table 11 (‘{specific_phrase}’) detected on page {page_number+1}; waiting for continuation…", file=log_file)
                            continue

                        # Handle ADNU-specific grouping
                        if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                            print("Detected 'Area Delivery Network Upgrade' table (new).", file=log_file)
                            if "adnu" in df_new.columns:
                                if "type of upgrade" not in df_new.columns:
                                    # Group all adnu rows into one 'upgrade' row
                                    adnu_values = df_new["adnu"].dropna().astype(str).tolist()
                                    grouped_adnu = " ".join(adnu_values)
                                    other_columns = df_new.drop(columns=["adnu"]).iloc[0].to_dict()

                                    df_grouped = pd.DataFrame({
                                        "upgrade": [grouped_adnu],
                                        "type of upgrade": [specific_phrase]
                                    })

                                    for col, value in other_columns.items():
                                        df_grouped[col] = value

                                    print("Grouped all 'adnu' rows into a single 'upgrade' row for new ADNU table.", file=log_file)
                                    df_new = df_grouped
                                else:
                                    # If 'type of upgrade' exists, just rename adnu if needed
                                    if "upgrade" in df_new.columns:
                                        df_new.drop(columns=['adnu'], inplace=True)
                                        print("Dropped 'adnu' column to avoid duplicate 'upgrade'.", file=log_file)
                                    else:
                                        df_new.rename(columns={'adnu': 'upgrade'}, inplace=True)
                                        print("Renamed 'adnu' to 'upgrade' in new ADNU table.", file=log_file)
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new ADNU table.", file=log_file)
                            else:
                                # If 'type of upgrade' exists and first row is none, replace only first row if needed
                                if df_new.empty:
                                    # should never happen once you’ve done step 1, but safe to check
                                    continue
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    df_new.at[0, "type of upgrade"] = specific_phrase
                                    print("Replaced None in 'type of upgrade' first row for new ADNU table.", file=log_file)
                        else:
                            # Non-ADNU new tables
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new non-ADNU table.", file=log_file)
                            else:
                                # If 'type of upgrade' exists and first row is none, replace only first row if needed
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print("Replacing None in 'type of upgrade' for the first row in new non-ADNU table.", file=log_file)
                                    df_new.at[0, "type of upgrade"] = specific_phrase

                        # Ensure no duplicate columns
                        #if df_new.columns.duplicated().any():
                        #    print("Duplicate columns detected in new table. Dropping duplicates.", file=log_file)
                        #    df_new = df_new.loc[:, ~df_new.columns.duplicated()]

                        if df_new.columns.duplicated().any():
                            print("Duplicate columns detected in new table. Renaming instead of dropping.", file=log_file)


                            

                            # Build a new list of column names, appending _1, _2, … to repeats
                            new_cols = []
                            counts = {}  # keep track of how many times we've seen each base name
                            for orig in df_new.columns:
                                # 1) Decide on a non‐blank base name:
                                #    If `orig` is blank/None/whitespace, use "column" instead.
                                if pd.isna(orig) or str(orig).strip() == "":
                                    base = "column"
                                else:
                                    base = str(orig).strip()

                                # 2) Increment a counter for that base‐name:
                                if base not in counts:
                                    counts[base] = 0
                                    new_cols.append(base)
                                else:
                                    counts[base] += 1
                                    new_cols.append(f"{base}_{counts[base]}")
 
                            df_new.columns = new_cols


                        extracted_tables.append(df_new)
                    else:
                        # Continuation Table
                        if not extracted_tables:
                            print(f"No previous Table 11 detected to continue with on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue

                        last_table = extracted_tables[-1]
                        expected_columns = last_table.columns.tolist()

                        print(f"Continuation Table detected on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        data_rows = tab

                        # Check if the first row is a header row
                        #  we will treat all continuation table rows as data points
                        # without any header detection
                        # However,  checking if there is a header row first,  
                        # Detect if first row is a header
                        header_keywords = ["type of upgrade", "adnu", "MW at POI", "upgrade"]
                        first_row = data_rows[0] if data_rows else []
                        is_header_row = any(
                            any(re.search(rf"\b{kw}\b", clean_string_cell(cell).lower()) for kw in header_keywords)
                            for cell in first_row
                        )

                        if is_header_row:
                            # Handle header row in continuation table
                            headers = clean_string_cell(first_row)
                            headers = clean_column_headers(first_row)
                            data_rows = data_rows[1:]  # Exclude header row

                            # Update expected_columns by adding new columns if any
                            new_columns = [col for col in headers if col not in expected_columns]
                            if new_columns:
                                expected_columns.extend(new_columns)
                                print(f"Added new columns from continuation table: {new_columns}", file=log_file)

                            # Create a mapping of new columns to add with default NaN
                            for new_col in new_columns:
                                last_table[new_col] = pd.NA

                            # Reindex last_table to include new columns
                            last_table = last_table.reindex(columns=expected_columns)
                            extracted_tables[-1] = last_table

                            # Update 'type of upgrade' column in the first row if needed
                            if "type of upgrade" in headers:
                                type_upgrade_idx = headers.index("type of upgrade")
                                if pd.isna(data_rows[0][type_upgrade_idx]) or data_rows[0][type_upgrade_idx] == "":
                                    data_rows[0][type_upgrade_idx] = specific_phrase
                                    print(f"Replaced None in 'type of upgrade' first row for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            elif "upgrade" in headers:
                                upgrade_idx = headers.index("upgrade")
                                if pd.isna(data_rows[0][upgrade_idx]) or data_rows[0][upgrade_idx] == "":
                                    data_rows[0][upgrade_idx] = specific_phrase
                                    print(f"Replaced None in 'upgrade' first row for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            else:
                                # If 'type of upgrade' or 'upgrade' does not exist, add it
                                headers.append("type of upgrade")
                                expected_columns.append("type of upgrade")
                                for idx, row in enumerate(data_rows):
                                    data_rows[idx].append(specific_phrase)
                                print(f"Added 'type of upgrade' column and filled with '{specific_phrase}' for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

                            # Handle ADNU-specific logic if applicable
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                if "adnu" in headers:
                                    if "upgrade" not in headers:
                                        # Rename 'adnu' to 'upgrade'
                                        adnu_idx = headers.index("adnu")
                                        headers[adnu_idx] = "upgrade"
                                        for row in data_rows:
                                            row[adnu_idx] = " ".join([str(cell) for cell in row[adnu_idx] if pd.notna(cell)])
                                        print("Renamed 'adnu' to 'upgrade' in continuation ADNU table.", file=log_file)
                                # Ensure 'type of upgrade' column is filled
                                if "type of upgrade" not in headers:
                                    headers.append("type of upgrade")
                                    expected_columns.append("type of upgrade")
                                    for row in data_rows:
                                        row.append(specific_phrase)
                                    print("Added 'type of upgrade' column with specific phrase for continuation ADNU table.", file=log_file)

                        else:
                            # No header row detected, treat all rows as data points
                            print(f"No header row detected in continuation table on page {page_number + 1}, table {table_index + 1}. Treating all rows as data.", file=log_file)

                        # Create DataFrame for continuation table
                        if is_header_row:
                            try:
                                df_continuation = pd.DataFrame(data_rows, columns=headers)
                            except ValueError as ve:
                                print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                                continue
                        else:
                            # Create DataFrame with expected_columns
                            # Handle cases where continuation table has more columns
                            standardized_data = []
                            for row in data_rows:
                                if len(row) < len(expected_columns):
                                    # Insert 'type of upgrade' or 'upgrade' with specific_phrase
                                    if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                        # For ADNU tables, assume missing "upgrade" column
                                        missing_cols = len(expected_columns) - len(row)
                                        #row += [specific_phrase] * missing_cols
                                        data_rows = [row[:2] + [specific_phrase] + row[2:] for row in data_rows]
                                        print(f"Inserted '{specific_phrase}' for missing columns in ADNU continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                                    else:
                                        # For non-ADNU tables, assume missing "type of upgrade" column
                                        missing_cols = len(expected_columns) - len(row)
                                        #row += [specific_phrase] * missing_cols
                                        data_rows = [ [specific_phrase]  for row in data_rows]
                                        print(f"Inserted '{specific_phrase}' for missing columns in non-ADNU continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                                elif len(row) > len(expected_columns):
                                    # Add new columns with default names
                                    extra_cols = len(row) - len(expected_columns)
                                    for i in range(extra_cols):
                                        new_col_name = f"column{len(expected_columns) + 1 + i}"
                                        expected_columns.append(new_col_name)
                                        last_table[new_col_name] = pd.NA
                                        print(f"Added new column '{new_col_name}' for extra data in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                                    row = row[:len(expected_columns)]

                                row_dict = dict(zip(expected_columns, [clean_string_cell(cell) for cell in row]))

                                # Handle 'type of upgrade' column
                                if "type of upgrade" in row_dict and (pd.isna(row_dict["type of upgrade"]) or row_dict["type of upgrade"] == ""):
                                    row_dict["type of upgrade"] = specific_phrase
                                    print(f"Replaced None in 'type of upgrade' for a row in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

                                standardized_data.append(row_dict)

                            try:
                                df_continuation = pd.DataFrame(standardized_data, columns=expected_columns)
                            except ValueError as ve:
                                print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                                continue


                             # Special Handling for "Area Delivery Network Upgrade" Tables in Continuation
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                if "type of upgrade" in df_continuation.columns:
                                    first_row = df_continuation.iloc[0]
                                    if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                        print(f"Replacing 'None' in 'type of upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                        df_continuation.at[0, "type of upgrade"] = specific_phrase
                                else:
                                    # If "type of upgrade" column does not exist, add it
                                    df_continuation["type of upgrade"] = specific_phrase
                                    print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                            else:
                                # General Handling for other tables
                                if "type of upgrade" in df_continuation.columns:
                                    first_row = df_continuation.iloc[0]
                                    if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                        print(f"Replacing 'None' in 'Type of Upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                        df_continuation.at[0, "type of upgrade"] = specific_phrase
                                else:
                                    # If "Type of Upgrade" column does not exist, add it
                                    df_continuation["type of upgrade"] = specific_phrase
                                    print(f"'Type of Upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)


 

                        # Ensure no duplicate columns
                        if df_continuation.columns.duplicated().any():
                            print(f"Duplicate columns detected in continuation table on page {page_number + 1}, table {table_index + 1}. renaming duplicates.", file=log_file)
                                                        # Build a new list of column names, appending _1, _2, … to repeats
                            new_cols = []
                            counts = {}  # keep track of how many times we've seen each base name
                            for orig in df_continuation.columns:
                                # 1) Decide on a non‐blank base name:
                                #    If `orig` is blank/None/whitespace, use "column" instead.
                                if pd.isna(orig) or str(orig).strip() == "":
                                    base = "column"
                                else:
                                    base = str(orig).strip()

                                # 2) Increment a counter for that base‐name:
                                if base not in counts:
                                    counts[base] = 0
                                    new_cols.append(base)
                                else:
                                    counts[base] += 1
                                    new_cols.append(f"{base}_{counts[base]}")
 
                            df_continuation.columns = new_cols
                            #df_continuation = df_continuation.loc[:, ~df_continuation.columns.duplicated()]

                        # Merge with the last extracted table
                        extracted_tables[-1] = pd.concat([extracted_tables[-1], df_continuation], ignore_index=True, sort=False)
                        print(f"Appended continuation table data to the last extracted table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

    except Exception as e:
        print(f"Error processing Table 11 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

    # After processing all tables, concatenate them
    if extracted_tables:
        all_columns = set()
        for df in extracted_tables:
            all_columns.update(df.columns.tolist())

        standardized_tables = []
        for df in extracted_tables:
            standardized_df = df.reindex(columns=all_columns)
            standardized_tables.append(standardized_df)

        print("\nConcatenating all extracted Table 11 data...", file=log_file)
        try:
            table7_data = pd.concat(standardized_tables, ignore_index=True, sort=False)
            print(f"Successfully concatenated {len(standardized_tables)} tables.", file=log_file)
        except Exception as e:
            print(f"Error concatenating tables: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            table7_data = pd.DataFrame()
    else:
        print("No Table 11 data extracted.", file=log_file)
        table7_data = pd.DataFrame()

    return table7_data


'''
def extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False):
    """Extracts Table 11 data and merges with base data."""
    base_data = extract_base_data(pdf_path, project_id, log_file)
    table7_data = extract_table7(pdf_path, log_file, is_addendum)

    if table7_data.empty:
        return base_data
    else:
        # Identify overlapping columns excluding 'point_of_interconnection'
        overlapping_columns = base_data.columns.intersection(table7_data.columns).difference(['point_of_interconnection'])
        table7_data = table7_data.drop(columns=overlapping_columns, errors='ignore')
        
        # Repeat base data for each row in table7_data
        base_data_repeated = pd.concat([base_data] * len(table7_data), ignore_index=True)
        
        try:
            # Concatenate base data with Table 11 data along columns
            merged_df = pd.concat([base_data_repeated, table7_data], axis=1, sort=False)
            
            # Ensure 'point_of_interconnection' is present and correctly populated
            if 'point_of_interconnection' not in merged_df.columns:
                merged_df['point_of_interconnection'] = base_data['point_of_interconnection'].iloc[0]
                print(f"Added 'point_of_interconnection' to merged data for {pdf_path}.", file=log_file)
            
            print(f"Merged base data with Table 11 data for {pdf_path}.", file=log_file)
            return merged_df
        except Exception as e:
            print(f"Error merging base data with Table 11 data for {pdf_path}: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            return base_data  # Fallback to base data only
'''

def extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False):
    """
    Extracts Table 11 data and merges with base data.
    Returns:
      df       – either base_data or base_data×Table4 rows merged
      status   – one of "no_marker", "failed", or "success"
    """
    # 1) Pull out base data
    base_data   = extract_base_data(pdf_path, project_id, log_file)
    # 2) Did we even see a Table 11 marker in the text?
    has_marker  = check_has_table7(pdf_path)
    if not has_marker:
        print(f"No Table 11 marker found in {os.path.basename(pdf_path)}; skipping extraction.", 
              file=log_file)
        return base_data, "no_marker"

    # 2) Try to scrape Table 11
    table7_data = extract_table7(pdf_path, log_file, is_addendum)
    if table7_data.empty:
        print(f"Table 11 marker found in {os.path.basename(pdf_path)}, "
              f"but extraction returned empty DataFrame.", file=log_file)
        return base_data, "failed"

    # 2) We got actual rows → merge and return
    #    Drop any overlapping columns first
    overlapping = base_data.columns.intersection(table7_data.columns)
    if not overlapping.empty:
        table7_data = table7_data.drop(columns=overlapping, errors="ignore")

    #    Repeat base_data for each row of table7_data
    base_rep   = pd.concat([base_data] * len(table7_data), ignore_index=True)
    merged_df  = pd.concat([base_rep, table7_data.reset_index(drop=True)], axis=1, sort=False)

    print(f"Merged base data with {len(table7_data)} row(s) of Table 11 for "
          f"{os.path.basename(pdf_path)}.", file=log_file)
    return merged_df, "success"





def check_has_table7(pdf_path):
    """Checks if the PDF contains Table 11-1 to 2-2."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text() or ""
                if re.search(r"Table\s*11[\.-][1-2]\b", text, re.IGNORECASE):
                    return True
    except Exception as e:
        # Handle potential errors when opening PDF
        return False
    return False

def has_network_upgrade_type_column(pdf_path, log_file):
    """Checks if any table in the PDF has a column header 'Network Upgrade Type'."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                tables = page.find_tables()
                for table_index, table in enumerate(tables, start=1):
                    tab = table.extract()
                    if not tab:
                        continue
                    headers = clean_column_headers(tab[0])
                    if "network upgrade type" in headers:
                        print(f"Found 'Network Upgrade Type' in PDF {pdf_path} on page {page_number}, table {table_index}.", file=log_file)
                        return True
    except Exception as e:
        print(f"Error checking 'Network Upgrade Type' in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
    return False

 

def is_addendum(pdf_path, log_file):
    """Checks if the PDF is an addendum by searching for 'Addendum', 'Addendum #3', or 'Revision' on the first page."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                return False

            first_page = pdf.pages[0]
            text = first_page.extract_text() or ""
            print(f"Extracted Text: {text}", file=log_file)  # Debug

            # Compile a pattern that matches:
            #   • “Addendum” or “ADDENDUM”
            #   • optionally followed by whitespace, a ‘#’, then digits (e.g. “Addendum #3”)
            #   • OR the word “Revision”
            pattern = re.compile(
                r"\b(?:addendum(?:\s*#\s*\d+)?|revision)\b",
                re.IGNORECASE
            )

            return bool(pattern.search(text))
    except Exception:
        return False


def make_unique_headers(headers):
    """
    Appends a suffix to duplicate headers to make them unique.

    Args:
        headers (list): List of column headers.

    Returns:
        list: List of unique column headers.
    """
    seen = {}
    unique_headers = []
    for header in headers:
        if header in seen:
            seen[header] += 1
            unique_headers.append(f"{header}_{seen[header]}")
        else:
            seen[header] = 1
            unique_headers.append(header)
    return unique_headers

def process_pdfs_in_folder():
    """Processes all PDFs in the specified project range and directory."""
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped, total_pdfs_skipped_extraction

    SKIP_PROJECTS = {1860, 2003, 2006}

    # Ensure the log file directory exists
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)

    with open(LOG_FILE_PATH, 'w') as log_file:


        for project_id in projects_to_process:
            
            # Skip the projects in the SKIP_PROJECTS set
            if project_id in SKIP_PROJECTS:
                print(f"Skipping Project {project_id} (marked to skip)", file=log_file)
                continue

         
            project_path = os.path.join(BASE_DIRECTORY, str(project_id), "03_phase_2_study")
            if not os.path.exists(project_path):
                missing_projects.add(project_id)
                print(f"Project path does not exist: {project_path}", file=log_file)
                continue

            project_scraped = False  # Flag to track if any PDF in the project was scraped
            base_data_extracted = False
            base_data = pd.DataFrame()

            # **START OF CHANGES**
            # Separate PDFs into originals and addendums
            list_pdfs = [pdf for pdf in os.listdir(project_path) if pdf.endswith(".pdf")]
            originals = []
            addendums = []
            for pdf_name in list_pdfs:
                pdf_path = os.path.join(project_path, pdf_name)
                if is_addendum(pdf_path, log_file):
                    addendums.append(pdf_name)
                else:
                    originals.append(pdf_name)
            # **END OF CHANGES**

            # **START OF CHANGES**
            # Process original PDFs first
            for pdf_name in originals:
                
                pdf_path = os.path.join(project_path, pdf_name)
                total_pdfs_accessed += 1

                is_add = is_addendum(pdf_path, log_file)

                # Check if PDF has 'Network Upgrade Type' column
                if has_network_upgrade_type_column(pdf_path, log_file):
                    style_n_pdfs.append(pdf_name)
                    print(f"Skipping PDF: {pdf_name} from Project {project_id} (Style N)", file=log_file)
                    # Still check if original has table7
                    has_table7 = check_has_table7(pdf_path)
                    original_has_table7[project_id] = has_table7
                    continue  # Skip processing this PDF

                print(f"Accessing Original PDF: {pdf_name} from Project {project_id}", file=log_file)
                original_pdfs.append(pdf_name)

                try:
                    has_table7 = check_has_table7(pdf_path)
                    original_has_table7[project_id] = has_table7

                    if not has_table7:
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 11)", file=log_file)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 11)")
                        total_pdfs_skipped += 1
                        continue

                    if not base_data_extracted:
                        # Extract base data from original PDF
                        base_data = extract_base_data(pdf_path, project_id, log_file)
                        base_data_extracted = True
                        print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)

                    # Extract Table 11 and merge
                    '''
                    df = extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False)
                    if not df.empty:
                        core_originals = pd.concat([core_originals, df], ignore_index=True)
                        scraped_pdfs.append(pdf_name)
                        scraped_projects.add(project_id)
                        project_scraped = True
                        total_pdfs_scraped += 1
                        print(f"Scraped PDF: {pdf_name} from Project {project_id}")
                    else:
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)")
                        total_pdfs_skipped += 1
                    '''
                        # Extract Table 11 and merge
                    df, status = extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False)

                    if status == "success":
                        core_originals = pd.concat([core_originals, df], ignore_index=True)
                        scraped_pdfs.append(pdf_name)
                        scraped_projects.add(project_id)
                        project_scraped = True
                        total_pdfs_scraped += 1
                        print(f"Scraped PDF: {pdf_name} from Project {project_id}")

                    elif status == "failed":
                        skipped_pdfs.append(pdf_name)
                        total_pdfs_skipped_extraction += 1
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (Table 11 found but extraction failed)"
                             )

                    else:  # status == "no_marker"
                        skipped_pdfs.append(pdf_name)
                        total_pdfs_skipped += 1
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 11 present)" )


                except Exception as e:
                    skipped_pdfs.append(pdf_name)
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                    print(traceback.format_exc(), file=log_file)
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                    total_pdfs_skipped += 1
            # **END OF CHANGES**

            # **START OF CHANGES**
            # Then process addendum PDFs
            for pdf_name in addendums:
                pdf_path = os.path.join(project_path, pdf_name)
                total_pdfs_accessed += 1
                is_add = is_addendum(pdf_path, log_file)

                # Check if PDF has 'Network Upgrade Type' column
                if has_network_upgrade_type_column(pdf_path, log_file):
                    style_n_pdfs.append(pdf_name)
                    print(f"Skipping PDF: {pdf_name} from Project {project_id} (Style N)", file=log_file)
                    continue  # Skip processing this PDF

                print(f"Accessing Addendum PDF: {pdf_name} from Project {project_id}", file=log_file)
                addendum_pdfs.append(pdf_name)

                try:
                    has_table7 = check_has_table7(pdf_path)

                    if not has_table7:
                        if original_has_table7.get(project_id, False):
                            # Attempt to scrape alternative tables is no longer needed
                            # According to the latest request, alternative table scraping is removed
                            # Therefore, we skip addendum PDFs that do not have Table 11
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 11)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 11)")
                            total_pdfs_skipped += 1
                        else:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 11 and original does not have Table 11)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 11 and original does not have Table 11)")
                            total_pdfs_skipped += 1
                        continue

                    if not is_add and not base_data_extracted:
                        # Extract base data from original PDF
                        base_data = extract_base_data(pdf_path, project_id, log_file)
                        base_data_extracted = True
                        print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)

                    if is_add and base_data_extracted:
                        # For addendums, use the extracted base data
                        table7_data = extract_table7(pdf_path, log_file, is_addendum=is_add)
                        if table7_data.empty and original_has_table7.get(project_id, False):
                            # Scrape alternative tables is removed, so skip if no data
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No relevant tables found)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No relevant tables found)")
                            total_pdfs_skipped += 1
                        if not table7_data.empty:
                            # Merge base data with Table 11 data
                            merged_df = pd.concat([base_data] * len(table7_data), ignore_index=True)
                            merged_df = pd.concat([merged_df, table7_data], axis=1, sort=False)
                            core_addendums = pd.concat([core_addendums, merged_df], ignore_index=True)
                            scraped_pdfs.append(pdf_name)
                            scraped_projects.add(project_id)
                            project_scraped = True
                            total_pdfs_scraped += 1
                            print(f"Scraped Addendum PDF: {pdf_name} from Project {project_id}")
                        else:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)")
                            total_pdfs_skipped += 1
                except Exception as e:
                    skipped_pdfs.append(pdf_name)
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                    print(traceback.format_exc(), file=log_file)
                    # Optionally, print to ipynb
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                    total_pdfs_skipped += 1
            # **END OF CHANGES**

            # After processing all PDFs for this project, check if any PDF was scraped
            if not project_scraped and os.path.exists(project_path):
                skipped_projects.add(project_id)

    # Rest of the code remains unchanged...

    # After processing all PDFs, save to CSV
    save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
    save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")

    # Calculate total projects processed
    total_projects_processed = len(scraped_projects) + len(skipped_projects)

    # Print summary to ipynb
    print("\n=== Scraping Summary ===")
    print(f"Total Projects Processed: {total_projects_processed}")
    print(f"Total Projects Scraped: {len(scraped_projects)}")
    print(f"Total Projects Skipped due to failed extraction of Table: {total_pdfs_skipped_extraction}")
    print(f"Total Projects Skipped: {len(skipped_projects)}")
    print(f"Total Projects Missing: {len(missing_projects)}")
    print(f"Total PDFs Accessed: {total_pdfs_accessed}")
    print(f"Total PDFs Scraped: {total_pdfs_scraped}")
    print(f"Total PDFs Skipped: {total_pdfs_skipped}")

    print("\nList of Scraped Projects:")
    print(sorted(scraped_projects))

    print("\nList of Skipped Projects:")
    print(sorted(skipped_projects))

    print("\nList of Missing Projects:")
    print(sorted(missing_projects))

    print("\nList of Scraped PDFs:")
    print(scraped_pdfs)

    print("\nList of Skipped PDFs:")
    print(skipped_pdfs)

    print("\nList of Addendum PDFs:")
    print(addendum_pdfs)

    print("\nList of Original PDFs:")
    print(original_pdfs)

    print("\nList of Style N PDFs (Skipped due to 'Network Upgrade Type'):")
    print(style_n_pdfs)

    print("\nTotal Number of Style N PDFs:", len(style_n_pdfs))

    print("\nNumber of Original PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in original_pdfs]))
    print("Number of Addendum PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs]))

def save_to_csv(df, output_csv_path, data_type):
    """Cleans the DataFrame and saves it to a CSV file."""
    if df.empty:
        print(f"No data to save for {data_type}.")
        return

    # Clean up the entire DataFrame by cleaning string cells
    df = df.applymap(clean_string_cell)

    # Drop rows that contain specific phrases (e.g., "Type of Upgrade")
    df = df[~df.apply(lambda row: contains_phrase(row, "Type of Upgrade"), axis=1)]

    # Reorder columns as specified
    df = reorder_columns(df)
    print(f"\nColumns reordered for {data_type} as per specification.")

    # Ensure q_id is numeric for sorting, replace missing values with None
 

    # Save the DataFrame to CSV
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

def main():
    """Main function to execute the PDF scraping process."""
    process_pdfs_in_folder()

if __name__ == "__main__":
    main()


Will process these project‐folders:
['555', '557', '558', '559', '560', '568', '576', '577', '579', '581', '585', '586', '607']
Skipped PDF: Appendix A - Q557 C1C2 Phase II report - final.pdf from Project 557 (No Table 11)
Skipped PDF: Appendix A - Q559 C1C2 Phase II report - final.pdf from Project 559 (No Table 11)
Skipped PDF: Appendix A - Q560 C1C2 Phase II report - final.pdf from Project 560 (No Table 11)


KeyboardInterrupt: 

# Itemized and Addendums

In [36]:
import pandas as pd
import re
import unicodedata

# Load the CSV file
df = pd.read_csv("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/ph2_rawdata_cluster2_style_R_originals.csv", dtype={'estimated_time_to_construct': str})

df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')


def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []  # Initialize an empty list to hold the cleaned header names.
    for header in headers:  # Iterate over each header in the input.
        if header is None:
            header = ""  # If the header is None, set it to an empty string.
        elif isinstance(header, str):  # Otherwise, if the header is a string:
            #header = header.lower()  # Convert the header to lowercase.
            header = re.sub(r'\s+', ' ', header)  # Replace one or more whitespace characters with a single space.
            #header = re.sub(r'\(.*?\)', '', header)  # Remove any text within parentheses (non-greedy).
            header = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', header)  # Remove any character that is not a letter, number, or whitespace.
            header = header.strip()  # Remove any leading or trailing whitespace.
        cleaned_headers.append(header)  # Append the cleaned header to the list.
    return cleaned_headers  # Return the list of cleaned headers.



#df.columns = clean_column_headers(df.columns)

def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

#df = df.map(clean_string_cell)
#df.columns = [convert_to_snake_case(col) for col in df.columns]
print("After cleaning:", df.columns.tolist())

After cleaning: ['q_id', 'cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection', 'type of upgrade', 'upgrade', 'description', 'cost allocation factor', 'estimated time to construct (note 3)', 'estimated cost x 1000 constant dollar (od year) (note 4)', 'upgrade (may include the following)', 'estimated cost x 1000 constant dollar (2011) (note 4)', 'Unnamed: 15', 'column_1', 'column_12', 'column_2', 'estimated_1', 'column_10', 'column_3', 'column_8', 'column_5', 'column_6', 'column', 'column_11', 'column_7', 'column_13', 'column_14', 'column_9', 'estimated', 'column_4', 'estimated time to construct (note 1)', 'estimated cost (x 1000)', 'estimated cost x 1000 constant dollar (od year) (note 2)']


# Originals

In [78]:
import pandas as pd
import re
import unicodedata
import numpy as np

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/ph2_rawdata_cluster2_style_R_originals.csv', dtype={'estimated_time_to_construct': str})

 
df['cluster'] = df['cluster'].astype('Int64')

df['cluster'] = df['cluster'].astype('Int64')


######################################################################################################################################
########################################
# STEP 0: CREATE DESCRIPTION COLUMN FROM COST ALLOCATION FACTOR


def move_non_numeric_text(value):
    """Move non-numeric, non-percentage text from cost allocation factor to description.
       If a value is moved, return None for cost allocation factor."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return value  # Keep numeric or percentage values
        return None  # Clear the value if it's text (moved to description)
    return value  # Return as is for non-string values


def extract_non_numeric_text(value):
    """Extract non-numeric, non-percentage text from the cost allocation factor column."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return None
        return value.strip()  # Return text entries as is
    return None  # Return None for non-string values



def clean_total_entries(value):
    """If the value starts with 'Total', remove numbers, commas, and percentage signs, keeping only 'Total'."""
    if isinstance(value, str) and value.startswith("Total"):
        return "Total"  # Keep only "Total"
    return value  # Leave other values unchanged

import re
import pandas as pd

def extract_cost_allocation(df, source_col, target_col="cost_allocation_factor"):
    """
    Extracts percentage values from a specified source column and moves them into a target column.
    
    - A percentage value is defined as a string that, when stripped of whitespace,
      fully matches a pattern of digits (with optional commas or periods) followed by a percent sign.
    - If a cell in the source column matches this pattern, its value is placed into the target column,
      and the source column cell is cleared (set to an empty string).
    - If the cell does not match a percentage pattern, it is left untouched in the source column.
    
    Parameters:
      df         : pandas DataFrame.
      source_col : string, the name of the column to scan for percentage values.
      target_col : string, the name of the column to store the extracted percentage values.
                   Defaults to "cost_allocation_factor".
    
    Returns:
      The DataFrame with the updated columns.
    """
    # Define a regex pattern to match a percentage value (e.g., "78.25%").
    # The pattern allows digits, commas, and periods, followed immediately by a "%" (ignoring leading/trailing spaces).
    pattern = r"^\s*[\d,\.]+%\s*$"
    
    def extract_percentage(text):
        # If text matches the percentage pattern, return the stripped text; otherwise, return None.
        if isinstance(text, str) and re.fullmatch(pattern, text):
            return text.strip()
        return None

    def clear_percentage(text):
        # If text matches the percentage pattern, clear it (return an empty string).
        # Otherwise, return the text stripped of surrounding whitespace.
        if isinstance(text, str) and re.fullmatch(pattern, text):
            return ""
        if isinstance(text, str):
            return text.strip()
        return text

    # Create (or overwrite) the target column with extracted percentage values from the source column.
    df[target_col] = df[source_col].apply(extract_percentage)
    # In the source column, remove any percentage values (leaving other text intact).
    df[source_col] = df[source_col].apply(clear_percentage)
    
    return df

 

 

def filter_numeric_costs(df, col):
    """
    For a given DataFrame and column name, this function extracts the numeric cost from each cell,
    converting values with an optional '$' sign (and possible commas) to floats.
    If a valid numeric cost cannot be extracted, the cell is set to NaN.
    
    Parameters:
      df  : pandas DataFrame.
      col : string, the name of the column to process.
      
    Returns:
      The original DataFrame with the specified column converted to numeric values (or NaN if conversion fails).
    """
    def extract_numeric(value):
        value_str = str(value)
        # This regex matches an optional '$', optional spaces, and a number with commas and an optional decimal part.
        match = re.search(r'\$?\s*([\d,]+(?:\.\d+)?)', value_str)
        if match:
            num_str = match.group(1).replace(',', '')
            try:
                return float(num_str)
            except ValueError:
                return np.nan
        return np.nan

    # Apply the extraction function to the specified column.
    df[col] = df[col].apply(extract_numeric)
    return df



def extract_months_values(df, col):
    """
    For a given DataFrame and column name, this function extracts text patterns matching
    durations expressed in months (e.g., "32 months", "43 Month", "23 Months", "22Months", or "21-29months").
    If a valid pattern is found, it returns the matched text; otherwise, it returns an empty string.
    
    Parameters:
        df  : pandas DataFrame.
        col : string, the name of the column to process.
        
    Returns:
        The DataFrame with the specified column updated.
    """
    def extract_months(text):
        text = str(text)
        # Pattern explanation:
        #   \d+          : one or more digits
        #   (?:-\d+)?    : optionally, a hyphen followed by one or more digits (to capture ranges like 21-29)
        #   \s*          : optional whitespace
        #   [Mm]onths?   : "month" or "months" (case insensitive for the first letter)
        pattern = r'(\d+(?:-\d+)?\s*[Mm]onths?)'
        match = re.search(pattern, text)
        return match.group(1) if match else ""
    
    df[col] = df[col].apply(extract_months)
    return df

def move_months_values(df, source_col, target_col):
    """
    For a given DataFrame, this function extracts text patterns matching durations expressed in months
    (e.g., "32 months", "43 Month", "23 Months", "22Months", or "21-29months") from the source column,
    moves the extracted text to the target column, and removes it from the source column.
    
    Parameters:
        df         : pandas DataFrame.
        source_col : string, the name of the column to extract the month text from.
        target_col : string, the name of the column where the extracted month text will be moved.
        
    Returns:
        The updated DataFrame with the month values moved.
    """
    # Pattern explanation:
    #   \d+          : one or more digits
    #   (?:-\d+)?    : optionally, a hyphen and one or more digits (to capture ranges like 21-29)
    #   \s*          : optional whitespace
    #   [Mm]onths?   : "month" or "months" (case insensitive for the first letter)
    pattern = r'(\d+(?:-\d+)?\s*[Mm]onths?)'
    
    def process_text(text):
        text = str(text)
        match = re.search(pattern, text)
        if match:
            extracted = match.group(1)
            # Remove the extracted text from the source text and clean up extra spaces
            updated_text = re.sub(pattern, "", text).strip()
            return extracted, updated_text
        else:
            return "", text

    # Prepare lists to store the extracted month text and the updated source text
    extracted_vals = []
    updated_source_vals = []
    
    for val in df[source_col]:
        ext, updated = process_text(val)
        extracted_vals.append(ext)
        updated_source_vals.append(updated)
    
    # Create/update the target column with the extracted month text
    df[target_col] = extracted_vals
    # Replace the source column values with the text after removal of the month text
    df[source_col] = updated_source_vals
    
    return df



# Filter numeric costs in 'estimated_cost_x_1000' and 'escalated_cost_x_1000' columns
 

#df = filter_numeric_costs(df, 'unnamed_10')

df = filter_numeric_costs(df, 'estimated')

 


df = extract_months_values(df, 'column_12')
df = extract_months_values(df, 'estimated_1')
df = extract_months_values(df, 'column_11')
df = extract_months_values(df, 'column_13')
#df = move_months_values(df, 'unnamed_13', 'estimated time to construct')





df['cost_allocation_factor']= None


#df = extract_cost_allocation(df, "unnamed_8", "cost_allocation_factor")

#df = extract_cost_allocation(df, "unnamed_9", "cost_allocation_factor")

# Create the 'description' column from 'cost allocation factor'
#if 'unnamed_9' in df.columns:
 #  df['unnamed_9'] = df['unnamed_9'].apply(extract_non_numeric_text)
  # df['unnamed_9'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values


#if 'unnamed_8' in df.columns:
 #  df['unnamed_8'] = df['unnamed_8'].apply(extract_non_numeric_text)
  # df['unnamed_8'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values   




######################################################################################################################################
########################################
#STEP 1 MERGE COLUMNS

def merge_columns(df):
    merge_columns_dict = {

        "upgrade": [
            "upgrade",
            "column_3",
            'upgrade (may include the following)',
 
            ],

        "capacity": [
            "capacity",
            "MW",
            
        ],   

        "description": ["description",
                         "column_5" ],

        "estimated_time_to_construct": [ 
            "column_12", 'estimated_1', "column_11", "estimated time to construct (note 1)",
            'estimated time to construct (note 3)', 'column_13',
 
                                         ],

        "type_of_upgrade": [ "type of upgrade", "column_1",],

        "type_of_upgrade_2": [ "column", 'column_6'  ],

        



        "estimated_cost_x_1000": [ 'column_10', 
             "estimated cost (x 1000)" ,  'estimated cost x 1000 constant dollar (2011) (note 4)', 
 
           

             
        ],    


        "escalated_cost_x_1000": [
            "escalated costs x 1000", "estimated cost x 1000 constant dollar (od year) (note 2)", "estimated",
            'estimated cost x 1000 constant dollar (od year) (note 4)',
            
 
            
             

        ],

         

        "total_estimated_cost_x_1000": [
            "total nu cost",
            "total cost constant"
        ],
        "total_estimated_cost_x_1000_escalated": [
            "total estimated cost x 1000 escalalted",
            "total estimated cost x 1000 escalated"
        ],
       
         

        

        
        "cost_allocation_factor": [
            "column_7",
            'cost allocation factor', 
 
            
           

        ],
       
    }

   # 1) If there are any truly “unnamed” columns (blank names or starting with "Unnamed"),
    #    tack them onto the "description" group so they also get merged under "description".
    unnamed_columns = [
        col for col in df.columns
        if (pd.isna(col) or str(col).strip() == "" or str(col).lower().startswith("nnamed"))
    ]
    if unnamed_columns:
        # Only add those that aren’t already listed
        for uc in unnamed_columns:
            if uc not in merge_columns_dict["description"]:
                merge_columns_dict["description"].append(uc)

    # 2) For each (new_col → list_of_old_cols), build new_col by picking
    #    the first non‐missing value in row‐order. Then drop only the old columns
    #    (but keep new_col).
    for new_col, old_cols in merge_columns_dict.items():
        # (a) Restrict to columns that actually exist in df
        existing = [c for c in old_cols if c in df.columns]
        if not existing:
            continue

        # (b) Define a helper that returns the first non‐missing, non‐blank value
        def first_non_missing(row):
            for val in row:
                # treat "" or whitespace‐only strings as missing, too
                if pd.notna(val) and not (isinstance(val, str) and val.strip() == ""):
                    return val
            return pd.NA

        # (c) Apply it row‐wise to df[existing]
        df[new_col] = df[existing].apply(first_non_missing, axis=1)

        # (d) Drop only those source columns that are NOT equal to new_col.
        #     That way, if “upgrade” was already a column name, we don’t drop the newly created “upgrade” column,
        #     but _do_ drop “column4” (and any others in existing except new_col itself).
        to_drop = [c for c in existing if c != new_col]
        if to_drop:
            df.drop(columns=to_drop, inplace=True)

    return df

df = merge_columns(df)


######################################################################################################################################
########################################
# STEP 2: REMOVE DOLLAR SIGNED VALUES FROM 'estimated_time_to_construct'
######## Other clean up

def remove_dollar_values(value):
    """Remove dollar amounts (e.g., $3625.89, $3300) from 'estimated_time_to_construct'."""
    if isinstance(value, str) and re.search(r"^\$\d+(\.\d{1,2})?$", value.strip()):
        return None  # Replace with None if it's a dollar-signed number
    return value.strip() if isinstance(value, str) else value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(remove_dollar_values)


## Remove ranodm number in Total row:    
# Apply cleaning function to "upgrade" column after merging
#if 'upgrade' in df.columns:
 #   df['upgrade'] = df['upgrade'].apply(clean_total_entries)


 
 

    
######################################################################################################################################
########################################
# STEP 3: DROP UNNEEDED COLUMNS
 

df.drop(['column_2', 'column_8',"column_6", "Unnamed: 15",  "column_14", "column_9", "column_4"], axis=1, inplace=True, errors='ignore')



######################################################################################################################################
########################################
#STEP 4: NAMING CONVENTION
def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]



# Convert estimated_time_to_construct to integer (remove decimals) and keep NaNs as empty
#df['estimated_time_to_construct'] = pd.to_numeric(df['estimated_time_to_construct'], errors='coerce').apply(lambda x: int(x) if pd.notna(x) else None)


 
def process_dataframe(df):
    """
    Processes the DataFrame as follows:
    
    1. Drops any rows where any of these columns are empty or blank:
       - 'upgrade', 'description', 'cost_allocation_factor',
         'estimated_time_to_construct', 'type_of_upgrade_2', 'estimated_cost_x_1000'
    
    2. For each remaining row, if the value in 'type_of_upgrade' starts with
       'SCE', 'SDG&E', or 'PG&E' (or is empty after stripping),
       then the value in 'type_of_upgrade_2' is replaced with the value from 'type_of_upgrade'.
       
    Parameters:
        df: pandas DataFrame.
        
    Returns:
        A cleaned DataFrame with the above processing applied.
    """
    # Define the required columns
    required_cols = [
        "upgrade", "description", "cost_allocation_factor",
        "estimated_time_to_construct", "type_of_upgrade_2", "estimated_cost_x_1000"
    ]
    
       # Make a copy of the DataFrame to avoid modifying the original
    df_clean = df.copy()

    
    
    # Replace NaN with empty strings for checking emptiness
    df_clean[required_cols] = df_clean[required_cols].fillna("")

    # Convert all required columns to strings and strip whitespace
    df_clean[required_cols] = df_clean[required_cols].applymap(lambda x: str(x).strip())
    
    
 # Drop rows where all required columns are empty
    df_clean = df_clean[~(df_clean[required_cols].apply(lambda row: all(row == ""), axis=1))]
    
 
    
    # Define a function to update type_of_upgrade_2 if needed.
    def update_type(row):
        # Get the value from type_of_upgrade (converted to string and stripped)
        val = str(row.get("type_of_upgrade", "")).strip()
        # If the value is empty or starts with SCE, SDG&E, or PG&E, then update type_of_upgrade_2
        if val == "" or re.match(r'^(SCE|SDG&E|PG&E)', val):
            row["type_of_upgrade"] = row["type_of_upgrade_2"]
        return row


    # Apply the function row-wise
    df_clean = df_clean.apply(update_type, axis=1)
    
    return df_clean


df = process_dataframe(df)


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "type_of_upgrade_2",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df     



df = reorder_columns(df)

 
 

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].replace("", np.nan).ffill() 


df= df[df['type_of_upgrade']!= '12. Local Furnishing Bonds']
df= df[df['type_of_upgrade']!= '(when applicable):']
df= df[df['type_of_upgrade']!= '12. Items Not Covered In This Study']




#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/cluster_2_style_R.csv', index=False)

######################################################################################################################################
########################################
#STEP 5: REMOVING TOTAL ROW, AS THE PDFS GIVE TOTAL NETWORK COST RATHER THAN BY RNU, LDNU AS WE HAD BEFORE
# Remove rows where upgrade is "Total" (case-insensitive)



df= df[df['type_of_upgrade']!= '12. Local Furnishing Bonds']
df= df[df['type_of_upgrade']!= '(when applicable):']
df= df[df['type_of_upgrade']!= '12. Items Not Covered In This Study']

df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/cluster_2_style_R.csv', index=False)

mask_agg = (
    df['type_of_upgrade'].fillna('').eq('Total') |
     df['type_of_upgrade'].fillna('').eq('Total Cost') |
    df['cost_allocation_factor'].fillna('').eq('Total')
)

# 2) Extract them
aggregate_total = df.loc[mask_agg].copy()

# 3) Tag them in the original df
df['is_aggregate_total'] = mask_agg


agg_data = df[df['is_aggregate_total']].copy()
agg_data.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_aggregate.csv', index=False) 

# 3) Then drop them from your main itemized set
df = df.loc[~mask_agg].reset_index(drop=True)

df.drop(columns=['is_aggregate_total'], inplace=True, errors='ignore')
 







######################################################################################################################################
########################################
# STEP 6: Move upgrade phrases like IRNU from upgrade column to a new column upgrade_classificatio and also replace type_of_upgrade with LDNU, CANU



# Define the list of phrases for upgrade classification
upgrade_phrases = ["IRNU", "GRNU", "CANU-D", "IRNU-A", "LDNU", "CANU-GR", "PNU", "CANU"]

 



#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 14/03_raw/cluster_14_style_Q.csv', index=False)  


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )


mappings = {
    "PTO’s Interconnection Facilities (Note 2)": "PTO_IF",
    "PTO’s Interconnectio n Facilities (Note 2)": "PTO_IF",
    "PTOs Interconnection Facilities": "PTO_IF",
    "PTOs Interconnectio n Facilities": "PTO_IF",
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "Delivery Network Upgrades": "LDNU",
 "Delivery Network": "ADNU",
 "Plan of Service Reliability Network Upgrades": "RNU",
 "Distribution Upgrades": "LDNU",
 "PG&E Reliability Network Upgrades": "RNU",
 "SDG&E Delivery Network Upgrades": "LDNU",
 "SCE Delivery Upgrades": "LDNU",
 "SCE Distribution Upgrades": "LDNU",
 "SCE Reliability Network Upgrades for Short Circuit duty": "RNU",
 "SCE Network Upgrades": "RNU",
 "Plan of Service Distribution Upgrades": "LDNU",
 "PG&E Delivery Network Upgrades": "LDNU",
 "SCE Delivery Network Upgrades": "LDNU",
 "Upgrades, Estimated Costs, and Estimated Time to Construct Summary for C565 - Continued": "LDNU",
 "Upgrades, Estimated Costs, and Estimated Time to Construct Summary for C565 -": "LDNU",
 "Reliability Network Upgrades to Physically Interconnect": "RNU",
 'Reliability Network Upgrade': "RNU",
 "Reliability Network Upgrades": "RNU",
    "Local Delivery Network Upgrades": "LDNU",
    "Area Deliverability Upgrades": "ADNU",
    "Escalated Cost and Time to Construct for Interconnection Facilities, Reliability Network Upgrades, and Delivery Network Upgrades": "LDNU",
    "Distribution": "ADNU",
'Total PTO_IF': 'PTO_IF',
 'Total RNU': 'RNU',
 'Total LDNU': 'LDNU',
 'Total OPNU' : 'OPNU',
 'Total CANU': 'CANU',
 'Total LOPNU': 'LOPNU',
 'Total ADNU': 'ADNU',
}





if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].ffill()    






######################################################################################################################################
########################################
#STEP 7: Stable sort type of upgrade

def stable_sort_by_type_of_upgrade(df):
    """Performs a stable sort within each q_id to order type_of_upgrade while preserving row order in other columns."""
    
    # Define the custom sorting order for type_of_upgrade
    type_order = {"PTO_IF": 1, "RNU": 2, "LDNU": 3, "PNU": 4, "ADNU": 5}

    # Assign a numerical sorting key; use a high number if type_of_upgrade is missing
    df['sort_key'] = df['type_of_upgrade'].map(lambda x: type_order.get(x, 99))

    # Perform a stable sort by q_id first, then by type_of_upgrade using the custom order
    df = df.sort_values(by=['q_id', 'sort_key'], kind='stable').drop(columns=['sort_key'])

    return df

# Apply stable sorting
  


df = df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()


df= reorder_columns(df)

#df = stable_sort_by_type_of_upgrade(df)  
#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 14/03_raw/cluster_14_style_Q.csv', index=False)
######################################################################################################################################
########################################
# STEP 8: Remove $ signs and convert to numeric
 

def clean_currency(value):
    """
    Cleans a string by explicitly removing $, *, (Note 2), and similar patterns,
    then converts it to a numeric value.
    """
    if isinstance(value, str):
        # Explicitly remove $, *, and any "(Note ...)"
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
        value = value.replace(',', '').strip()  # Remove commas and extra spaces
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA  # Return NaN for invalid entries
    




# Clean the specific columns
for col in ['estimated_cost_x_1000', 'escalated_cost_x_1000']:
    if col in df.columns:
        df[col] = df[col].apply(clean_currency)




df = df[df["type_of_upgrade"] != "may"]    
######################################################################################################################################
########################################
# STEP 9: Create Total rows


df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)


  

# Create Total rows
new_rows = []
columns_to_sum = ['estimated_cost_x_1000']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    print(f"\nProcessing q_id: {q_id}")  # Debug print
    unique_upgrades = group['type_of_upgrade'].dropna().unique()
    for upgrade in unique_upgrades:
        if pd.isna(upgrade):
            continue
        
        rows = group[group['type_of_upgrade'] == upgrade]

        # Debug: Print current group
        print(f"\nChecking Upgrade: {upgrade}, Total Rows Present?:", 
              ( (group['item'] == 'no')).any())

        # Check if a Total row already exists for this (q_id, upgrade)
        total_exists = ((group['item'] == 'no')).any()
        
        if total_exists:
            print(f"Skipping Total row for {upgrade} (already exists).")
            continue
        
        total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings
        total_row['q_id'] = q_id
        total_row['type_of_upgrade'] = f"Total {upgrade}"
        total_row['item'] = 'no'

        # Populate specified columns from the existing row
        first_row = rows.iloc[0]
        for col in columns_to_populate:
            if col in df.columns:
                total_row[col] = first_row[col]

        # Sum the numeric columns
        for col in columns_to_sum:
            if col in rows.columns:
                total_row[col] = rows[col].sum()
            else:
                total_row[col] = 0  # Default to 0 if column is missing

        print(f"Creating Total row for {upgrade}")  # Debug print
        new_rows.append(total_row)

# Convert list to DataFrame and append
if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    print("\nNew Total Rows Created:\n", total_rows_df)  # Debug print
    # 1) Diagnose
    dups = df.columns[df.columns.duplicated()]
    print("👉 Duplicate column names in df:", dups.tolist())

    # 2) Drop perfect duplicates
    df = df.loc[:, ~df.columns.duplicated()]
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)


if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


df = stable_sort_by_type_of_upgrade(df)

 


#: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    item_index = cols.index('item')
    type_index = cols.index('type_of_upgrade')
    if item_index < type_index:
        cols.insert(type_index + 1, cols.pop(item_index))
    else:
        cols.insert(type_index + 1, cols.pop(item_index))
    df = df[cols]




#  Remove "Total" values from cost_allocation_factor if they appear in type_of_upgrade
if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if (
            pd.notna(row['type_of_upgrade']) and 'Total' in str(row['type_of_upgrade'])
        ) else row.get('cost_allocation_factor'),
        axis=1
    )
    

if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if 'Total' in str(row.get('cost_allocation_factor', '')) else row.get('cost_allocation_factor'),
        axis=1
    )






def clean_estimated_time(value):
    """
    Removes the word 'month' or 'months' (case insensitive) from the value.
    Leaves behind any numbers or number ranges (e.g. "6", "6-12").
    """
    if isinstance(value, str):
        # Remove 'month' or 'months' (case-insensitive), optionally with spaces around them.
        cleaned_value = re.sub(r'(?i)\s*months?\s*', '', value)
        
        return cleaned_value.strip()
    return value





# Then apply it to your column, for example with Pandas:
df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)



def clean_estimated_time(value):
    if isinstance(value, str):
         
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
    return value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)


def pick_max_from_range(val):
    """
    Given a value like "12-24" or " 6 - 18 " (or even "20"), return the larger number.
    If nothing can be parsed, returns np.nan.
    """
    if pd.isna(val):
        return np.nan

    s = str(val).strip()
    # Split on hyphen (either ASCII "-" or any unicode dash)
    parts = re.split(r'\s*[-–—]\s*', s)
    nums = []
    for part in parts:
        try:
            # Convert each piece to float (or int)
            nums.append(float(part))
        except ValueError:
            # If it isn’t purely a number, skip it
            continue

    if not nums:
        return np.nan
    return max(nums)

# Then apply it:
df["estimated_time_to_construct"] = df["estimated_time_to_construct"]\
    .apply(pick_max_from_range)    


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

numeric_columns = [
    'cost_allocation_factor',
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
    'total_estimated_cost_x_1000_escalated',
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
    'estimated_cost_x_1000_escalated_without_itcca',
    'adnu_cost_rate_x_1000_escalated'
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


if 'upgrade' in df.columns:
    df['upgrade'] = df['upgrade'].ffill()      


df.drop('type_of_upgrade_2', axis=1, inplace=True, errors='ignore') 

#df= reorder_columns(df)



# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_itemized.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    totals_df = df[df['item'] == 'no'].drop(columns=totals_columns, errors='ignore')
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_total.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_2_cluster_2_style_R_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_2_cluster_2_style_R_total.csv'.")


if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())



Processing q_id: 552

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 561

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 565

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 569

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 574

C

  df_clean[required_cols] = df_clean[required_cols].applymap(lambda x: str(x).strip())
  df[col] = df[col].fillna(0)


# Addendums

In [74]:
import pandas as pd
import re
import unicodedata
import numpy as np

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/ph2_rawdata_cluster2_style_R_addendums.csv', dtype={'estimated_time_to_construct': str})

 
df['cluster'] = df['cluster'].astype('Int64')

df['cluster'] = df['cluster'].astype('Int64')


######################################################################################################################################
########################################
# STEP 0: CREATE DESCRIPTION COLUMN FROM COST ALLOCATION FACTOR


def move_non_numeric_text(value):
    """Move non-numeric, non-percentage text from cost allocation factor to description.
       If a value is moved, return None for cost allocation factor."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return value  # Keep numeric or percentage values
        return None  # Clear the value if it's text (moved to description)
    return value  # Return as is for non-string values


def extract_non_numeric_text(value):
    """Extract non-numeric, non-percentage text from the cost allocation factor column."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return None
        return value.strip()  # Return text entries as is
    return None  # Return None for non-string values



def clean_total_entries(value):
    """If the value starts with 'Total', remove numbers, commas, and percentage signs, keeping only 'Total'."""
    if isinstance(value, str) and value.startswith("Total"):
        return "Total"  # Keep only "Total"
    return value  # Leave other values unchanged

import re
import pandas as pd

def extract_cost_allocation(df, source_col, target_col="cost_allocation_factor"):
    """
    Extracts percentage values from a specified source column and moves them into a target column.
    
    - A percentage value is defined as a string that, when stripped of whitespace,
      fully matches a pattern of digits (with optional commas or periods) followed by a percent sign.
    - If a cell in the source column matches this pattern, its value is placed into the target column,
      and the source column cell is cleared (set to an empty string).
    - If the cell does not match a percentage pattern, it is left untouched in the source column.
    
    Parameters:
      df         : pandas DataFrame.
      source_col : string, the name of the column to scan for percentage values.
      target_col : string, the name of the column to store the extracted percentage values.
                   Defaults to "cost_allocation_factor".
    
    Returns:
      The DataFrame with the updated columns.
    """
    # Define a regex pattern to match a percentage value (e.g., "78.25%").
    # The pattern allows digits, commas, and periods, followed immediately by a "%" (ignoring leading/trailing spaces).
    pattern = r"^\s*[\d,\.]+%\s*$"
    
    def extract_percentage(text):
        # If text matches the percentage pattern, return the stripped text; otherwise, return None.
        if isinstance(text, str) and re.fullmatch(pattern, text):
            return text.strip()
        return None

    def clear_percentage(text):
        # If text matches the percentage pattern, clear it (return an empty string).
        # Otherwise, return the text stripped of surrounding whitespace.
        if isinstance(text, str) and re.fullmatch(pattern, text):
            return ""
        if isinstance(text, str):
            return text.strip()
        return text

    # Create (or overwrite) the target column with extracted percentage values from the source column.
    df[target_col] = df[source_col].apply(extract_percentage)
    # In the source column, remove any percentage values (leaving other text intact).
    df[source_col] = df[source_col].apply(clear_percentage)
    
    return df

 

 

def filter_numeric_costs(df, col):
    """
    For a given DataFrame and column name, this function extracts the numeric cost from each cell,
    converting values with an optional '$' sign (and possible commas) to floats.
    If a valid numeric cost cannot be extracted, the cell is set to NaN.
    
    Parameters:
      df  : pandas DataFrame.
      col : string, the name of the column to process.
      
    Returns:
      The original DataFrame with the specified column converted to numeric values (or NaN if conversion fails).
    """
    def extract_numeric(value):
        value_str = str(value)
        # This regex matches an optional '$', optional spaces, and a number with commas and an optional decimal part.
        match = re.search(r'\$?\s*([\d,]+(?:\.\d+)?)', value_str)
        if match:
            num_str = match.group(1).replace(',', '')
            try:
                return float(num_str)
            except ValueError:
                return np.nan
        return np.nan

    # Apply the extraction function to the specified column.
    df[col] = df[col].apply(extract_numeric)
    return df



def extract_months_values(df, col):
    """
    For a given DataFrame and column name, this function extracts text patterns matching
    durations expressed in months (e.g., "32 months", "43 Month", "23 Months", "22Months", or "21-29months").
    If a valid pattern is found, it returns the matched text; otherwise, it returns an empty string.
    
    Parameters:
        df  : pandas DataFrame.
        col : string, the name of the column to process.
        
    Returns:
        The DataFrame with the specified column updated.
    """
    def extract_months(text):
        text = str(text)
        # Pattern explanation:
        #   \d+          : one or more digits
        #   (?:-\d+)?    : optionally, a hyphen followed by one or more digits (to capture ranges like 21-29)
        #   \s*          : optional whitespace
        #   [Mm]onths?   : "month" or "months" (case insensitive for the first letter)
        pattern = r'(\d+(?:-\d+)?\s*[Mm]onths?)'
        match = re.search(pattern, text)
        return match.group(1) if match else ""
    
    df[col] = df[col].apply(extract_months)
    return df

def move_months_values(df, source_col, target_col):
    """
    For a given DataFrame, this function extracts text patterns matching durations expressed in months
    (e.g., "32 months", "43 Month", "23 Months", "22Months", or "21-29months") from the source column,
    moves the extracted text to the target column, and removes it from the source column.
    
    Parameters:
        df         : pandas DataFrame.
        source_col : string, the name of the column to extract the month text from.
        target_col : string, the name of the column where the extracted month text will be moved.
        
    Returns:
        The updated DataFrame with the month values moved.
    """
    # Pattern explanation:
    #   \d+          : one or more digits
    #   (?:-\d+)?    : optionally, a hyphen and one or more digits (to capture ranges like 21-29)
    #   \s*          : optional whitespace
    #   [Mm]onths?   : "month" or "months" (case insensitive for the first letter)
    pattern = r'(\d+(?:-\d+)?\s*[Mm]onths?)'
    
    def process_text(text):
        text = str(text)
        match = re.search(pattern, text)
        if match:
            extracted = match.group(1)
            # Remove the extracted text from the source text and clean up extra spaces
            updated_text = re.sub(pattern, "", text).strip()
            return extracted, updated_text
        else:
            return "", text

    # Prepare lists to store the extracted month text and the updated source text
    extracted_vals = []
    updated_source_vals = []
    
    for val in df[source_col]:
        ext, updated = process_text(val)
        extracted_vals.append(ext)
        updated_source_vals.append(updated)
    
    # Create/update the target column with the extracted month text
    df[target_col] = extracted_vals
    # Replace the source column values with the text after removal of the month text
    df[source_col] = updated_source_vals
    
    return df



# Filter numeric costs in 'estimated_cost_x_1000' and 'escalated_cost_x_1000' columns
 

#df = filter_numeric_costs(df, 'unnamed_10')
 
#df = move_months_values(df, 'unnamed_13', 'estimated time to construct')





df['cost_allocation_factor']= None


#df = extract_cost_allocation(df, "unnamed_8", "cost_allocation_factor")

#df = extract_cost_allocation(df, "unnamed_9", "cost_allocation_factor")

# Create the 'description' column from 'cost allocation factor'
#if 'unnamed_9' in df.columns:
 #  df['unnamed_9'] = df['unnamed_9'].apply(extract_non_numeric_text)
  # df['unnamed_9'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values


#if 'unnamed_8' in df.columns:
 #  df['unnamed_8'] = df['unnamed_8'].apply(extract_non_numeric_text)
  # df['unnamed_8'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values   




######################################################################################################################################
########################################
#STEP 1 MERGE COLUMNS

def merge_columns(df):
    merge_columns_dict = {

        "upgrade": [
            "upgrade",
            "column_3",
            'upgrade (may include the following)',
 
            ],

        "capacity": [
            "capacity",
            "MW",
            
        ],   

        "description": ["description",
                         "column_5" ],

        "estimated_time_to_construct": [ 
            "column_12", 'estimated_1', "column_11", "estimated time to construct (note 1)",
            'estimated time to construct (note 3)', 'column_13',
 
                                         ],

        "type_of_upgrade": [ "type of upgrade",  "column_1",],

         

        



        "estimated_cost_x_1000": [ 'column_10', 
             "estimated cost (x 1000)" ,  'estimated cost x 1000 constant dollar (2011) (note 4)', 
 
           

             
        ],    


        "escalated_cost_x_1000": [
            "escalated costs x 1000", "estimated cost x 1000 constant dollar (od year) (note 2)", "estimated",
            'estimated cost x 1000 constant dollar (od year) (note 4)',
            
 
            
             

        ],

         

        "total_estimated_cost_x_1000": [
            "total nu cost",
            "total cost constant"
        ],
        "total_estimated_cost_x_1000_escalated": [
            "total estimated cost x 1000 escalalted",
            "total estimated cost x 1000 escalated"
        ],
       
         

        

        
        "cost_allocation_factor": [
            "column_7",
            'cost allocation factor', 
 
            
           

        ],
       
    }

   # 1) If there are any truly “unnamed” columns (blank names or starting with "Unnamed"),
    #    tack them onto the "description" group so they also get merged under "description".
    unnamed_columns = [
        col for col in df.columns
        if (pd.isna(col) or str(col).strip() == "" or str(col).lower().startswith("nnamed"))
    ]
    if unnamed_columns:
        # Only add those that aren’t already listed
        for uc in unnamed_columns:
            if uc not in merge_columns_dict["description"]:
                merge_columns_dict["description"].append(uc)

    # 2) For each (new_col → list_of_old_cols), build new_col by picking
    #    the first non‐missing value in row‐order. Then drop only the old columns
    #    (but keep new_col).
    for new_col, old_cols in merge_columns_dict.items():
        # (a) Restrict to columns that actually exist in df
        existing = [c for c in old_cols if c in df.columns]
        if not existing:
            continue

        # (b) Define a helper that returns the first non‐missing, non‐blank value
        def first_non_missing(row):
            for val in row:
                # treat "" or whitespace‐only strings as missing, too
                if pd.notna(val) and not (isinstance(val, str) and val.strip() == ""):
                    return val
            return pd.NA

        # (c) Apply it row‐wise to df[existing]
        df[new_col] = df[existing].apply(first_non_missing, axis=1)

        # (d) Drop only those source columns that are NOT equal to new_col.
        #     That way, if “upgrade” was already a column name, we don’t drop the newly created “upgrade” column,
        #     but _do_ drop “column4” (and any others in existing except new_col itself).
        to_drop = [c for c in existing if c != new_col]
        if to_drop:
            df.drop(columns=to_drop, inplace=True)

    return df

df = merge_columns(df)


######################################################################################################################################
########################################
# STEP 2: REMOVE DOLLAR SIGNED VALUES FROM 'estimated_time_to_construct'
######## Other clean up

def remove_dollar_values(value):
    """Remove dollar amounts (e.g., $3625.89, $3300) from 'estimated_time_to_construct'."""
    if isinstance(value, str) and re.search(r"^\$\d+(\.\d{1,2})?$", value.strip()):
        return None  # Replace with None if it's a dollar-signed number
    return value.strip() if isinstance(value, str) else value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(remove_dollar_values)


def pick_max_from_range(val):
    """
    Given a value like "12-24" or " 6 - 18 " (or even "20"), return the larger number.
    If nothing can be parsed, returns np.nan.
    """
    if pd.isna(val):
        return np.nan

    s = str(val).strip()
    # Split on hyphen (either ASCII "-" or any unicode dash)
    parts = re.split(r'\s*[-–—]\s*', s)
    nums = []
    for part in parts:
        try:
            # Convert each piece to float (or int)
            nums.append(float(part))
        except ValueError:
            # If it isn’t purely a number, skip it
            continue

    if not nums:
        return np.nan
    return max(nums)

# Then apply it:
df["estimated_time_to_construct"] = df["estimated_time_to_construct"]\
    .apply(pick_max_from_range)    


## Remove ranodm number in Total row:    
# Apply cleaning function to "upgrade" column after merging
#if 'upgrade' in df.columns:
 #   df['upgrade'] = df['upgrade'].apply(clean_total_entries)


 
 

    
######################################################################################################################################
########################################
# STEP 3: DROP UNNEEDED COLUMNS
 

df.drop(['column_2', 'column_8',"column_6", "Unnamed: 15",  "column_14", "column_9", "column_4"], axis=1, inplace=True, errors='ignore')



######################################################################################################################################
########################################
#STEP 4: NAMING CONVENTION
def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]



# Convert estimated_time_to_construct to integer (remove decimals) and keep NaNs as empty
#df['estimated_time_to_construct'] = pd.to_numeric(df['estimated_time_to_construct'], errors='coerce').apply(lambda x: int(x) if pd.notna(x) else None)

df= df[df['type_of_upgrade']!= '12. Local Furnishing Bonds']
df= df[df['type_of_upgrade']!= '(when applicable):']
df= df[df['type_of_upgrade']!= '12. Items Not Covered In This Study']

df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/cluster_2_style_R_addendums.csv', index=False)

mask_agg = (
    df['type_of_upgrade'].fillna('').eq('Total') |
     df['type_of_upgrade'].fillna('').eq('Total Cost') |
    df['cost_allocation_factor'].fillna('').eq('Total')
)

# 2) Extract them
aggregate_total = df.loc[mask_agg].copy()

# 3) Tag them in the original df
df['is_aggregate_total'] = mask_agg


agg_data = df[df['is_aggregate_total']].copy()
agg_data.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_aggregate_addendums.csv', index=False) 

# 3) Then drop them from your main itemized set
df = df.loc[~mask_agg].reset_index(drop=True)

df.drop(columns=['is_aggregate_total'], inplace=True, errors='ignore')
 
 


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "type_of_upgrade_2",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df     



df = reorder_columns(df)

 
 

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].replace("", np.nan).ffill() 


df= df[df['type_of_upgrade']!= '12. Local Furnishing Bonds']
df= df[df['type_of_upgrade']!= '(when applicable):']
df= df[df['type_of_upgrade']!= '12. Items Not Covered In This Study']




#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/03_raw/cluster_2_style_R.csv', index=False)

######################################################################################################################################
########################################
#STEP 5: REMOVING TOTAL ROW, AS THE PDFS GIVE TOTAL NETWORK COST RATHER THAN BY RNU, LDNU AS WE HAD BEFORE
# Remove rows where upgrade is "Total" (case-insensitive)




 







######################################################################################################################################
########################################
# STEP 6: Move upgrade phrases like IRNU from upgrade column to a new column upgrade_classificatio and also replace type_of_upgrade with LDNU, CANU



# Define the list of phrases for upgrade classification
upgrade_phrases = ["IRNU", "GRNU", "CANU-D", "IRNU-A", "LDNU", "CANU-GR", "PNU", "CANU"]

 



#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 14/03_raw/cluster_14_style_Q.csv', index=False)  


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )


mappings = {
    "PTO’s Interconnection Facilities (Note 2)": "PTO_IF",
    "PTO’s Interconnectio n Facilities (Note 2)": "PTO_IF",
    "PTOs Interconnection Facilities": "PTO_IF",
    "PTOs Interconnectio n Facilities": "PTO_IF",
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "Delivery Network Upgrades": "LDNU",
 "Delivery Network": "ADNU",
 "Plan of Service Reliability Network Upgrades": "RNU",
 "Distribution Upgrades": "LDNU",
 "PG&E Reliability Network Upgrades": "RNU",
 "SDG&E Delivery Network Upgrades": "LDNU",
 "SCE Delivery Upgrades": "LDNU",
 "SCE Distribution Upgrades": "LDNU",
 "SCE Reliability Network Upgrades for Short Circuit duty": "RNU",
 "SCE Network Upgrades": "RNU",
 "Plan of Service Distribution Upgrades": "LDNU",
 "PG&E Delivery Network Upgrades": "LDNU",
 "SCE Delivery Network Upgrades": "LDNU",
 "Upgrades, Estimated Costs, and Estimated Time to Construct Summary for C565 - Continued": "LDNU",
 "Upgrades, Estimated Costs, and Estimated Time to Construct Summary for C565 -": "LDNU",
 "Reliability Network Upgrades to Physically Interconnect": "RNU",
 'Reliability Network Upgrade': "RNU",
 "Reliability Network Upgrades": "RNU",
    "Local Delivery Network Upgrades": "LDNU",
    "Area Deliverability Upgrades": "ADNU",
    "Escalated Cost and Time to Construct for Interconnection Facilities, Reliability Network Upgrades, and Delivery Network Upgrades": "LDNU",
    "Distribution": "ADNU",
'Total PTO_IF': 'PTO_IF',
 'Total RNU': 'RNU',
 'Total LDNU': 'LDNU',
 'Total OPNU' : 'OPNU',
 'Total CANU': 'CANU',
 'Total LOPNU': 'LOPNU',
 'Total ADNU': 'ADNU',
}





if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].ffill()    






######################################################################################################################################
########################################
#STEP 7: Stable sort type of upgrade

def stable_sort_by_type_of_upgrade(df):
    """Performs a stable sort within each q_id to order type_of_upgrade while preserving row order in other columns."""
    
    # Define the custom sorting order for type_of_upgrade
    type_order = {"PTO_IF": 1, "RNU": 2, "LDNU": 3, "PNU": 4, "ADNU": 5}

    # Assign a numerical sorting key; use a high number if type_of_upgrade is missing
    df['sort_key'] = df['type_of_upgrade'].map(lambda x: type_order.get(x, 99))

    # Perform a stable sort by q_id first, then by type_of_upgrade using the custom order
    df = df.sort_values(by=['q_id', 'sort_key'], kind='stable').drop(columns=['sort_key'])

    return df

# Apply stable sorting
  


df = df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()


df= reorder_columns(df)

#df = stable_sort_by_type_of_upgrade(df)  
#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 14/03_raw/cluster_14_style_Q.csv', index=False)
######################################################################################################################################
########################################
# STEP 8: Remove $ signs and convert to numeric
 

def clean_currency(value):
    """
    Cleans a string by explicitly removing $, *, (Note 2), and similar patterns,
    then converts it to a numeric value.
    """
    if isinstance(value, str):
        # Explicitly remove $, *, and any "(Note ...)"
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
        value = value.replace(',', '').strip()  # Remove commas and extra spaces
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA  # Return NaN for invalid entries
    




# Clean the specific columns
for col in ['estimated_cost_x_1000', 'escalated_cost_x_1000']:
    if col in df.columns:
        df[col] = df[col].apply(clean_currency)




df = df[df["type_of_upgrade"] != "may"]    
######################################################################################################################################
########################################
# STEP 9: Create Total rows


df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)


  

# Create Total rows
new_rows = []
columns_to_sum = ['estimated_cost_x_1000']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    print(f"\nProcessing q_id: {q_id}")  # Debug print
    unique_upgrades = group['type_of_upgrade'].dropna().unique()
    for upgrade in unique_upgrades:
        if pd.isna(upgrade):
            continue
        
        rows = group[group['type_of_upgrade'] == upgrade]

        # Debug: Print current group
        print(f"\nChecking Upgrade: {upgrade}, Total Rows Present?:", 
              ( (group['item'] == 'no')).any())

        # Check if a Total row already exists for this (q_id, upgrade)
        total_exists = ((group['item'] == 'no')).any()
        
        if total_exists:
            print(f"Skipping Total row for {upgrade} (already exists).")
            continue
        
        total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings
        total_row['q_id'] = q_id
        total_row['type_of_upgrade'] = f"Total {upgrade}"
        total_row['item'] = 'no'

        # Populate specified columns from the existing row
        first_row = rows.iloc[0]
        for col in columns_to_populate:
            if col in df.columns:
                total_row[col] = first_row[col]

        # Sum the numeric columns
        for col in columns_to_sum:
            if col in rows.columns:
                total_row[col] = rows[col].sum()
            else:
                total_row[col] = 0  # Default to 0 if column is missing

        print(f"Creating Total row for {upgrade}")  # Debug print
        new_rows.append(total_row)

# Convert list to DataFrame and append
if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    print("\nNew Total Rows Created:\n", total_rows_df)  # Debug print
    # 1) Diagnose
    dups = df.columns[df.columns.duplicated()]
    print("👉 Duplicate column names in df:", dups.tolist())

    # 2) Drop perfect duplicates
    df = df.loc[:, ~df.columns.duplicated()]
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)


if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


df = stable_sort_by_type_of_upgrade(df)

 


#: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    item_index = cols.index('item')
    type_index = cols.index('type_of_upgrade')
    if item_index < type_index:
        cols.insert(type_index + 1, cols.pop(item_index))
    else:
        cols.insert(type_index + 1, cols.pop(item_index))
    df = df[cols]




#  Remove "Total" values from cost_allocation_factor if they appear in type_of_upgrade
if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if (
            pd.notna(row['type_of_upgrade']) and 'Total' in str(row['type_of_upgrade'])
        ) else row.get('cost_allocation_factor'),
        axis=1
    )
    

if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if 'Total' in str(row.get('cost_allocation_factor', '')) else row.get('cost_allocation_factor'),
        axis=1
    )






def clean_estimated_time(value):
    """
    Removes the word 'month' or 'months' (case insensitive) from the value.
    Leaves behind any numbers or number ranges (e.g. "6", "6-12").
    """
    if isinstance(value, str):
        # Remove 'month' or 'months' (case-insensitive), optionally with spaces around them.
        cleaned_value = re.sub(r'(?i)\s*months?\s*', '', value)
        
        return cleaned_value.strip()
    return value





# Then apply it to your column, for example with Pandas:
df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)


def clean_estimated_time(value):
    if isinstance(value, str):
         
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
    return value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

numeric_columns = [
    'cost_allocation_factor',
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
    'total_estimated_cost_x_1000_escalated',
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
    'estimated_cost_x_1000_escalated_without_itcca',
    'adnu_cost_rate_x_1000_escalated'
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


if 'upgrade' in df.columns:
    df['upgrade'] = df['upgrade'].ffill()      


df.drop('type_of_upgrade_2', axis=1, inplace=True, errors='ignore') 

#df= reorder_columns(df)



# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_itemized_addendums.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    totals_df = df[df['item'] == 'no'].drop(columns=totals_columns, errors='ignore')
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_total_addendums.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_2_cluster_2_style_R_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_2_cluster_2_style_R_total.csv'.")


if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())



Processing q_id: 558

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 569

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

New Total Rows Created:
    q_id  cluster  req_deliverability  latitude  longitude capacity  \
0   558        2                 NaN       NaN        NaN     None   
1   558        2                 NaN       NaN        NaN     None   
2   558        2                 NaN       NaN        NaN     None   
3   569        2                 NaN       NaN        NaN     None   
4   569        2                 NaN       NaN        NaN     None   

                      point_of_interconnection type_of_upgrade upgrade  \
0                              

# Merge- Complete replace

In [2]:
df1 = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster SGIP-TC/02_intermediate/costs_phase_2_cluster_SGIP-TC_style_R_itemized.csv')
df2 = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster SGIP-TC/02_intermediate/costs_phase_2_cluster_SGIP-TC_style_R_total.csv')

df1.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster SGIP-TC/01_clean/costs_phase_2_cluster_1_style_SGIP-TC_itemized_updated.csv', index=False)
df2.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster SGIP-TC/01_clean/costs_phase_2_cluster_1_style_SGIP-TC_total_updated.csv', index=False)

# Checking Scraped Data

# Orignals only

# Trying to compare the total cost across all types of upgrade as that is given in the pdfs

In [1]:
import pandas as pd

# ---------------------- Configuration ---------------------- #

ITEMIZED_CSV_PATH       = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_itemized.csv'
TOTALS_CSV_PATH         = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_total.csv'
AGGREGATE_CSV_PATH      = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_aggregate.csv'

TOTALS_ESTIMATED_COLUMN = 'estimated_cost_x_1000'
TOTALS_ESCALATED_COLUMN = 'escalated_cost_x_1000'

REQUIRED_UPGRADES       = ['PTO_IF', 'RNU', 'LDNU', 'ADNU']

MISMATCHES_CSV_PATH     = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/mismatches.csv'

# ---------------------- Load Data ---------------------- #

itemized_df = pd.read_csv(ITEMIZED_CSV_PATH, dtype={'type_of_upgrade': str})
totals_df   = pd.read_csv(TOTALS_CSV_PATH, dtype={'type_of_upgrade': str})
agg_df      = pd.read_csv(AGGREGATE_CSV_PATH, dtype=str)

# ---------------------- Clean aggregate costs ---------------------- #

# Remove $ and commas, then convert to float
for col in [TOTALS_ESTIMATED_COLUMN, TOTALS_ESCALATED_COLUMN]:
    agg_df[col] = (
        agg_df[col]
        .str.replace(r'[\$,]', '', regex=True)
        .astype(float)
        .fillna(0.0)
    )

# ---------------------- Build aggregate lookup ---------------------- #

agg_grouped = (
    agg_df
    .groupby('q_id', as_index=False)
    .agg({
        TOTALS_ESTIMATED_COLUMN: 'sum',
        TOTALS_ESCALATED_COLUMN: 'sum'
    })
)

# create lookup
agg_lookup = agg_grouped.set_index('q_id').to_dict(orient='index')
agg_qids   = set(agg_grouped['q_id'])

# ---------------------- Numeric convert itemized ---------------------- #

for col in ['estimated_cost_x_1000','escalated_cost_x_1000']:
    itemized_df[col] = (
        itemized_df[col]
        .astype(str)
        .str.replace(r'[\$,]', '', regex=True)
        .astype(float)
        .fillna(0.0)
    )

# ---------------------- Check missing upgrades for  Q_ids ---------------------- #

# ---------------------- Check missing upgrades in totals_df (unconditionally) ---------------------- #

print("=== Missing required upgrades in totals dataset ===")
missing = []
for q in sorted(totals_df['q_id'].unique()):
    ups = (
        totals_df
        .loc[totals_df['q_id'] == q, 'type_of_upgrade']
        .dropna()
        .unique()
        .tolist()
    )
    miss = [u for u in REQUIRED_UPGRADES if u not in ups]
    if miss:
        missing.append((q, miss))

if missing:
    for q, miss in missing:
        print(f"Q_id {q} missing: {miss}")
else:
    print("None — every Q_id has all required upgrades in totals_df.")


# ---------------------- Check duplicate upgrades in totals dataset ---------------------- #

print("\n=== Duplicate upgrades in totals dataset ===")
dups = []
for q, group in totals_df.groupby('q_id'):
    dup_types = group['type_of_upgrade'][group['type_of_upgrade'].duplicated()].unique().tolist()
    if dup_types:
        dups.append((q, dup_types))

if dups:
    for q, dup in dups:
        print(f"Q_id {q} duplicates: {dup}")
else:
    print("No duplicates found in totals dataset.")

# ---------------------- Compute per-q_id itemized total ---------------------- #

itemized_totals = (
    itemized_df[itemized_df['q_id'].isin(agg_qids)]
    .groupby('q_id', as_index=False)
    .agg({
        'estimated_cost_x_1000':'sum',
        'escalated_cost_x_1000':'sum'
    })
)

itemized_totals['itemized_total'] = itemized_totals.apply(
    lambda r: r['estimated_cost_x_1000'] if r['estimated_cost_x_1000']>0 else r['escalated_cost_x_1000'],
    axis=1
)

# ---------------------- Compare against aggregate totals ---------------------- #

mismatches = []
for _, row in itemized_totals.iterrows():
    q = row['q_id']
    it = row['itemized_total']
    av = agg_lookup[q][TOTALS_ESTIMATED_COLUMN] if agg_lookup[q][TOTALS_ESTIMATED_COLUMN]>0 else agg_lookup[q][TOTALS_ESCALATED_COLUMN]
    # skip both zero
    if it==0 and av==0:
        continue
    if abs(it - av) > 1e-6:
        mismatches.append({
            'q_id': q,
            'itemized_total': it,
            'aggregate_total': av,
            'difference': it - av
        })

mismatches_df = pd.DataFrame(mismatches)

# ---------------------- Report & Save ---------------------- #

if mismatches_df.empty:
    print("\n✅ All itemized sums match the aggregate totals for Q_ids in aggregate.")
else:
    print(f"\n⚠️  Found {len(mismatches_df)} mismatches:")
    print(mismatches_df)

mismatches_df.to_csv(MISMATCHES_CSV_PATH, index=False)
print(f"\nMismatches written to {MISMATCHES_CSV_PATH}")


=== Missing required upgrades in totals dataset ===
Q_id 552 missing: ['ADNU']
Q_id 561 missing: ['ADNU']
Q_id 565 missing: ['ADNU']
Q_id 569 missing: ['LDNU', 'ADNU']
Q_id 574 missing: ['ADNU']
Q_id 583 missing: ['ADNU']
Q_id 588 missing: ['ADNU']
Q_id 589 missing: ['ADNU']
Q_id 590 missing: ['ADNU']
Q_id 593 missing: ['ADNU']
Q_id 602 missing: ['ADNU']
Q_id 606 missing: ['LDNU', 'ADNU']
Q_id 608 missing: ['ADNU']

=== Duplicate upgrades in totals dataset ===
No duplicates found in totals dataset.

✅ All itemized sums match the aggregate totals for Q_ids in aggregate.

Mismatches written to /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/mismatches.csv


# addendums

In [61]:
import pandas as pd

# ---------------------- Configuration ---------------------- #

ITEMIZED_CSV_PATH       = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_itemized_addendums.csv'
TOTALS_CSV_PATH         = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_total_addendums.csv'
AGGREGATE_CSV_PATH      = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/02_intermediate/costs_phase_2_cluster_2_style_R_aggregate_addendums.csv'

TOTALS_ESTIMATED_COLUMN = 'estimated_cost_x_1000'
TOTALS_ESCALATED_COLUMN = 'escalated_cost_x_1000'


REQUIRED_UPGRADES       = ['PTO_IF', 'RNU', 'LDNU', 'ADNU']

MISMATCHES_CSV_PATH     = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/mismatches.csv'

# ---------------------- Load Data ---------------------- #

itemized_df = pd.read_csv(ITEMIZED_CSV_PATH, dtype={'type_of_upgrade': str})
totals_df   = pd.read_csv(TOTALS_CSV_PATH, dtype={'type_of_upgrade': str})
agg_df      = pd.read_csv(AGGREGATE_CSV_PATH, dtype=str)



# ---------------------- Clean aggregate costs ---------------------- #

dfs = [agg_df, itemized_df, totals_df]
for df in dfs:
    if TOTALS_ESTIMATED_COLUMN not in df.columns:
        df[TOTALS_ESTIMATED_COLUMN] = np.nan

    if TOTALS_ESCALATED_COLUMN not in df.columns:
        df[TOTALS_ESCALATED_COLUMN] = np.nan



# Remove $ and commas, then convert to float
for col in [TOTALS_ESTIMATED_COLUMN, TOTALS_ESCALATED_COLUMN]:
    # 1) If the column doesn't exist, create it as empty strings:
    if col not in agg_df.columns:
        agg_df[col] = ""
    # 2) Make sure anything that’s not already a string becomes a string,
    #    and replace NaN with the empty string so that .str won’t blow up:
    agg_df[col] = agg_df[col].fillna("").astype(str)
    # 3) Now remove any “$” or “,”, coerce to numeric, and fill remaining NaN with 0.0:
    agg_df[col] = (
        agg_df[col]
        .str.replace(r"[\$,]", "", regex=True)
        .pipe(pd.to_numeric, errors="coerce")  # convert the stripped string to float or NaN
        .fillna(0.0)
    )
# ---------------------- Build aggregate lookup ---------------------- #

agg_grouped = (
    agg_df
    .groupby('q_id', as_index=False)
    .agg({
        TOTALS_ESTIMATED_COLUMN: 'sum',
        TOTALS_ESCALATED_COLUMN: 'sum'
    })
)

# create lookup
agg_lookup = agg_grouped.set_index('q_id').to_dict(orient='index')
agg_qids   = set(agg_grouped['q_id'])

# ---------------------- Numeric convert itemized ---------------------- #

for col in ['estimated_cost_x_1000','escalated_cost_x_1000']:
    itemized_df[col] = (
        itemized_df[col]
        .astype(str)
        .str.replace(r'[\$,]', '', regex=True)
        .astype(float)
        .fillna(0.0)
    )

# ---------------------- Check missing upgrades for  Q_ids ---------------------- #

# ---------------------- Check missing upgrades in totals_df (unconditionally) ---------------------- #

print("=== Missing required upgrades in totals dataset ===")
missing = []
for q in sorted(totals_df['q_id'].unique()):
    ups = (
        totals_df
        .loc[totals_df['q_id'] == q, 'type_of_upgrade']
        .dropna()
        .unique()
        .tolist()
    )
    miss = [u for u in REQUIRED_UPGRADES if u not in ups]
    if miss:
        missing.append((q, miss))

if missing:
    for q, miss in missing:
        print(f"Q_id {q} missing: {miss}")
else:
    print("None — every Q_id has all required upgrades in totals_df.")


# ---------------------- Check duplicate upgrades in totals dataset ---------------------- #

print("\n=== Duplicate upgrades in totals dataset ===")
dups = []
for q, group in totals_df.groupby('q_id'):
    dup_types = group['type_of_upgrade'][group['type_of_upgrade'].duplicated()].unique().tolist()
    if dup_types:
        dups.append((q, dup_types))

if dups:
    for q, dup in dups:
        print(f"Q_id {q} duplicates: {dup}")
else:
    print("No duplicates found in totals dataset.")

# ---------------------- Compute per-q_id itemized total ---------------------- #

itemized_totals = (
    itemized_df[itemized_df['q_id'].isin(agg_qids)]
    .groupby('q_id', as_index=False)
    .agg({
        'estimated_cost_x_1000':'sum',
        'escalated_cost_x_1000':'sum'
    })
)

itemized_totals['itemized_total'] = itemized_totals.apply(
    lambda r: r['estimated_cost_x_1000'] if r['estimated_cost_x_1000']>0 else r['escalated_cost_x_1000'],
    axis=1
)

# ---------------------- Compare against aggregate totals ---------------------- #

mismatches = []
for _, row in itemized_totals.iterrows():
    q = row['q_id']
    it = row['itemized_total']
    av = agg_lookup[q][TOTALS_ESTIMATED_COLUMN] if agg_lookup[q][TOTALS_ESTIMATED_COLUMN]>0 else agg_lookup[q][TOTALS_ESCALATED_COLUMN]
    # skip both zero
    if it==0 and av==0:
        continue
    if abs(it - av) > 1e-6:
        mismatches.append({
            'q_id': q,
            'itemized_total': it,
            'aggregate_total': av,
            'difference': it - av
        })

mismatches_df = pd.DataFrame(mismatches)

# ---------------------- Report & Save ---------------------- #

if mismatches_df.empty:
    print("\n✅ All itemized sums match the aggregate totals for Q_ids in aggregate.")
else:
    print(f"\n⚠️  Found {len(mismatches_df)} mismatches:")
    print(mismatches_df)

mismatches_df.to_csv(MISMATCHES_CSV_PATH, index=False)
print(f"\nMismatches written to {MISMATCHES_CSV_PATH}")


=== Missing required upgrades in totals dataset ===
Q_id 569 missing: ['LDNU', 'ADNU']

=== Duplicate upgrades in totals dataset ===
No duplicates found in totals dataset.

✅ All itemized sums match the aggregate totals for Q_ids in aggregate.

Mismatches written to /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 2/mismatches.csv


# Final Data