# Table 8. Style Q

In [47]:
import os
import pdfplumber
import pandas as pd
import re
import PyPDF2
import traceback

# Define paths and project range
BASE_DIRECTORY ="/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_ORIGINAL = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_rawdata_cluster14_style_Q_originals.csv"
OUTPUT_CSV_PATH_ADDENDUM = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_rawdata_cluster14_style_Q_addendums.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_scraping_cluster14_style_Q_log.txt"
PROJECT_RANGE = range(1831, 2193)  # Inclusive range for q_ids in Clusters 13

# Initialize DataFrames
core_originals = pd.DataFrame()
core_addendums = pd.DataFrame()

# Initialize tracking variables
scraped_projects = set()
skipped_projects = set()
missing_projects = set()
scraped_pdfs = []
skipped_pdfs = []
addendum_pdfs = []
original_pdfs = []
style_n_pdfs = []  # List to track style N PDFs
total_pdfs_accessed = 0
total_pdfs_scraped = 0
total_pdfs_skipped = 0
original_has_table7 = {}  # Dictionary to track if original PDFs have table7

def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters, but keeps parentheses."""
    cleaned_headers = []
    for header in headers:
        if header is None:
            header = ""
        elif isinstance(header, str):
            header = header.lower()
            # collapse internal whitespace
            header = re.sub(r'\s+', ' ', header)
            # strip out everything except letters, digits, spaces, and parentheses
            header = re.sub(r'[^a-z0-9\s\(\)]', '', header)
            header = header.strip()
        cleaned_headers.append(header)
    return cleaned_headers

def clean_string_cell(value):
    """Cleans string cells by removing newlines and trimming spaces."""
    if isinstance(value, str):
        return value.replace('\n', ' ').strip()
    elif value is None:
        return ""
    else:
        return str(value).replace('\n', ' ').strip()
     

def contains_phrase(row, phrase):
    """Checks if any cell in a row contains a specific phrase."""
    regex_pattern = re.sub(r"\s+", r"\\s*", phrase)
    pattern = re.compile(regex_pattern, flags=re.IGNORECASE)
    return row.astype(str).apply(lambda cell: bool(pattern.search(cell))).any()

def extract_specific_phrase(title):
    """
    Extracts a specific phrase from the table title based on predefined keywords.

    Args:
        title (str): The table title string.

    Returns:
        str: The extracted specific phrase if found, else the original title.
    """
    phrases = [
        "PTO",
        "Reliability Network Upgrade",
        "Area Delivery Network Upgrade",
        "Local Delivery Network",
        "Other Potential Network Upgrade",
        "Area Delivery Network Upgrades",
        "Conditionally Assigned Network Upgrades",
        "Local Off-Peak Network Upgrade",
        "ADNU",
        "LDNU",
        "RNU"
    ]

    for phrase in phrases:
        if  re.search(rf"\b{re.escape(phrase)}\b(?=\d|\W|$)", title, re.IGNORECASE):
        
         #re.search(rf"\b{re.escape(phrase)}\b", title, re.IGNORECASE):
            return phrase
    return title  # Fallback to the entire title if no specific phrase is found

def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type of upgrade",
        "upgrade",
        "description",
        "cost allocation factor"
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df

def search_gps_coordinates(text, log_file):
    """Search for GPS coordinates using multiple patterns."""
    gps_coords = re.search(r"gps coordinates:\s*([\d\.\-]+),\s*([\d\.\-]+)", text, re.IGNORECASE)
    if gps_coords:
        print(f"Found GPS coordinates: {gps_coords.groups()}", file=log_file)
        return gps_coords.groups()

    project_coords = re.search(r"latitude[:\s]*([\d\.\-]+)[^\d]+longitude[:\s]*([\d\.\-]+)", text, re.IGNORECASE)
    if project_coords:
        print(f"Found project coordinates: {project_coords.groups()}", file=log_file)
        return project_coords.groups()

    gps_coords_directional = re.search(
        r"gps coordinates:\s*([\d\.\-]+)\s*[nNsS],\s*([\d\.\-]+)\s*[eEwW]", text, re.IGNORECASE)
    if gps_coords_directional:
        lat, lon = gps_coords_directional.groups()
        latitude = lat if "N" in text.upper() else f"-{lat}"  # Adjust latitude sign
        longitude = lon if "E" in text.upper() else f"-{lon}"  # Adjust longitude sign
        print(f"Found directional GPS coordinates: {(latitude, longitude)}", file=log_file)
        return (latitude, longitude)

    print("GPS coordinates not found.", file=log_file)
    return (None, None)

def extract_table1(pdf_path, log_file):
    """
    Extracts the Point of Interconnection from Table 1 in the provided PDF.
    Implements a retry mechanism with different table extraction settings if initial attempts fail.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.

    Returns:
        str: Extracted Point of Interconnection value,
             "Value Missing" if label found but no value,
             or None if not found.
    """
    print(f"\nProcessing {pdf_path} for Table 1 extraction...", file=log_file)
    point_of_interconnection = None

    # Define the regex pattern for 'Point of Interconnection' (case-insensitive)
    poi_pattern = re.compile(r"Point\s+of\s+Interconnection", re.IGNORECASE)

    # Define different table extraction settings to try
    table_settings_list = [
        {
            "horizontal_strategy": "text",
            "vertical_strategy": "lines",
            "snap_tolerance": 1,
        },
        {
            "horizontal_strategy": "lines",
            "vertical_strategy": "lines",
            "snap_tolerance": 2,  # Increased tolerance for retry
        }
    ]

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify all pages that contain "Table 1"
            table1_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"Table\s*1\b", text, re.IGNORECASE):
                    table1_pages.append(i)

            if not table1_pages:
                print("No Table 1 found in the PDF.", file=log_file)
                return None  # Return None if no Table 1 found

            first_page = table1_pages[0]
            last_page = table1_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 2  # Plus one to include the next page if needed

            print(f"Table 1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)

            # Flag to indicate if extraction was successful
            extraction_successful = False

            # Iterate through the specified page range
            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table 1...", file=log_file)

                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"\nAttempt {attempt} with table settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1} with current settings.", file=log_file)

                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty. Skipping.", file=log_file)
                            continue  # Skip empty tables

                        print(f"\n--- Table {table_index} on Page {page_number + 1} ---", file=log_file)
                        for row_num, row in enumerate(tab, start=1):
                            print(f"Row {row_num}: {row}", file=log_file)

                        # Iterate through each row in the table
                        for row_index, row in enumerate(tab, start=1):
                            # Iterate through each cell in the row
                            for cell_index, cell in enumerate(row, start=1):
                                if cell and poi_pattern.search(cell):
                                    # Assuming the next column contains the value
                                    poi_col_index = cell_index  # 1-based index
                                    adjacent_col_index = poi_col_index + 1  # Next column

                                    if adjacent_col_index <= len(row):
                                        poi_value = clean_string_cell(row[adjacent_col_index - 1])
                                        if poi_value:  # Check if the value is not empty
                                            point_of_interconnection = poi_value
                                            print(f"\nFound Point of Interconnection: '{point_of_interconnection}' "
                                                  f"(Page {page_number + 1}, Table {table_index}, Row {row_index})", file=log_file)
                                            extraction_successful = True
                                            break  # Exit the cell loop
                                        else:
                                            print(f"\nPoint of Interconnection label found but adjacent value is empty "
                                                  f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                            # Proceed to scan surrounding rows for the value
                                            poi_value_parts = []

                                            # Define the range to scan: two rows above and two rows below
                                            # Convert to 0-based index
                                            current_row_idx = row_index - 1
                                            start_scan = max(0, current_row_idx - 2)
                                            end_scan = min(len(tab), current_row_idx + 3)  # Exclusive

                                            print(f"Scanning rows {start_scan + 1} to {end_scan} for POI value parts.", file=log_file)

                                            for scan_row_index in range(start_scan, end_scan):
                                                # Skip the current row where the label was found
                                                if scan_row_index == current_row_idx:
                                                    continue

                                                scan_row = tab[scan_row_index]
                                                # Ensure the adjacent column exists in the scan row
                                                if adjacent_col_index - 1 < len(scan_row):
                                                    scan_cell = clean_string_cell(scan_row[adjacent_col_index - 1])
                                                    if scan_cell and not poi_pattern.search(scan_cell):
                                                        poi_value_parts.append(scan_cell)
                                                        print(f"Found POI part in row {scan_row_index + 1}: '{scan_cell}'", file=log_file)
                                                    elif poi_pattern.search(scan_cell):
                                                        # If another POI label is found, skip it
                                                        print(f"Encountered another POI label in row {scan_row_index + 1}. Skipping this row.", file=log_file)
                                                        continue

                                            if poi_value_parts:
                                                # Concatenate the parts to form the complete POI value
                                                point_of_interconnection = " ".join(poi_value_parts)
                                                print(f"\nConcatenated Point of Interconnection: '{point_of_interconnection}' "
                                                      f"(Page {page_number + 1}, Table {table_index})", file=log_file)
                                                extraction_successful = True
                                                break  # Exit the cell loop
                                            else:
                                                print(f"\nNo POI value found in the surrounding rows "
                                                      f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                                # Do not return immediately; proceed to retry
                                    else:
                                        print(f"\nPoint of Interconnection label found but no adjacent column "
                                              f"(Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                        # Do not return immediately; proceed to retry
                            if extraction_successful:
                                break  # Exit the row loop
                        if extraction_successful:
                            break  # Exit the table loop
                    if extraction_successful:
                        break  # Exit the attempt loop
                if extraction_successful:
                    break  # Exit the page loop

    except Exception as e:
        print(f"Error processing Table 1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None

    if not extraction_successful:
        # After all attempts, determine the appropriate return value
        if point_of_interconnection is not None and point_of_interconnection != "":
            # Label was found but no value
            print("Point of Interconnection label found but no adjacent value.", file=log_file)
            return "Value Missing"
        else:
            # Label not found
            print("Point of Interconnection not found in Table 1.", file=log_file)
            return None

    return point_of_interconnection

def extract_base_data(pdf_path, project_id, log_file):
    """Extract base data from the PDF and return as a DataFrame."""
    print("Extracting base data from PDF...", file=log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text

        text = clean_string_cell(text)

        queue_id = re.search(r"q[\s_-]*(\d+)", text, re.IGNORECASE)
        queue_id = queue_id.group(1) if queue_id else str(project_id)  # Use project_id if queue_id is not found
        print(f"Extracted Queue ID: {queue_id}", file=log_file)

        # Updated Cluster Extraction
        clusters = re.findall(r"queue[\s_-]*cluster[\s_-]*(\d+)", text, re.IGNORECASE)
        if '14' in clusters:
            cluster_number = '14'
        elif clusters:
            cluster_number = max(clusters, key=lambda x: int(x))  # Choose the highest cluster number found
        else:
            cluster_number = '14'  # Default to 12 if not found
        print(f"Extracted Cluster Number: {cluster_number}", file=log_file)

        deliverability_status = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = deliverability_status.group(1) if deliverability_status else None
        print(f"Extracted Deliverability Status: {deliverability_status}", file=log_file)

        # Extract Capacity
        capacity = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if capacity:
            capacity = int(capacity.group(1))
        else:
            capacity2 = re.search(r"(\d+)\s*mw", text)
            capacity = int(capacity2.group(1)) if capacity2 else None
        print(f"Extracted Capacity: {capacity}", file=log_file)

        # Extract Point of Interconnection
        point_of_interconnection = extract_table1(pdf_path, log_file)

        latitude, longitude = search_gps_coordinates(text, log_file)

        # Initialize base data dictionary
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "latitude": [latitude],
            "longitude": [longitude],
            "capacity": [capacity],
            "point_of_interconnection": [point_of_interconnection]
        }

        print("Base data extracted:", file=log_file)
        print(base_data, file=log_file)
        return pd.DataFrame(base_data)

    except Exception as e:
        print(f"Error extracting base data from {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()  # Return empty DataFrame on error

def adjust_rows_length(data_rows, headers):
    """Ensure each row in data_rows matches the length of headers by truncating or padding."""
    col_count = len(headers)
    for i in range(len(data_rows)):
        row = data_rows[i]
        if len(row) > col_count:
            data_rows[i] = row[:col_count]
        elif len(row) < col_count:
            data_rows[i].extend([""]*(col_count - len(row)))

def extract_table7(pdf_path, log_file, is_addendum=False):
    """
    Extracts Table 7 data from the provided PDF.

    Args:
        pdf_path (str): Path to the PDF file.
        log_file (file object): Log file to write print statements.
        is_addendum (bool): Whether the PDF is an addendum.

    Returns:
        pd.DataFrame: Extracted Table 7 data.
    """
    print(f"\nProcessing {pdf_path} for Table 7 extraction...", file=log_file)
    extracted_tables = []
    specific_phrase = None

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify pages that contain "Table 7-1" to "Table 7-5" with hyphen or dot
            table7_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"Table\s*8[-.]([1-2])\b", text, re.IGNORECASE):
                    table7_pages.append(i)

            if not table7_pages:
                print("No Table 7-1 to 7-6 found in the PDF.", file=log_file)
                return pd.DataFrame()

            first_page = table7_pages[0]
            last_page = table7_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 1  # Plus two to include possible continuation

            print(f"Table 7 starts on page {scrape_start + 1} and ends on page {scrape_end}", file=log_file)

            for page_number in range(scrape_start, min(scrape_end, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1}...", file=log_file)
                tables = page.find_tables(table_settings={
                    "horizontal_strategy": "lines",
                    "vertical_strategy": "lines",
                })

                for table_index, table in enumerate(tables):
                    tab = table.extract()
                    if not tab:
                        print(f"Table {table_index + 1} on page {page_number + 1} is empty. Skipping.", file=log_file)
                        continue

                    table_bbox = table.bbox
                    title_bbox = (0, 0, page.width, table_bbox[1])
                    title_text = page.within_bbox(title_bbox).extract_text() or ""
                    table_title = None

                    if title_text:
                        title_lines = title_text.split('\n')[::-1]
                        for line in title_lines:
                            line = line.strip()
                            match = re.search(r"(Modification\s+of\s+)?Table\s*8[-.]([1-2])[:\-\s]*(.*)", line, re.IGNORECASE)
                            if match:
                                table_title = match.group(3).strip()
                                break

                    if table_title:
                        if re.search(r"\b8-7\b", table_title, re.IGNORECASE):
                            print(f"Skipping Table 7-7 on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            continue  # Skip Table 7-7

                        # New Table 7 detected
                        specific_phrase = extract_specific_phrase(table_title)
                        print(f"New Table 7 detected: '{specific_phrase}' on page {page_number + 1}, table {table_index + 1}", file=log_file)

                        headers = clean_column_headers(tab[0])
                        data_rows = tab[1:]

                        # Create DataFrame for new table
                        try:
                            df_new = pd.DataFrame(data_rows, columns=headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for new table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue

                        # Handle ADNU-specific grouping
                        if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                            print("Detected 'Area Delivery Network Upgrade' table (new).", file=log_file)
                            if "adnu" in df_new.columns:
                                if "type of upgrade" not in df_new.columns:
                                    # Group all adnu rows into one 'upgrade' row
                                    adnu_values = df_new["adnu"].dropna().astype(str).tolist()
                                    grouped_adnu = " ".join(adnu_values)
                                    other_columns = df_new.drop(columns=["adnu"]).iloc[0].to_dict()

                                    df_grouped = pd.DataFrame({
                                        "upgrade": [grouped_adnu],
                                        "type of upgrade": [specific_phrase]
                                    })

                                    for col, value in other_columns.items():
                                        df_grouped[col] = value

                                    print("Grouped all 'adnu' rows into a single 'upgrade' row for new ADNU table.", file=log_file)
                                    df_new = df_grouped
                                else:
                                    # If 'type of upgrade' exists, just rename adnu if needed
                                    if "upgrade" in df_new.columns:
                                        df_new.drop(columns=['adnu'], inplace=True)
                                        print("Dropped 'adnu' column to avoid duplicate 'upgrade'.", file=log_file)
                                    else:
                                        df_new.rename(columns={'adnu': 'upgrade'}, inplace=True)
                                        print("Renamed 'adnu' to 'upgrade' in new ADNU table.", file=log_file)
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new ADNU table.", file=log_file)
                            else:
                                # If 'type of upgrade' exists and first row is none, replace only first row if needed
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    df_new.at[0, "type of upgrade"] = specific_phrase
                                    print("Replaced None in 'type of upgrade' first row for new ADNU table.", file=log_file)
                        else:
                            # Non-ADNU new tables
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new non-ADNU table.", file=log_file)
                            else:
                                # If 'type of upgrade' exists and first row is none, replace only first row if needed
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print("Replacing None in 'type of upgrade' for the first row in new non-ADNU table.", file=log_file)
                                    df_new.at[0, "type of upgrade"] = specific_phrase

                        # Ensure no duplicate columns
                        if df_new.columns.duplicated().any():
                            print("Duplicate columns detected in new table. Dropping duplicates.", file=log_file)
                            df_new = df_new.loc[:, ~df_new.columns.duplicated()]

                        extracted_tables.append(df_new)
                    else:
                        # Continuation Table
                        if not extracted_tables:
                            print(f"No previous Table 7 detected to continue with on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue

                        last_table = extracted_tables[-1]
                        expected_columns = last_table.columns.tolist()

                        print(f"Continuation Table detected on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        data_rows = tab

                        # Check if the first row is a header row
                        # As per your latest instruction, we will treat all continuation table rows as data points
                        # without any header detection
                        # However, you mentioned checking if there is a header row first, so we'll implement that

                        # Detect if first row is a header
                        header_keywords = ["type of upgrade", "adnu", "MW at POI"]
                        first_row = data_rows[0] if data_rows else []
                        is_header_row = any(
                            any(re.search(rf"\b{kw}\b", clean_string_cell(cell).lower()) for kw in header_keywords)
                            for cell in first_row
                        )

                        if is_header_row:
                            # Handle header row in continuation table
                            headers = clean_column_headers(first_row)
                            data_rows = data_rows[1:]  # Exclude header row

                            # Update expected_columns by adding new columns if any
                            new_columns = [col for col in headers if col not in expected_columns]
                            if new_columns:
                                expected_columns.extend(new_columns)
                                print(f"Added new columns from continuation table: {new_columns}", file=log_file)

                            # Create a mapping of new columns to add with default NaN
                            for new_col in new_columns:
                                last_table[new_col] = pd.NA

                            # Reindex last_table to include new columns
                            last_table = last_table.reindex(columns=expected_columns)
                            extracted_tables[-1] = last_table

                            # Update 'type of upgrade' column in the first row if needed
                            if "type of upgrade" in headers:
                                type_upgrade_idx = headers.index("type of upgrade")
                                if pd.isna(data_rows[0][type_upgrade_idx]) or data_rows[0][type_upgrade_idx] == "":
                                    data_rows[0][type_upgrade_idx] = specific_phrase
                                    print(f"Replaced None in 'type of upgrade' first row for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            elif "upgrade" in headers:
                                upgrade_idx = headers.index("upgrade")
                                if pd.isna(data_rows[0][upgrade_idx]) or data_rows[0][upgrade_idx] == "":
                                    data_rows[0][upgrade_idx] = specific_phrase
                                    print(f"Replaced None in 'upgrade' first row for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            else:
                                # If 'type of upgrade' or 'upgrade' does not exist, add it
                                headers.append("type of upgrade")
                                expected_columns.append("type of upgrade")
                                for idx, row in enumerate(data_rows):
                                    data_rows[idx].append(specific_phrase)
                                print(f"Added 'type of upgrade' column and filled with '{specific_phrase}' for continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

                            # Handle ADNU-specific logic if applicable
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                if "adnu" in headers:
                                    if "upgrade" not in headers:
                                        # Rename 'adnu' to 'upgrade'
                                        adnu_idx = headers.index("adnu")
                                        headers[adnu_idx] = "upgrade"
                                        for row in data_rows:
                                            row[adnu_idx] = " ".join([str(cell) for cell in row[adnu_idx] if pd.notna(cell)])
                                        print("Renamed 'adnu' to 'upgrade' in continuation ADNU table.", file=log_file)
                                # Ensure 'type of upgrade' column is filled
                                if "type of upgrade" not in headers:
                                    headers.append("type of upgrade")
                                    expected_columns.append("type of upgrade")
                                    for row in data_rows:
                                        row.append(specific_phrase)
                                    print("Added 'type of upgrade' column with specific phrase for continuation ADNU table.", file=log_file)

                        else:
                            # No header row detected, treat all rows as data points
                            print(f"No header row detected in continuation table on page {page_number + 1}, table {table_index + 1}. Treating all rows as data.", file=log_file)

                        # Create DataFrame for continuation table
                        if is_header_row:
                            try:
                                df_continuation = pd.DataFrame(data_rows, columns=headers)
                            except ValueError as ve:
                                print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                                continue
                        else:
                            # Create DataFrame with expected_columns
                            # Handle cases where continuation table has more columns
                            standardized_data = []
                            for row in data_rows:
                                if len(row) < len(expected_columns):
                                    # Insert 'type of upgrade' or 'upgrade' with specific_phrase
                                    if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                        # For ADNU tables, assume missing "upgrade" column
                                        missing_cols = len(expected_columns) - len(row)
                                        #row += [specific_phrase] * missing_cols
                                        data_rows = [row[:7] + [specific_phrase] + row[7:] for row in data_rows]
                                        print(f"Inserted '{specific_phrase}' for missing columns in ADNU continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                                    else:
                                        # For non-ADNU tables, assume missing "type of upgrade" column
                                        missing_cols = len(expected_columns) - len(row)
                                        #row += [specific_phrase] * missing_cols
                                        data_rows = [ [specific_phrase]  for row in data_rows]
                                        print(f"Inserted '{specific_phrase}' for missing columns in non-ADNU continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                                elif len(row) > len(expected_columns):
                                    # Add new columns with default names
                                    extra_cols = len(row) - len(expected_columns)
                                    for i in range(extra_cols):
                                        new_col_name = f"column{len(expected_columns) + 1 + i}"
                                        expected_columns.append(new_col_name)
                                        last_table[new_col_name] = pd.NA
                                        print(f"Added new column '{new_col_name}' for extra data in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                                    row = row[:len(expected_columns)]

                                row_dict = dict(zip(expected_columns, [clean_string_cell(cell) for cell in row]))

                                # Handle 'type of upgrade' column
                                if "type of upgrade" in row_dict and (pd.isna(row_dict["type of upgrade"]) or row_dict["type of upgrade"] == ""):
                                    row_dict["type of upgrade"] = specific_phrase
                                    print(f"Replaced None in 'type of upgrade' for a row in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

                                standardized_data.append(row_dict)

                            try:
                                df_continuation = pd.DataFrame(standardized_data, columns=expected_columns)
                            except ValueError as ve:
                                print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                                continue


                             # Special Handling for "Area Delivery Network Upgrade" Tables in Continuation
                            if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                                if "type of upgrade" in df_continuation.columns:
                                    first_row = df_continuation.iloc[0]
                                    if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                        print(f"Replacing 'None' in 'type of upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                        df_continuation.at[0, "type of upgrade"] = specific_phrase
                                else:
                                    # If "type of upgrade" column does not exist, add it
                                    df_continuation["type of upgrade"] = specific_phrase
                                    print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                            else:
                                # General Handling for other tables
                                if "type of upgrade" in df_continuation.columns:
                                    first_row = df_continuation.iloc[0]
                                    if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                        print(f"Replacing 'None' in 'Type of Upgrade' for the first data row of continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)
                                        df_continuation.at[0, "type of upgrade"] = specific_phrase
                                else:
                                    # If "Type of Upgrade" column does not exist, add it
                                    df_continuation["type of upgrade"] = specific_phrase
                                    print(f"'Type of Upgrade' column added with value '{specific_phrase}' for continuation on page {page_number + 1}, table {table_index + 1}",file=log_file)


                        # Handle ADNU-specific logic in continuation tables
                        #if re.search(r"Area\s*Delivery\s*Network\s*Upgrade", specific_phrase, re.IGNORECASE):
                        #    print("Handling ADNU-specific logic in continuation table.", file=log_file)
                        #    if "upgrade" in df_continuation.columns and "adnu" not in df_continuation.columns:
                        #        # Ensure 'upgrade' column is present
                        #        if "upgrade" not in df_continuation.columns:
                        #            df_continuation["upgrade"] = specific_phrase
                        #            print("Added 'upgrade' column to continuation ADNU table.", file=log_file)

                        # Ensure no duplicate columns
                        if df_continuation.columns.duplicated().any():
                            print(f"Duplicate columns detected in continuation table on page {page_number + 1}, table {table_index + 1}. Dropping duplicates.", file=log_file)
                            df_continuation = df_continuation.loc[:, ~df_continuation.columns.duplicated()]

                        # Merge with the last extracted table
                        extracted_tables[-1] = pd.concat([extracted_tables[-1], df_continuation], ignore_index=True, sort=False)
                        print(f"Appended continuation table data to the last extracted table on page {page_number + 1}, table {table_index + 1}.", file=log_file)

    except Exception as e:
        print(f"Error processing Table 7 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

    # After processing all tables, concatenate them
    if extracted_tables:
        all_columns = set()
        for df in extracted_tables:
            all_columns.update(df.columns.tolist())

        standardized_tables = []
        for df in extracted_tables:
            standardized_df = df.reindex(columns=all_columns)
            standardized_tables.append(standardized_df)

        print("\nConcatenating all extracted Table 7 data...", file=log_file)
        try:
            table7_data = pd.concat(standardized_tables, ignore_index=True, sort=False)
            print(f"Successfully concatenated {len(standardized_tables)} tables.", file=log_file)
        except Exception as e:
            print(f"Error concatenating tables: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            table7_data = pd.DataFrame()
    else:
        print("No Table 7 data extracted.", file=log_file)
        table7_data = pd.DataFrame()

    return table7_data



def extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False):
    """Extracts Table 7 data and merges with base data."""
    base_data = extract_base_data(pdf_path, project_id, log_file)
    table7_data = extract_table7(pdf_path, log_file, is_addendum)

    if table7_data.empty:
        return base_data
    else:
        # Identify overlapping columns excluding 'point_of_interconnection'
        overlapping_columns = base_data.columns.intersection(table7_data.columns).difference(['point_of_interconnection'])
        table7_data = table7_data.drop(columns=overlapping_columns, errors='ignore')
        
        # Repeat base data for each row in table7_data
        base_data_repeated = pd.concat([base_data] * len(table7_data), ignore_index=True)
        
        try:
            # Concatenate base data with Table 7 data along columns
            merged_df = pd.concat([base_data_repeated, table7_data], axis=1, sort=False)
            
            # Ensure 'point_of_interconnection' is present and correctly populated
            if 'point_of_interconnection' not in merged_df.columns:
                merged_df['point_of_interconnection'] = base_data['point_of_interconnection'].iloc[0]
                print(f"Added 'point_of_interconnection' to merged data for {pdf_path}.", file=log_file)
            
            print(f"Merged base data with Table 7 data for {pdf_path}.", file=log_file)
            return merged_df
        except Exception as e:
            print(f"Error merging base data with Table 7 data for {pdf_path}: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            return base_data  # Fallback to base data only

def check_has_table7(pdf_path):
    """Checks if the PDF contains Table 7-1 to 7-5."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text() or ""
                if re.search(r"(Modification\s+of\s+)?Table\s*8[-.]([1-3])\b", text, re.IGNORECASE):
                    return True
    except Exception as e:
        # Handle potential errors when opening PDF
        return False
    return False

def has_network_upgrade_type_column(pdf_path, log_file):
    """Checks if any table in the PDF has a column header 'Network Upgrade Type'."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                tables = page.find_tables()
                for table_index, table in enumerate(tables, start=1):
                    tab = table.extract()
                    if not tab:
                        continue
                    headers = clean_column_headers(tab[0])
                    if "network upgrade type" in headers:
                        print(f"Found 'Network Upgrade Type' in PDF {pdf_path} on page {page_number}, table {table_index}.", file=log_file)
                        return True
    except Exception as e:
        print(f"Error checking 'Network Upgrade Type' in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
    return False

def is_addendum(pdf_path, log_file):
    """Checks if the PDF is an addendum by searching 'Addendum' or 'Revision' on the first page."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if len(pdf.pages) == 0:
                return False
            first_page = pdf.pages[0]
            text = first_page.extract_text() or ""
            print(f"Extracted Text: {text}", file= log_file)  # Debugging step
            # Case-insensitive check for 'Addendum' or 'Revision'
            text_lower = text.lower()
            return "addendum" in text_lower or "revision" in text_lower
    except Exception as e:
        # Handle potential errors when opening PDF
        return False

def make_unique_headers(headers):
    """
    Appends a suffix to duplicate headers to make them unique.

    Args:
        headers (list): List of column headers.

    Returns:
        list: List of unique column headers.
    """
    seen = {}
    unique_headers = []
    for header in headers:
        if header in seen:
            seen[header] += 1
            unique_headers.append(f"{header}_{seen[header]}")
        else:
            seen[header] = 1
            unique_headers.append(header)
    return unique_headers

def process_pdfs_in_folder():
    """Processes all PDFs in the specified project range and directory."""
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped

    SKIP_PROJECTS = {1860, 2003, 2006}

    # Ensure the log file directory exists
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)

    with open(LOG_FILE_PATH, 'w') as log_file:


        for project_id in PROJECT_RANGE:
            # Skip the projects in the SKIP_PROJECTS set
            if project_id in SKIP_PROJECTS:
                print(f"Skipping Project {project_id} (marked to skip)", file=log_file)
                continue

         
            project_path = os.path.join(BASE_DIRECTORY, str(project_id), "03_phase_2_study")
            if not os.path.exists(project_path):
                missing_projects.add(project_id)
                print(f"Project path does not exist: {project_path}", file=log_file)
                continue

            project_scraped = False  # Flag to track if any PDF in the project was scraped
            base_data_extracted = False
            base_data = pd.DataFrame()

            # **START OF CHANGES**
            # Separate PDFs into originals and addendums
            list_pdfs = [pdf for pdf in os.listdir(project_path) if pdf.endswith(".pdf")]
            originals = []
            addendums = []
            for pdf_name in list_pdfs:
                pdf_path = os.path.join(project_path, pdf_name)
                if is_addendum(pdf_path, log_file):
                    addendums.append(pdf_name)
                else:
                    originals.append(pdf_name)
            # **END OF CHANGES**

            # **START OF CHANGES**
            # Process original PDFs first
            for pdf_name in originals:
                
                pdf_path = os.path.join(project_path, pdf_name)
                total_pdfs_accessed += 1

                is_add = is_addendum(pdf_path, log_file)

                # Check if PDF has 'Network Upgrade Type' column
                if has_network_upgrade_type_column(pdf_path, log_file):
                    style_n_pdfs.append(pdf_name)
                    print(f"Skipping PDF: {pdf_name} from Project {project_id} (Style N)", file=log_file)
                    # Still check if original has table7
                    has_table7 = check_has_table7(pdf_path)
                    original_has_table7[project_id] = has_table7
                    continue  # Skip processing this PDF

                print(f"Accessing Original PDF: {pdf_name} from Project {project_id}", file=log_file)
                original_pdfs.append(pdf_name)

                try:
                    has_table7 = check_has_table7(pdf_path)
                    original_has_table7[project_id] = has_table7

                    if not has_table7:
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 7)", file=log_file)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 7)")
                        total_pdfs_skipped += 1
                        continue

                    if not base_data_extracted:
                        # Extract base data from original PDF
                        base_data = extract_base_data(pdf_path, project_id, log_file)
                        base_data_extracted = True
                        print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)

                    # Extract Table 7 and merge
                    df = extract_table7_and_replace_none(pdf_path, project_id, log_file, is_addendum=False)
                    if not df.empty:
                        core_originals = pd.concat([core_originals, df], ignore_index=True)
                        scraped_pdfs.append(pdf_name)
                        scraped_projects.add(project_id)
                        project_scraped = True
                        total_pdfs_scraped += 1
                        print(f"Scraped PDF: {pdf_name} from Project {project_id}")
                    else:
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)")
                        total_pdfs_skipped += 1

                except Exception as e:
                    skipped_pdfs.append(pdf_name)
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                    print(traceback.format_exc(), file=log_file)
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                    total_pdfs_skipped += 1
            # **END OF CHANGES**

            # **START OF CHANGES**
            # Then process addendum PDFs
            for pdf_name in addendums:
                pdf_path = os.path.join(project_path, pdf_name)
                total_pdfs_accessed += 1
                is_add = is_addendum(pdf_path, log_file)

                # Check if PDF has 'Network Upgrade Type' column
                if has_network_upgrade_type_column(pdf_path, log_file):
                    style_n_pdfs.append(pdf_name)
                    print(f"Skipping PDF: {pdf_name} from Project {project_id} (Style N)", file=log_file)
                    continue  # Skip processing this PDF

                print(f"Accessing Addendum PDF: {pdf_name} from Project {project_id}", file=log_file)
                addendum_pdfs.append(pdf_name)

                try:
                    has_table7 = check_has_table7(pdf_path)

                    if not has_table7:
                        if original_has_table7.get(project_id, False):
                            # Attempt to scrape alternative tables is no longer needed
                            # According to the latest request, alternative table scraping is removed
                            # Therefore, we skip addendum PDFs that do not have Table 7
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 7)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 7)")
                            total_pdfs_skipped += 1
                        else:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 7 and original does not have Table 7)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No Table 7 and original does not have Table 7)")
                            total_pdfs_skipped += 1
                        continue

                    if not is_add and not base_data_extracted:
                        # Extract base data from original PDF
                        base_data = extract_base_data(pdf_path, project_id, log_file)
                        base_data_extracted = True
                        print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)

                    if is_add and base_data_extracted:
                        # For addendums, use the extracted base data
                        table7_data = extract_table7(pdf_path, log_file, is_addendum=is_add)
                        if table7_data.empty and original_has_table7.get(project_id, False):
                            # Scrape alternative tables is removed, so skip if no data
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No relevant tables found)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (No relevant tables found)")
                            total_pdfs_skipped += 1
                        if not table7_data.empty:
                            # Merge base data with Table 7 data
                            merged_df = pd.concat([base_data] * len(table7_data), ignore_index=True)
                            merged_df = pd.concat([merged_df, table7_data], axis=1, sort=False)
                            core_addendums = pd.concat([core_addendums, merged_df], ignore_index=True)
                            scraped_pdfs.append(pdf_name)
                            scraped_projects.add(project_id)
                            project_scraped = True
                            total_pdfs_scraped += 1
                            print(f"Scraped Addendum PDF: {pdf_name} from Project {project_id}")
                        else:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                            print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)")
                            total_pdfs_skipped += 1
                except Exception as e:
                    skipped_pdfs.append(pdf_name)
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                    print(traceback.format_exc(), file=log_file)
                    # Optionally, print to ipynb
                    print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                    total_pdfs_skipped += 1
            # **END OF CHANGES**

            # After processing all PDFs for this project, check if any PDF was scraped
            if not project_scraped and os.path.exists(project_path):
                skipped_projects.add(project_id)

    # Rest of the code remains unchanged...

    # After processing all PDFs, save to CSV
    save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
    save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")

    # Calculate total projects processed
    total_projects_processed = len(scraped_projects) + len(skipped_projects)

    # Print summary to ipynb
    print("\n=== Scraping Summary ===")
    print(f"Total Projects Processed: {total_projects_processed}")
    print(f"Total Projects Scraped: {len(scraped_projects)}")
    print(f"Total Projects Skipped: {len(skipped_projects)}")
    print(f"Total Projects Missing: {len(missing_projects)}")
    print(f"Total PDFs Accessed: {total_pdfs_accessed}")
    print(f"Total PDFs Scraped: {total_pdfs_scraped}")
    print(f"Total PDFs Skipped: {total_pdfs_skipped}")

    print("\nList of Scraped Projects:")
    print(sorted(scraped_projects))

    print("\nList of Skipped Projects:")
    print(sorted(skipped_projects))

    print("\nList of Missing Projects:")
    print(sorted(missing_projects))

    print("\nList of Scraped PDFs:")
    print(scraped_pdfs)

    print("\nList of Skipped PDFs:")
    print(skipped_pdfs)

    print("\nList of Addendum PDFs:")
    print(addendum_pdfs)

    print("\nList of Original PDFs:")
    print(original_pdfs)

    print("\nList of Style N PDFs (Skipped due to 'Network Upgrade Type'):")
    print(style_n_pdfs)

    print("\nTotal Number of Style N PDFs:", len(style_n_pdfs))

    print("\nNumber of Original PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in original_pdfs]))
    print("Number of Addendum PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs]))

def save_to_csv(df, output_csv_path, data_type):
    """Cleans the DataFrame and saves it to a CSV file."""
    if df.empty:
        print(f"No data to save for {data_type}.")
        return

    # Clean up the entire DataFrame by cleaning string cells
    df = df.applymap(clean_string_cell)

    # Drop rows that contain specific phrases (e.g., "Type of Upgrade")
    df = df[~df.apply(lambda row: contains_phrase(row, "Type of Upgrade"), axis=1)]

    # Reorder columns as specified
    df = reorder_columns(df)
    print(f"\nColumns reordered for {data_type} as per specification.")

    # Ensure q_id is numeric for sorting, replace missing values with None
    if 'q_id' in df.columns:
        df['q_id'] = pd.to_numeric(df['q_id'], errors='coerce')

    # Save the DataFrame to CSV
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

def main():
    """Main function to execute the PDF scraping process."""
    process_pdfs_in_folder()

if __name__ == "__main__":
    main()


Scraped PDF: C14_Q1832_GoalLineReliability_PhII_01-30-2024.pdf from Project 1832
Skipped Addendum PDF: Q1859Mayacamas_GeothermalAppendix_AC14PhII_Revision1.pdf from Project 1859 (No Table 7 and original does not have Table 7)
Skipped Addendum PDF: Q1871Stageline_Energy_StorageAppendix_AC14PhIIAddendum1.pdf from Project 1871 (No Table 7 and original does not have Table 7)
Skipped Addendum PDF: Q1875Delilah_Energy_StorageAppendix_AC14PhIIRevision1.pdf from Project 1875 (No Table 7 and original does not have Table 7)
Skipped Addendum PDF: Q1881Spectrum_Energy_StorageAppendix_AC14PhIIRevision1.pdf from Project 1881 (No Table 7 and original does not have Table 7)
Skipped Addendum PDF: Q1932-Cougar Storage-Appendix_A-C14PhII-Revision1.pdf from Project 1932 (No Table 7 and original does not have Table 7)
Skipped Addendum PDF: Q1954Huaso_HybridAppendix_AC14PhIIAddendum1.pdf from Project 1954 (No Table 7 and original does not have Table 7)
Skipped Addendum PDF: Q1956SequoiaAppendix_AC14PhIIAdde

  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase


Scraped PDF: C14_Q2162_SolarDeMexicali_PhII_01-30-2024.pdf from Project 2162
Scraped PDF: C14_Q2165_Tower1EnergyStorage_PhII_01-30-2024.pdf from Project 2165


  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase


Scraped PDF: C14_Q2166_Umbriel_PhII_01-30-2024.pdf from Project 2166
Scraped PDF: C14_Q2167_HammerheadStorage_PhII_01-30-2024.pdf from Project 2167
Scraped PDF: C14_Q2172_Hyder_PhII_01-30-2024.pdf from Project 2172
Scraped PDF: C14_Q2173_LagoDomingoStorage_PhII_01-30-2024.pdf from Project 2173
Skipped PDF: C4PhII_Q2173_Attachment 1.pdf from Project 2173 (No Table 7)


  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase


Scraped PDF: C14_Q2176_YuhaDesertBatteryStorage_PhII_01-30-2024.pdf from Project 2176
Scraped PDF: C14_Q2177_BoulderBrushHybrid_PhII_01-30-2024.pdf from Project 2177
Scraped Addendum PDF: P2RPT-C14_Q2177_BoulderBrushHybrid_PhII_Addendum2_32624.pdf from Project 2177
Scraped Addendum PDF: P2RPT-C14_Q2177_BoulderBrushHybrid_PhII_Addendum_1_03072024.pdf from Project 2177
Scraped PDF: C14_Q2178_BellBluffStorage_PhII_01-30-2024.pdf from Project 2178
Scraped PDF: C14_Q2180_CargoStorage_PhII_01-30-2024.pdf from Project 2180
Scraped PDF: C14_Q2181_PinscherEnergyStorage_PhII_01-30-2024.pdf from Project 2181
Skipped PDF: C4PhII_Q2181_Attachment 1.pdf from Project 2181 (No Table 7)
Scraped Addendum PDF: P2RPT-C14_Q2181_PinscherEnergyStorage_PhII_Addendum1_4222024.pdf from Project 2181
Scraped PDF: C14_Q2182_TaylorStorage_PhII_01-30-2024.pdf from Project 2182
Scraped PDF: C14_Q2184_AmberjackEnergy_PhII_01-30-2024.pdf from Project 2184
Scraped PDF: C14_Q2185_GatewayEnergyStorage2_PhII_01-30-2024.pdf

  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase
  df_continuation.at[0, "type of upgrade"] = specific_phrase


Scraped PDF: C14_Q2187_EolicaDeRumorosa_PhII_01-30-2024.pdf from Project 2187
Scraped PDF: C14_Q2188_GeraniumEnergyStorage_PhII_01-30-2024.pdf from Project 2188
Scraped Addendum PDF: P2RPT-C14_Q2188_GeraniumEnergyStorage_PhII_Addendum1_3142024.pdf from Project 2188
Scraped PDF: C14_Q2192_PajaroValleyStorage_PhII_01-30-2024.pdf from Project 2192

Columns reordered for originals as per specification.

Data successfully saved to /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_rawdata_cluster14_style_Q_originals.csv

Columns reordered for addendums as per specification.

Data successfully saved to /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_rawdata_cluster14_style_Q_addendums.csv

=== Scraping Summary ===
Total Projects Processed: 297
Total Projects Scraped: 23
Total Projects Skipped: 274
Total Projects Missing: 62
Total

  df = df.applymap(clean_string_cell)
  df = df.applymap(clean_string_cell)


# Itemized and Totals

# Originals

In [94]:
import pandas as pd
import re
import unicodedata
import numpy as np

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_rawdata_cluster14_style_Q_originals.csv', dtype={'estimated_time_to_construct': str})

df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')
df['q_id'] = df['q_id'].replace(1170, 2185)


######################################################################################################################################
########################################
# STEP 0: CREATE DESCRIPTION COLUMN FROM COST ALLOCATION FACTOR


def move_non_numeric_text(value):
    """Move non-numeric, non-percentage text from cost allocation factor to description.
       If a value is moved, return None for cost allocation factor."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return value  # Keep numeric or percentage values
        return None  # Clear the value if it's text (moved to description)
    return value  # Return as is for non-string values


def extract_non_numeric_text(value):
    """Extract non-numeric, non-percentage text from the cost allocation factor column."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return None
        return value.strip()  # Return text entries as is
    return None  # Return None for non-string values



def clean_total_entries(value):
    """If the value starts with 'Total', remove numbers, commas, and percentage signs, keeping only 'Total'."""
    if isinstance(value, str) and value.startswith("Total"):
        return "Total"  # Keep only "Total"
    return value  # Leave other values unchanged


import re
import numpy as np

def move_dollar_values(df, src_col, dst_col):
    """
    Moves pure dollar amounts (e.g. "$3625.89", "$3,300.00", "$1,084.62") 
    from src_col into dst_col (as numeric), and blanks out src_col.
    """
    # 1) Column existence check
    if src_col not in df.columns or dst_col not in df.columns:
        raise KeyError(f"Columns {src_col!r} or {dst_col!r} not found in DataFrame")

    # 2) Trim whitespace
    s = df[src_col].astype(str).str.strip()

    # 3) Match patterns like "$123", "$1,234", "$12,345.67"
    pattern = r"^\$\d{1,3}(?:,\d{3})*(?:\.\d{1,2})?$"
    mask = s.str.match(pattern, na=False)

    # 4) Move into dst_col (stripped of "$" and ",", cast to float)
    cleaned = (
        s[mask]
        .str.replace(r"[\$,]", "", regex=True)
        .astype(float)
    )
    df.loc[mask, dst_col] = cleaned

    # 5) Blank out the original
    df.loc[mask, src_col] = np.nan

    return df







 


df = df[~df.apply(lambda row: row.astype(str).isin(["Allocated","Cost", "(Escalated","$k)","(Note 1)", "Total NU", "Cost (2023", "Project", "Allocation", "Upgrade", "Total NU Cost (2023 $k)",
                                                    "(2023 $k)", "Time to", "Construct", "(Months)", "(Notes 2", "and 4)"
                                                     ]).any(), axis=1)]




# Create the 'description' column from 'cost allocation factor'
#if 'cost allocation factor' in df.columns:
#    df['description'] = df['cost allocation factor'].apply(extract_non_numeric_text)
#    df['cost_allocation_factor'] = df['cost allocation factor'].apply(move_non_numeric_text)  # Clear moved values

######################################################################################################################################
########################################
#STEP 1 MERGE COLUMNS

def merge_columns(df):
    merge_columns_dict = {

        "upgrade": [
            "project",
            "upgrade",
            
            "unnamed_2" ],

        "estimated_cost_x_1000": [

            "estimated cost x 1000",
            "estimated cost x 1000 constant",
            "allocated cost (2023k)",
            "allocated_cost",
            "assigned cost",
            "allocated cost",
            "sum of allocated constant cost",
            "column13",
            "column17",
            "allocated cost (2023 k)",
            "allocated cost (2023 k) (note 1)",
            "allocated cost (2022 k)",
            "allocated cost (2023k)"
            

        ],    


        "escalated_cost_x_1000": [
            "escalated costs x 1000",
            "estimated cost x 1000 escalated",
            "allocated cost escalated",
            "estimated cost x 1000 escalated without itcca",
            "escalated cost x 1000",
            "allocated cost (escalated k) (note 1)"

            "sum of allocated escalated cost",
            "assigned cost escalated",
             "column19",
             "column23",
             "estimated cost x 1000 escalated (note 1)",
             "allocated cost (escalated k) (note 1)",
             


        ],

        "estimated_time_to_construct": [
            "estimated time to construct",
            "estimated time  to construct",
             "column27",
             "column25",
             "column29",
             "estimated time to construct (months) (notes 2 and 4)",
             "estimated time to construct (months) (note 3)",
             "estimated time to construct (months) (notes 2 and 5)",
             "estimated time to construct (months) (note 2 and 5)",


        ],

        "total_estimated_cost_x_1000": [
            "total nu cost",
            "total cost constant",
            "column5",
            
            "total nu cost (2023 k)",
            "total nu cost (2022 k)"
        ],
        "total_estimated_cost_x_1000_escalated": [
            "total estimated cost x 1000 escalalted",
            "total estimated cost x 1000 escalated",
            "total nu cost (escalated k) (note 1)",
            "total nu cost (escalate d k) (note 1)"
        ],
       
        "adnu_cost_rate_x_1000": [
            "adnu cost rate x 1000",
            "cost rate x 1000",
            "cost rate",
            "cost rate constant"
        ],

        "adnu_cost_rate_escalated_x_1000": [
        "cost rate escalated",
        ],

        "description": ["description"],

        "capacity": [
            "capacity",
            "project size",
            "project mw",
            "mw at poi"
        ],
        "cost_allocation_factor": [
            "cost allocation factor",
            "cost allocatio n factor",
            "cost allocati on factor",
            "project allocation",
            "column7",
            "column11"

        ],
        "estimated cost x 1000 escalated with itcca": [
            "estimated cost x 1000 escalated with itcca",

        ]
    }

    # Identify unnamed columns
    unnamed_columns = [col for col in df.columns if pd.isna(col) or col.strip() == "" or col.startswith("Unnamed")]
    if unnamed_columns:
        merge_columns_dict["description"].extend(unnamed_columns)

    for new_col, old_cols in merge_columns_dict.items():
        existing_cols = [col for col in old_cols if col in df.columns]
        if existing_cols:
            df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
            cols_to_drop = [col for col in existing_cols if col != new_col]
            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)

    return df

df = merge_columns(df)



######################################################################################################################################
########################################
# STEP 2: REMOVE DOLLAR SIGNED VALUES FROM 'estimated_time_to_construct'
######## Other clean up

def remove_dollar_values(value):
    """Remove dollar amounts (e.g., $3625.89, $3300) from 'estimated_time_to_construct'."""
    if isinstance(value, str) and re.search(r"^\$\d+(\.\d{1,2})?$", value.strip()):
        return None  # Replace with None if it's a dollar-signed number
    return value.strip() if isinstance(value, str) else value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(remove_dollar_values)

  


## Remove ranodm number in Total row:    
# Apply cleaning function to "upgrade" column after merging
if 'upgrade' in df.columns:
    df['upgrade'] = df['upgrade'].apply(clean_total_entries)


#if 'cost_allocation_factor' in df.columns:
#    df['description'] = df['cost_allocation_factor'].apply(extract_non_numeric_text)
#    df['cost_allocation_factor'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values    


# Clear cost_allocation_factor for rows where upgrade is "Total" (case-insensitive)
df.loc[df['upgrade'].str.lower() == 'total', 'cost_allocation_factor'] = None  # or use "" if you prefer an empty string


 

#df['description'] = df['description'].apply(remove_dollar_values)


 
    
######################################################################################################################################
########################################
# STEP 3: DROP UNNEEDED COLUMNS

#df.drop(['unnamed_3', 'unnamed_15', 'unnamed_18', 'unnamed_16', 'estimated cost x 1000 escalated with itcca'], axis=1, inplace=True, errors='ignore')

df.drop(['total nu'	,'cost', 'estimated', 'allocated', '2154','estimated cost x', 
           'ks remy', 'estimated time to', '2161', 'alisa solar energy complex',
         'column21', 'column3', 'column9',	'column15', 'column33',	'column31',	'cost allocation', "allocated cost (escalated k) with itcca (note 2)", "estimated cost x 1000 escalated with itcca (note 2)"], axis=1, inplace=True)



######################################################################################################################################
########################################
#STEP 4: NAMING CONVENTION
def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value



df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade']= df['type_of_upgrade'].apply(remove_dollar_values)




df = move_dollar_values(df, 'description','escalated_cost_x_1000')


# Convert estimated_time_to_construct to integer (remove decimals) and keep NaNs as empty
df['estimated_time_to_construct'] = pd.to_numeric(df['estimated_time_to_construct'], errors='coerce').apply(lambda x: int(x) if pd.notna(x) else None)


 


def move_description_to_upgrade(df):
    """Moves description to upgrade if type_of_upgrade is 'PTO' and upgrade is empty."""
    
    # Ensure columns are strings and replace NaNs with empty strings for processing
    df['upgrade'] = df['upgrade'].astype(str).replace("nan", "").fillna("")
    df['description'] = df['description'].astype(str).replace("nan", "").fillna("")

    # Debug: Print before update
    #print("Before update (only PTO rows):")
    #print(df[df['type_of_upgrade'] == 'PTO'][['type_of_upgrade', 'upgrade', 'description']])

    # Apply row-wise transformation
    def move_if_empty(row):
        if row['type_of_upgrade'] == 'PTO' and row['upgrade'].strip() == "" and row['description'].strip() != "":
            row['upgrade'] = row['description']  # Move description to upgrade
            row['description'] = None # Clear description
        return row

    df = df.apply(move_if_empty, axis=1)

    # Debug: Print after update
    #print("\nAfter update (only PTO rows):")
    #print(df[df['type_of_upgrade'] == 'PTO'][['type_of_upgrade', 'upgrade', 'description']])

    return df

# Apply function
#df = move_description_to_upgrade(df)


# Your list of upgrade phrases
upgrade_phrases = ["IRNU", "GRNU", "CANU-D", "IRNU-A", "LDNU", "CANU-GR", "PNU", "CANU"]

def simple_move_upgrades(df, desc_col='description', upg_col='upgrade'):
    # Ensure the upgrade column exists
    df[upg_col] = df.get(upg_col, None)

    # Iterate row by row
    for i, desc in df[desc_col].fillna("").items():
        for ph in upgrade_phrases:
            if ph in desc:
                # Set the upgrade column to the found phrase
                df.at[i, upg_col] = ph
                # Remove the phrase from the description
                df.at[i, desc_col] = desc.replace(ph, "").strip()
                break  # stop after the first match

    return df


df = simple_move_upgrades(df)


df = df[df['upgrade'].fillna('').astype(str).str.strip() != '']

######################################################################################################################################
########################################
#STEP 5: REMOVING TOTAL ROW, AS THE PDFS GIVE TOTAL NETWORK COST RATHER THAN BY RNU, LDNU AS WE HAD BEFORE
# Remove rows where upgrade is "Total" (case-insensitive)

df['tot'] = df.apply(
    lambda row: 'yes' if (
        (pd.notna(row.get('upgrade')) and 'Total' in str(row['upgrade']))  
    ) else 'no',
    axis=1
) 

# Now extract ONLY "Total" rows with a foolproof match
total_rows_df = df[df['tot'] == 'yes']

total_rows_df = total_rows_df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()
total_rows_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_total_network.csv', index=False) 
df = df[df['upgrade'].str.strip().str.lower() != 'total']
df.drop('tot', axis=1, inplace= True)




######################################################################################################################################
########################################
# STEP 6: Move upgrade phrases like IRNU from upgrade column to a new column upgrade_classificatio and also replace type_of_upgrade with LDNU, CANU



# Define the list of phrases for upgrade classification
upgrade_phrases = ["IRNU", "GRNU", "CANU-D", "IRNU-A", "LDNU", "CANU-GR", "PNU", "CANU"]




def classify_and_fill_upgrades(df):
    """Moves upgrade phrases to upgrade_classification, forward-fills (including type_of_upgrade), and removes redundant rows."""

    # 1) Normalize
    df['upgrade'] = df['upgrade'].astype(str).fillna("")
    df['upgrade_classification'] = None

    # 2) Mark the phrase‐rows
    df.loc[df['upgrade'].isin(upgrade_phrases), 'upgrade_classification'] = df['upgrade']

    # 3) Single pass per‐q_id ffill
    def ffill_qid(group):
        current = None
        for idx in group.index:
            up = group.at[idx, 'upgrade'].strip()

            if up in upgrade_phrases:
                current = up
            elif up.lower() == "total":
                current = None

            # fill classification
            group.at[idx, 'upgrade_classification'] = current

            # forward‐fill type_of_upgrade whenever it’s blank
            if not group.at[idx, 'type_of_upgrade'] or pd.isna(group.at[idx, 'type_of_upgrade']):
                group.at[idx, 'type_of_upgrade'] = current or ""

        return group

    df = df.groupby('q_id', group_keys=False).apply(ffill_qid)

    # 4) drop the original phrase‐marker rows
    df = df[~df['upgrade'].isin(upgrade_phrases)]

    # 5) finally, coerce the “master” classification back into type_of_upgrade
    df.loc[df['upgrade_classification'] == 'LDNU',                     'type_of_upgrade'] = 'LDNU'
    df.loc[df['upgrade_classification'].isin(['CANU-GR','CANU']),      'type_of_upgrade'] = 'CANU'
    df.loc[df['upgrade_classification'] == 'PNU',                      'type_of_upgrade'] = 'PNU'
    df.loc[df['upgrade_classification'].isin(['IRNU','GRNU','IRNU-A']), 'type_of_upgrade'] = 'RNU'

    return df

# Usage
df = classify_and_fill_upgrades(df)






df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_cluster_14_style_Q.csv', index=False)





mappings = {
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "IRNU": 'RNU',
'Total PTO_IF': 'PTO_IF',
 'Total RNU': 'RNU',
 'Total LDNU': 'LDNU',
 'Total OPNU' : 'OPNU',
 'Total CANU': 'CANU',
 'Total LOPNU': 'LOPNU',
 'Total ADNU': 'ADNU',
 "Total IRNU": 'RNU',
}


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df     


if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )



######################################################################################################################################
########################################
#STEP 7: Stable sort type of upgrade

def stable_sort_by_type_of_upgrade(df):
    """Performs a stable sort within each q_id to order type_of_upgrade while preserving row order in other columns."""
    
    # Define the custom sorting order for type_of_upgrade
    type_order = {"PTO_IF": 1, "RNU": 2, "LDNU": 3, "PNU": 4, "ADNU": 5}

    # Assign a numerical sorting key; use a high number if type_of_upgrade is missing
    df['sort_key'] = df['type_of_upgrade'].map(lambda x: type_order.get(x, 99))

    # Perform a stable sort by q_id first, then by type_of_upgrade using the custom order
    df = df.sort_values(by=['q_id', 'sort_key'], kind='stable').drop(columns=['sort_key'])

    return df

# Apply stable sorting
  


df = df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()


df= reorder_columns(df)

df = stable_sort_by_type_of_upgrade(df)  
df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/cluster_14_style_Q.csv', index=False)
######################################################################################################################################
########################################
# STEP 8: Remove $ signs and convert to numeric
 

def clean_currency(value):
    """
    Cleans a string by removing:
      - Dollar signs ($)
      - Asterisks (*)
      - Any parenthesized content "(...)" (Notes or otherwise)
      - Commas
    Then converts to a numeric float, returning pd.NA on failure.
    """
    if isinstance(value, str):
        # 1) Remove $ and *
        value = value.replace('$', '').replace('*', '')
        # 2) Remove anything in parentheses, e.g. "(Note 6 and 7)"
        value = re.sub(r'\([^)]*\)', '', value)
        # 3) Remove commas and trim spaces
        value = value.replace(',', '').strip()
    # 4) Convert to numeric, coercing invalid to NaN
    return pd.to_numeric(value, errors='coerce')
    




# Clean the specific columns
for col in ['estimated_cost_x_1000', 'escalated_cost_x_1000', 'total_estimated_cost_x_1000_escalated', 'total_estimated_cost_x_1000', 'adnu_cost_rate_x_1000', 'adnu_cost_rate_x_1000_escalated']:
    if col in df.columns:
        df[col] = df[col].apply(clean_currency)

 

import re
import pandas as pd

def move_time_numbers(df, description_col='description', time_col='estimated_time_to_construct'):
    """
    Moves standalone integer values (e.g., 35, 9, 12) from `description_col`
    into `time_col`, and blanks out the original description cell.
    Only moves cells that are purely an integer, whether stored as a number
    or as a digit‐only string.
    """
    # 1) sanity check
    if description_col not in df.columns or time_col not in df.columns:
        raise KeyError(f"Columns {description_col!r} or {time_col!r} not found in DataFrame")
    
    # 2) helper: detect a “pure” integer cell
    def is_pure_int(val):
        if pd.isna(val):
            return False
        # if it’s actually numeric, check it’s an integer
        if isinstance(val, (int, float)):
            return float(val).is_integer()
        # if it’s a string, strip and see if it’s all digits
        if isinstance(val, str):
            s = val.strip()
            return bool(re.fullmatch(r"\d+", s))
        return False
    
    # 3) build mask
    mask = df[description_col].apply(is_pure_int)
    
    # 4) Move values
    #   * cast numeric → int → str so our time_col is consistently string/int
    times = df.loc[mask, description_col].apply(lambda v: str(int(float(v))) if not pd.isna(v) else "")
    df.loc[mask, time_col] = times
    
    # 5) blank out description
    df.loc[mask, description_col] = ""
    
    return df


df = move_time_numbers(df, 'description', 'estimated_time_to_construct')







######################################################################################################################################
########################################
# STEP 9: Create Total rows


df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)


  

# Create Total rows
new_rows = []
columns_to_sum = ['estimated_cost_x_1000', 'escalated_cost_x_1000', 'total_estimated_cost_x_1000_escalated', 'adnu_cost_rate_x_1000_escalated']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    print(f"\nProcessing q_id: {q_id}")  # Debug print
    unique_upgrades = group['type_of_upgrade'].dropna().unique()
    for upgrade in unique_upgrades:
        if pd.isna(upgrade):
            continue
        
        rows = group[group['type_of_upgrade'] == upgrade]

        # Debug: Print current group
        print(f"\nChecking Upgrade: {upgrade}, Total Rows Present?:", 
              ((group['type_of_upgrade'] == f"Total {upgrade}") & (group['item'] == 'no')).any())

        # Check if a Total row already exists for this (q_id, upgrade)
        total_exists = ((group['type_of_upgrade'] == f"Total {upgrade}") & (group['item'] == 'no')).any()
        
        if total_exists:
            print(f"Skipping Total row for {upgrade} (already exists).")
            continue
        
        total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings
        total_row['q_id'] = q_id
        total_row['type_of_upgrade'] = f"Total {upgrade}"
        total_row['item'] = 'no'

        # Populate specified columns from the existing row
        first_row = rows.iloc[0]
        for col in columns_to_populate:
            if col in df.columns:
                total_row[col] = first_row[col]

        # Sum the numeric columns
        for col in columns_to_sum:
            if col in rows.columns:
                total_row[col] = rows[col].sum()
            else:
                total_row[col] = 0  # Default to 0 if column is missing

        print(f"Creating Total row for {upgrade}")  # Debug print
        new_rows.append(total_row)

# Convert list to DataFrame and append
if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    print("\nNew Total Rows Created:\n", total_rows_df)  # Debug print
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)


if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


df = stable_sort_by_type_of_upgrade(df)

 


#: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    item_index = cols.index('item')
    type_index = cols.index('type_of_upgrade')
    if item_index < type_index:
        cols.insert(type_index + 1, cols.pop(item_index))
    else:
        cols.insert(type_index + 1, cols.pop(item_index))
    df = df[cols]




#  Remove "Total" values from cost_allocation_factor if they appear in type_of_upgrade
if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if (
            pd.notna(row['type_of_upgrade']) and 'Total' in str(row['type_of_upgrade'])
        ) else row.get('cost_allocation_factor'),
        axis=1
    )
    

if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if 'Total' in str(row.get('cost_allocation_factor', '')) else row.get('cost_allocation_factor'),
        axis=1
    )



def clean_estimated_time(value):
    if isinstance(value, str):
        value = re.sub(r'(\d+(?:-\w+)*)\s+\w+.*$', r'\1', value, flags=re.IGNORECASE).strip()
    return value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

numeric_columns = [
    'cost_allocation_factor',
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
    'total_estimated_cost_x_1000_escalated',
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
    'estimated_cost_x_1000_escalated_without_itcca',
    'adnu_cost_rate_x_1000_escalated'
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


   


df= reorder_columns(df)


# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_itemized.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    totals_df = df[df['item'] == 'no'].drop(columns=totals_columns, errors='ignore')
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_total.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_2_cluster_14_style_Q_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_2_cluster_14_style_Q_total.csv'.")



if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())


#df.to_csv('Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 14/03_raw/rawdata_cluster14_style_Q.csv')
#



Processing q_id: 1832

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 2153

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 2157

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 2161

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 2162

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Checking Upgrade: CANU, Total Rows Present?: False
Creating Total row for CANU

Processing q_id: 2165

Checking Upgrade: PTO_IF, Total Rows Present?: False
Cr

  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df = df.groupby('q_id', group_keys=False).apply(ffill_qid)
  df.loc[mask, time_col] = times


# Addendum

In [95]:
import pandas as pd
import re
import unicodedata
import numpy as np

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_rawdata_cluster14_style_Q_addendums.csv', dtype={'estimated_time_to_construct': str})

df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')
df['q_id'] = df['q_id'].replace(1170, 2185)


######################################################################################################################################
########################################
# STEP 0: CREATE DESCRIPTION COLUMN FROM COST ALLOCATION FACTOR


def move_non_numeric_text(value):
    """Move non-numeric, non-percentage text from cost allocation factor to description.
       If a value is moved, return None for cost allocation factor."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return value  # Keep numeric or percentage values
        return None  # Clear the value if it's text (moved to description)
    return value  # Return as is for non-string values


def extract_non_numeric_text(value):
    """Extract non-numeric, non-percentage text from the cost allocation factor column."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return None
        return value.strip()  # Return text entries as is
    return None  # Return None for non-string values



def clean_total_entries(value):
    """If the value starts with 'Total', remove numbers, commas, and percentage signs, keeping only 'Total'."""
    if isinstance(value, str) and value.startswith("Total"):
        return "Total"  # Keep only "Total"
    return value  # Leave other values unchanged


def move_dollar_values(df, src_col, dst_col):
    """
    For each row in df:
      – If df[src_col] is exactly a dollar amount (e.g. "$3625.89", "$3300"), 
        move that string into df[dst_col] and set df[src_col] = NaN.
      – Otherwise, leave both cols alone.
    """
    # Ensure both columns exist
    if src_col not in df or dst_col not in df:
        raise KeyError(f"Columns {src_col!r} or {dst_col!r} not in DataFrame")

    # Define mask of “pure” dollar amounts
    is_dollar = df[src_col].astype(str).str.match(r"^\$\d+(?:\.\d{1,2})?$")

    # Move
    df.loc[is_dollar, dst_col] = df.loc[is_dollar, src_col]
    df.loc[is_dollar, src_col] = np.nan

    return df


#df = move_dollar_values(df, 
#                        src_col='type of upgrade', 
#                        dst_col='allocated cost')


 


df = df[~df.apply(lambda row: row.astype(str).isin(["Allocated","Cost", "(Escalated","$k)","(Note 1)", "Total NU", "Cost (2023", "Project", "Allocation", " Upgrade", "Total NU Cost (2023 $k)" ]).any(), axis=1)]




# Create the 'description' column from 'cost allocation factor'
#if 'cost allocation factor' in df.columns:
#    df['description'] = df['cost allocation factor'].apply(extract_non_numeric_text)
#    df['cost_allocation_factor'] = df['cost allocation factor'].apply(move_non_numeric_text)  # Clear moved values

######################################################################################################################################
########################################
#STEP 1 MERGE COLUMNS

def merge_columns(df):
    merge_columns_dict = {

        "upgrade": [
            "upgrade",
            "project",
            "unnamed_2" ],

            "estimated_cost_x_1000": [

            "estimated cost x 1000",
            "estimated cost x 1000 constant",
            "allocated_cost",
            "assigned cost",
            "allocated cost",
            "sum of allocated constant cost",
            "column13",
            "column17",
            "allocated estimated cost",
            "allocated cost (2023 k)",
        ],    


        "escalated_cost_x_1000": [
            "escalated costs x 1000",
            "estimated cost x 1000 escalated",
            "allocated cost escalated",
            "estimated cost x 1000 escalated without itcca",
            "escalated cost x 1000",

            "sum of allocated escalated cost",
            "assigned cost escalated",
             "column19",
             "column23",
             "estimated cost",
             "estimated cost (escalated k) (note 1)",
             "allocated estimated cost (escalated k) (note 1)",
             "allocated cost (escalated k) (note 1)",


        ],

        "estimated_time_to_construct": [
            "estimated time to construct",
            "estimated time  to construct",
             "column27",
             "column25",
             "column29",
             "estimated time to construct (months) (note 3)",


        ],

        "total_estimated_cost_x_1000": [
            "total nu cost",
            "total cost constant",
            "column5",
            "total nu cost (2023 k)",
        ],
        "total_estimated_cost_x_1000_escalated": [
            "total estimated cost x 1000 escalalted",
            "total estimated cost x 1000 escalated",
            "total nu cost (escalated k) (note 1)",
        ],
       
        "adnu_cost_rate_x_1000": [
            "adnu cost rate x 1000",
            "cost rate x 1000",
            "cost rate",
            "cost rate constant"
        ],

        "adnu_cost_rate_escalated_x_1000": [
        "cost rate escalated",
        ],

        "description": ["description"],

        "capacity": [
            "capacity",
            "project size",
            "project mw",
            "mw at poi"
        ],
        "cost_allocation_factor": [
            "cost allocation factor",
            "cost allocatio n factor",
            "cost allocati on factor",
            "project allocation",
            "column7",
            "column11"

        ],
        "estimated cost x 1000 escalated with itcca": [
            "estimated cost x 1000 escalated with itcca",

        ]
    }

    # Identify unnamed columns
    unnamed_columns = [col for col in df.columns if pd.isna(col) or col.strip() == "" or col.startswith("Unnamed")]
    if unnamed_columns:
        merge_columns_dict["description"].extend(unnamed_columns)

    for new_col, old_cols in merge_columns_dict.items():
        existing_cols = [col for col in old_cols if col in df.columns]
        if existing_cols:
            df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
            cols_to_drop = [col for col in existing_cols if col != new_col]
            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)

    return df

df = merge_columns(df)


######################################################################################################################################
########################################
# STEP 2: REMOVE DOLLAR SIGNED VALUES FROM 'estimated_time_to_construct'
######## Other clean up

def remove_dollar_values(value):
    """Remove dollar amounts (e.g., $3625.89, $3300) from 'estimated_time_to_construct'."""
    if isinstance(value, str) and re.search(r"^\$\d+(\.\d{1,2})?$", value.strip()):
        return None  # Replace with None if it's a dollar-signed number
    return value.strip() if isinstance(value, str) else value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(remove_dollar_values)

  


## Remove ranodm number in Total row:    
# Apply cleaning function to "upgrade" column after merging
if 'upgrade' in df.columns:
    df['upgrade'] = df['upgrade'].apply(clean_total_entries)


#if 'cost_allocation_factor' in df.columns:
#    df['description'] = df['cost_allocation_factor'].apply(extract_non_numeric_text)
#    df['cost_allocation_factor'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values    


# Clear cost_allocation_factor for rows where upgrade is "Total" (case-insensitive)
df.loc[df['upgrade'].str.lower() == 'total', 'cost_allocation_factor'] = None  # or use "" if you prefer an empty string


 



 
    
######################################################################################################################################
########################################
# STEP 3: DROP UNNEEDED COLUMNS

#df.drop(['unnamed_3', 'unnamed_15', 'unnamed_18', 'unnamed_16', 'estimated cost x 1000 escalated with itcca'], axis=1, inplace=True, errors='ignore')

cols = [
    'total nu', 'cost', 'estimated', 'allocated', 'allocated cost  with itcca',
    '2154', 'estimated cost x', 'estimated cost x 1000 escalated with itcca',
    'ks remy', 'estimated time to', '2161', 'alisa solar energy complex',
    'column21', 'column3', 'column9', 'column15', 'column33', 'column31',
    'cost allocation'
]

df.drop(columns=cols, errors='ignore', inplace=True)




######################################################################################################################################
########################################
#STEP 4: NAMING CONVENTION
def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value



df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade']= df['type_of_upgrade'].apply(remove_dollar_values)




def clean(value):
    """
    Cleans a string by removing:
      - Dollar signs: $
      - Asterisks: *
      - Any parenthetical notes like "(Note 2)" or "(Notes 6 and 7)"
      - Any other parenthesized content
    Then attempts to convert to a numeric (float). Returns pd.NA if invalid.
    """
    if isinstance(value, str):
    
        # 2) remove any "(Note ...)" patterns
        value = re.sub(r'\(Note[s]?\s*\d+(?:\s*and\s*\d+)*\)', '', value)
        # 3) remove any other parentheses and their contents
        value = re.sub(r'\(.*?\)', '', value)
        # 4) strip leading/trailing whitespace
        value = value.strip()

    try:
        # Try numeric conversion
        return value
    except (ValueError, TypeError):
        return pd.NA
    
df['description'] = df['description'].apply(clean)  

df['cost_allocation_factor'] = df['cost_allocation_factor'].apply(clean)

df['description'] = df['description'].apply(remove_dollar_values)



# Convert estimated_time_to_construct to integer (remove decimals) and keep NaNs as empty
df['estimated_time_to_construct'] = pd.to_numeric(df['estimated_time_to_construct'], errors='coerce').apply(lambda x: int(x) if pd.notna(x) else None)


 


def move_description_to_upgrade(df):
    """Moves description to upgrade if type_of_upgrade is 'PTO' and upgrade is empty."""
    
    # Ensure columns are strings and replace NaNs with empty strings for processing
    df['upgrade'] = df['upgrade'].astype(str).replace("nan", "").fillna("")
    df['description'] = df['description'].astype(str).replace("nan", "").fillna("")

    # Debug: Print before update
    #print("Before update (only PTO rows):")
    #print(df[df['type_of_upgrade'] == 'PTO'][['type_of_upgrade', 'upgrade', 'description']])

    # Apply row-wise transformation
    def move_if_empty(row):
        if row['type_of_upgrade'] == 'PTO' and row['upgrade'].strip() == "" and row['description'].strip() != "":
            row['upgrade'] = row['description']  # Move description to upgrade
            row['description'] = None # Clear description
        return row

    df = df.apply(move_if_empty, axis=1)

    # Debug: Print after update
    #print("\nAfter update (only PTO rows):")
    #print(df[df['type_of_upgrade'] == 'PTO'][['type_of_upgrade', 'upgrade', 'description']])

    return df

# Apply function
#df = move_description_to_upgrade(df)


# Your list of upgrade phrases
upgrade_phrases = ["IRNU", "GRNU", "CANU-D", "IRNU-A", "LDNU", "CANU-GR", "PNU", "CANU"]

def simple_move_upgrades(df, desc_col='description', upg_col='upgrade'):
    # Ensure the upgrade column exists
    df[upg_col] = df.get(upg_col, None)

    # Iterate row by row
    for i, desc in df[desc_col].fillna("").items():
        for ph in upgrade_phrases:
            if ph in desc:
                # Set the upgrade column to the found phrase
                df.at[i, upg_col] = ph
                # Remove the phrase from the description
                df.at[i, desc_col] = desc.replace(ph, "").strip()
                break  # stop after the first match

    return df


df = simple_move_upgrades(df)


df = df[df['upgrade'].fillna('').astype(str).str.strip() != '']
######################################################################################################################################
########################################
#STEP 5: REMOVING TOTAL ROW, AS THE PDFS GIVE TOTAL NETWORK COST RATHER THAN BY RNU, LDNU AS WE HAD BEFORE
# Remove rows where upgrade is "Total" (case-insensitive)

df['tot'] = df.apply(
    lambda row: 'yes' if (
        (pd.notna(row.get('upgrade')) and 'Total' in str(row['upgrade']))  
    ) else 'no',
    axis=1
) 

# Now extract ONLY "Total" rows with a foolproof match
total_rows_df = df[df['tot'] == 'yes']

total_rows_df = total_rows_df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()
total_rows_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_total_network_addendum.csv', index=False) 
df = df[df['upgrade'].str.strip().str.lower() != 'total']
df.drop('tot', axis=1, inplace= True)




######################################################################################################################################
########################################
# STEP 6: Move upgrade phrases like IRNU from upgrade column to a new column upgrade_classificatio and also replace type_of_upgrade with LDNU, CANU



# Define the list of phrases for upgrade classification
upgrade_phrases = ["IRNU", "GRNU", "CANU-D", "IRNU-A", "LDNU", "CANU-GR", "PNU", "CANU"]




def classify_and_fill_upgrades(df):
    """Moves upgrade phrases to upgrade_classification, forward-fills (including type_of_upgrade), and removes redundant rows."""

    # 1) Normalize
    df['upgrade'] = df['upgrade'].astype(str).fillna("")
    df['upgrade_classification'] = None

    # 2) Mark the phrase‐rows
    df.loc[df['upgrade'].isin(upgrade_phrases), 'upgrade_classification'] = df['upgrade']

    # 3) Single pass per‐q_id ffill
    def ffill_qid(group):
        current = None
        for idx in group.index:
            up = group.at[idx, 'upgrade'].strip()

            if up in upgrade_phrases:
                current = up
            elif up.lower() == "total":
                current = None

            # fill classification
            group.at[idx, 'upgrade_classification'] = current

            # forward‐fill type_of_upgrade whenever it’s blank
            if not group.at[idx, 'type_of_upgrade'] or pd.isna(group.at[idx, 'type_of_upgrade']):
                group.at[idx, 'type_of_upgrade'] = current or ""

        return group

    df = df.groupby('q_id', group_keys=False).apply(ffill_qid)

    # 4) drop the original phrase‐marker rows
    df = df[~df['upgrade'].isin(upgrade_phrases)]

    # 5) finally, coerce the “master” classification back into type_of_upgrade
    df.loc[df['upgrade_classification'] == 'LDNU',                     'type_of_upgrade'] = 'LDNU'
    df.loc[df['upgrade_classification'].isin(['CANU-GR','CANU']),      'type_of_upgrade'] = 'CANU'
    df.loc[df['upgrade_classification'] == 'PNU',                      'type_of_upgrade'] = 'PNU'
    df.loc[df['upgrade_classification'].isin(['IRNU','GRNU','IRNU-A']), 'type_of_upgrade'] = 'RNU'

    return df

# Usage
df = classify_and_fill_upgrades(df)



df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/ph2_cluster_14_style_Q_addendum.csv', index=False)  



mappings = {
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "IRNU": 'RNU',
'Total PTO_IF': 'PTO_IF',
 'Total RNU': 'RNU',
 'Total LDNU': 'LDNU',
 'Total OPNU' : 'OPNU',
 'Total CANU': 'CANU',
 'Total LOPNU': 'LOPNU',
 'Total ADNU': 'ADNU',
 "Total IRNU": 'RNU',
}


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df     


if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )



######################################################################################################################################
########################################
#STEP 7: Stable sort type of upgrade

def stable_sort_by_type_of_upgrade(df):
    """Performs a stable sort within each q_id to order type_of_upgrade while preserving row order in other columns."""
    
    # Define the custom sorting order for type_of_upgrade
    type_order = {"PTO_IF": 1, "RNU": 2, "LDNU": 3, "PNU": 4, "ADNU": 5}

    # Assign a numerical sorting key; use a high number if type_of_upgrade is missing
    df['sort_key'] = df['type_of_upgrade'].map(lambda x: type_order.get(x, 99))

    # Perform a stable sort by q_id first, then by type_of_upgrade using the custom order
    df = df.sort_values(by=['q_id', 'sort_key'], kind='stable').drop(columns=['sort_key'])

    return df

# Apply stable sorting
  


df = df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()


df= reorder_columns(df)

df = stable_sort_by_type_of_upgrade(df)  
df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/03_raw/cluster_14_style_Q_addendum.csv', index=False)
######################################################################################################################################
########################################
# STEP 8: Remove $ signs and convert to numeric
 


def clean_currency(value):
    """
    Cleans a string by explicitly removing $, *, (Note 2), and similar patterns,
    then converts it to a numeric value.
    """
    if isinstance(value, str):
        # Explicitly remove $, *, and any "(Note ...)"
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
        value = value.replace(',', '').strip()  # Remove commas and extra spaces
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA  # Return NaN for invalid entries
    


def clean_currency(value):
    """
    Cleans a string by removing:
      - Dollar signs ($)
      - Asterisks (*)
      - Any parenthesized content "(...)" (Notes or otherwise)
      - Commas
    Then converts to a numeric float, returning pd.NA on failure.
    """
    if isinstance(value, str):
        # 1) Remove $ and *
        value = value.replace('$', '').replace('*', '')
        # 2) Remove anything in parentheses, e.g. "(Note 6 and 7)"
        value = re.sub(r'\([^)]*\)', '', value)
        # 3) Remove commas and trim spaces
        value = value.replace(',', '').strip()
    # 4) Convert to numeric, coercing invalid to NaN
    return pd.to_numeric(value, errors='coerce')    
    




# Clean the specific columns
for col in ['estimated_cost_x_1000', 'escalated_cost_x_1000', 'total_estimated_cost_x_1000_escalated', 'total_estimated_cost_x_1000', 'adnu_cost_rate_x_1000', 'adnu_cost_rate_x_1000_escalated']:
    if col in df.columns:
        df[col] = df[col].apply(clean_currency)






######################################################################################################################################
########################################
# STEP 9: Create Total rows


df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)


  

# Create Total rows
new_rows = []
columns_to_sum = ['estimated_cost_x_1000', 'escalated_cost_x_1000', 'total_estimated_cost_x_1000_escalated', 'adnu_cost_rate_x_1000_escalated']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    print(f"\nProcessing q_id: {q_id}")  # Debug print
    unique_upgrades = group['type_of_upgrade'].dropna().unique()
    for upgrade in unique_upgrades:
        if pd.isna(upgrade):
            continue
        
        rows = group[group['type_of_upgrade'] == upgrade]

        # Debug: Print current group
        print(f"\nChecking Upgrade: {upgrade}, Total Rows Present?:", 
              ((group['type_of_upgrade'] == f"Total {upgrade}") & (group['item'] == 'no')).any())

        # Check if a Total row already exists for this (q_id, upgrade)
        total_exists = ((group['type_of_upgrade'] == f"Total {upgrade}") & (group['item'] == 'no')).any()
        
        if total_exists:
            print(f"Skipping Total row for {upgrade} (already exists).")
            continue
        
        total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings
        total_row['q_id'] = q_id
        total_row['type_of_upgrade'] = f"Total {upgrade}"
        total_row['item'] = 'no'

        # Populate specified columns from the existing row
        first_row = rows.iloc[0]
        for col in columns_to_populate:
            if col in df.columns:
                total_row[col] = first_row[col]

        # Sum the numeric columns
        for col in columns_to_sum:
            if col in rows.columns:
                total_row[col] = rows[col].sum()
            else:
                total_row[col] = 0  # Default to 0 if column is missing

        print(f"Creating Total row for {upgrade}")  # Debug print
        new_rows.append(total_row)

# Convert list to DataFrame and append
if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    print("\nNew Total Rows Created:\n", total_rows_df)  # Debug print
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)


if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


df = stable_sort_by_type_of_upgrade(df)

 


#: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    item_index = cols.index('item')
    type_index = cols.index('type_of_upgrade')
    if item_index < type_index:
        cols.insert(type_index + 1, cols.pop(item_index))
    else:
        cols.insert(type_index + 1, cols.pop(item_index))
    df = df[cols]




#  Remove "Total" values from cost_allocation_factor if they appear in type_of_upgrade
if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if (
            pd.notna(row['type_of_upgrade']) and 'Total' in str(row['type_of_upgrade'])
        ) else row.get('cost_allocation_factor'),
        axis=1
    )
    

if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if 'Total' in str(row.get('cost_allocation_factor', '')) else row.get('cost_allocation_factor'),
        axis=1
    )



def clean_estimated_time(value):
    if isinstance(value, str):
        value = re.sub(r'(\d+(?:-\w+)*)\s+\w+.*$', r'\1', value, flags=re.IGNORECASE).strip()
    return value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

numeric_columns = [
    'cost_allocation_factor',
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
    'total_estimated_cost_x_1000_escalated',
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
    'estimated_cost_x_1000_escalated_without_itcca',
    'adnu_cost_rate_x_1000_escalated'
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


   


df= reorder_columns(df)


# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_itemized_addendums.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    totals_df = df[df['item'] == 'no'].drop(columns=totals_columns, errors='ignore')
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_total_addendums.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_2_cluster_14_style_Q_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_2_cluster_14_style_Q_total.csv'.")



if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())


#df.to_csv('Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 14/03_raw/rawdata_cluster14_style_Q.csv')


Processing q_id: 2177

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Checking Upgrade: OPNU, Total Rows Present?: False
Creating Total row for OPNU

Processing q_id: 2181

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: OPNU, Total Rows Present?: False
Creating Total row for OPNU

Processing q_id: 2188

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

New Total Rows Created:
    q_id  cluster req_deliverability   latitude   longitude  capacity  \
0  2177       14               Full  32.759906 -116.287100       NaN   
1  2177       14               Full

  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df = df.groupby('q_id', group_keys=False).apply(ffill_qid)


# Merge addendum and orginal 

In [96]:
import pandas as pd
import numpy as np

def load_data(file_path, char_columns):
    """
    Load a CSV file and ensure specific columns are treated as character, others as numeric.
    """
 # Get columns available in the dataset
    available_columns = pd.read_csv(file_path, nrows=0).columns
    
    # Restrict to char_columns that are present in the dataset
    char_columns_in_dataset = [col for col in char_columns if col in available_columns]
    
    # Load the dataset, treating char_columns_in_dataset as strings
    df = pd.read_csv(
        file_path,
        dtype={col: str for col in char_columns_in_dataset},
        na_values=[],  # Disable automatic NaN interpretation
        keep_default_na=False  # Prevent treating "None" as NaN
    )

    
    
    
    # Convert all other columns to numeric
    #for col in df.columns:
    #    if col not in char_columns:
    #        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
    
    return df

def save_data(df, file_path, char_columns):
    """
    Save a dataframe to a CSV file, ensuring specific columns are treated as character.
    """
    for col in char_columns:
        if col in df.columns:
            df[col] = df[col].astype(str)
    df.to_csv(file_path, index=False)

def merge_with_addendums(itemized, itemized_addendums, total, total_addendums):
    # Add an 'original' column to the datasets
    itemized['original'] = "yes"
    total['original'] = "yes"
    
    # Preserve the original row order
    itemized['row_order'] = pd.to_numeric(itemized.index, errors="coerce")
    total['row_order'] = pd.to_numeric(total.index, errors="coerce")
    
    # Ensure q_id is numeric for comparison
    itemized['q_id'] = pd.to_numeric(itemized['q_id'], errors="coerce")
    itemized_addendums['q_id'] = pd.to_numeric(itemized_addendums['q_id'], errors="coerce")
    total['q_id'] = pd.to_numeric(total['q_id'], errors="coerce")
    total_addendums['q_id'] = pd.to_numeric(total_addendums['q_id'], errors="coerce")
    
    # Columns for conditional replacement
    conditional_columns = ["req_deliverability", "latitude", "longitude", "capacity", "point_of_interconnection"]
    
    # --- Process itemized data (unchanged) ---
    updated_itemized_rows = []
    for q_id in itemized_addendums['q_id'].unique():
        for upgrade_type in itemized_addendums['type_of_upgrade'].unique():
            addendum_rows = itemized_addendums[
                (itemized_addendums['q_id'] == q_id) &
                (itemized_addendums['type_of_upgrade'] == upgrade_type)
            ]
            if not addendum_rows.empty:
                mask = (itemized['q_id'] == q_id) & (itemized['type_of_upgrade'] == upgrade_type)
                original_rows = itemized[mask]
                print(f"Processing itemized: q_id={q_id}, type_of_upgrade={upgrade_type}")
                print(f"Length of addendum_rows: {len(addendum_rows)}")
                print(f"Length of original_rows: {len(original_rows)}")
                # For specified columns, replace if addendum values are non-empty
                for col in conditional_columns:
                    if col in addendum_rows.columns and col in original_rows.columns:
                        addendum_rows[col] = addendum_rows[col].replace("", pd.NA)
                        addendum_rows[col] = addendum_rows[col].combine_first(original_rows[col].reset_index(drop=True))
                        addendum_rows[col] = addendum_rows[col].fillna("")
                # Align lengths
                original_rows = original_rows.reset_index(drop=True)
                addendum_rows = addendum_rows.reset_index(drop=True)
                if len(addendum_rows) > len(original_rows):
                    extra_rows = pd.DataFrame({col: pd.NA for col in original_rows.columns},
                                              index=range(len(addendum_rows) - len(original_rows)))
                    original_rows = pd.concat([original_rows, extra_rows], ignore_index=True)
                elif len(addendum_rows) < len(original_rows):
                    original_rows = original_rows.iloc[:len(addendum_rows)].reset_index(drop=True)
                itemized.loc[mask, 'original'] = "no"
                updated_itemized_rows.append(
                    addendum_rows.assign(original="no", row_order=original_rows['row_order'].values[:len(addendum_rows)])
                )
                itemized = itemized[~mask]
    if updated_itemized_rows:
        updated_itemized = pd.concat([itemized] + updated_itemized_rows, ignore_index=True)
    else:
        updated_itemized = itemized.copy()
    updated_itemized["row_order"] = pd.to_numeric(updated_itemized["row_order"], errors="coerce").fillna(-1).astype(int)
    updated_itemized = updated_itemized.sort_values(by="row_order").drop(columns=["row_order"]).reset_index(drop=True)
    
    # --- Process total data ---
    updated_total_rows = []
    for q_id in total_addendums['q_id'].unique():
        for upgrade_type in total_addendums['type_of_upgrade'].unique():
            addendum_row = total_addendums[
                (total_addendums['q_id'] == q_id) &
                (total_addendums['type_of_upgrade'] == upgrade_type)
            ]
            if not addendum_row.empty:
                mask = (total['q_id'] == q_id) & (total['type_of_upgrade'] == upgrade_type)
                original_row = total[mask]
                print(f"Processing total: q_id={q_id}, type_of_upgrade={upgrade_type}")
                # If no matching original row exists, create a default row_order column
                if original_row.empty:
                    original_row = pd.DataFrame({'row_order': [pd.NA] * len(addendum_row)}, index=addendum_row.index)
                else:
                    original_row = original_row.reset_index(drop=True)
                addendum_row = addendum_row.reset_index(drop=True)
                if len(addendum_row) > len(original_row):
                    extra_rows = pd.DataFrame({col: pd.NA for col in original_row.columns},
                                              index=range(len(addendum_row) - len(original_row)))
                    original_row = pd.concat([original_row, extra_rows], ignore_index=True)
                elif len(addendum_row) < len(original_row):
                    original_row = original_row.iloc[:len(addendum_row)].reset_index(drop=True)
                for col in conditional_columns:
                    if col in addendum_row.columns and col in original_row.columns:
                        addendum_row[col] = addendum_row[col].replace("", pd.NA)
                        addendum_row[col] = addendum_row[col].combine_first(original_row[col].reset_index(drop=True))
                        addendum_row[col] = addendum_row[col].fillna("")
                total.loc[mask, 'original'] = "no"
                updated_total_rows.append(
                    addendum_row.assign(original="no", row_order=original_row['row_order'].values[:len(addendum_row)])
                )
                total = total[~mask]
    if updated_total_rows:
        updated_total = pd.concat([total] + updated_total_rows, ignore_index=True)
    else:
        updated_total = total.copy()
    updated_total["row_order"] = pd.to_numeric(updated_total["row_order"], errors="coerce").fillna(-1).astype(int)
    updated_total = updated_total.sort_values(by="row_order").drop(columns=["row_order"]).reset_index(drop=True)
    
    # Fill missing columns with zeros in the updated datasets
    for col in set(itemized.columns) - set(updated_itemized.columns):
        updated_itemized[col] = 0
    for col in set(total.columns) - set(updated_total.columns):
        updated_total[col] = 0

    # Move the 'original' column to the last position
    updated_itemized = updated_itemized[[col for col in updated_itemized.columns if col != 'original'] + ['original']]
    updated_total = updated_total[[col for col in updated_total.columns if col != 'original'] + ['original']]
    
    if "row_order" in updated_itemized.columns:
        updated_itemized = updated_itemized.drop(columns=["row_order"]).reset_index(drop=True)
    if "row_order" in updated_total.columns:
        updated_total = updated_total.drop(columns=["row_order"]).reset_index(drop=True)
    
    return updated_itemized, updated_total


# Define the character columns
char_columns = [
    "req_deliverability", "point_of_interconnection", "type_of_upgrade",
    "upgrade", "description", "estimated_time_to_construct", "original", "item"
]




itemized = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_itemized.csv", char_columns)
itemized_addendums = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_itemized_addendums.csv", char_columns)
total = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_total.csv", char_columns)
total_addendums = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_style_Q_total_addendums.csv", char_columns)


updated_itemized, updated_total = merge_with_addendums(itemized, itemized_addendums, total, total_addendums)

# Drop the specified columns from the updated datasets
columns_to_drop = [ "upgrade_classification","estimated", "caiso_queue", "project_type", "dependent_system_upgrade"]

# For the itemized dataset
updated_itemized = updated_itemized.drop(columns=[col for col in columns_to_drop if col in updated_itemized.columns], errors='ignore')

# For the total dataset
updated_total = updated_total.drop(columns=[col for col in columns_to_drop if col in updated_total.columns], errors='ignore')



# List of columns to process with ffill and bfill
columns_to_fill = ["point_of_interconnection", "latitude", "longitude", "req_deliverability", "capacity"]

# Replace empty strings with NaN for the specified columns
for col in columns_to_fill:
    updated_itemized[col] = updated_itemized[col].replace('', np.nan)
    updated_total[col] = updated_total[col].replace('', np.nan)

# Sort by q_id while maintaining other column order (stable sorting)
updated_itemized = updated_itemized.sort_values(by=["q_id"], kind="stable").reset_index(drop=True)
updated_total = updated_total.sort_values(by=["q_id"], kind="stable").reset_index(drop=True)

# Apply forward-fill and backward-fill for the specified columns within each q_id group
for col in columns_to_fill:
    updated_itemized[col] = (
        updated_itemized.groupby("q_id")[col]
        .apply(lambda group: group.ffill().bfill())
        .reset_index(drop=True)
    )
    updated_total[col] = (
        updated_total.groupby("q_id")[col]
        .apply(lambda group: group.ffill().bfill())
        .reset_index(drop=True)
    )

# Replace NaN back with empty strings for consistency
for col in columns_to_fill:
    updated_itemized[col] = updated_itemized[col].replace(np.nan, '')
    updated_total[col] = updated_total[col].replace(np.nan, '')

 
 







# Save the updated datasets
save_data(updated_itemized, "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/01_clean/costs_phase_2_cluster_14_style_Q_itemized_updated.csv", char_columns)
save_data(updated_total, "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/01_clean/costs_phase_2_cluster_14_style_Q_total_updated.csv", char_columns)



# Save the results
save_data(updated_itemized, "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/01_clean/costs_phase_2_cluster_14_style_Q_itemized_updated.csv", char_columns)
save_data(updated_total, "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/01_clean/costs_phase_2_cluster_14_style_Q_total_updated.csv", char_columns)


Processing itemized: q_id=2177, type_of_upgrade=PTO_IF
Length of addendum_rows: 1
Length of original_rows: 1
Processing itemized: q_id=2177, type_of_upgrade=RNU
Length of addendum_rows: 3
Length of original_rows: 3
Processing itemized: q_id=2177, type_of_upgrade=OPNU
Length of addendum_rows: 3
Length of original_rows: 0
Processing itemized: q_id=2181, type_of_upgrade=PTO_IF
Length of addendum_rows: 1
Length of original_rows: 1
Processing itemized: q_id=2181, type_of_upgrade=RNU
Length of addendum_rows: 3
Length of original_rows: 3
Processing itemized: q_id=2181, type_of_upgrade=OPNU
Length of addendum_rows: 2
Length of original_rows: 2
Processing itemized: q_id=2181, type_of_upgrade=LDNU
Length of addendum_rows: 1
Length of original_rows: 1
Processing itemized: q_id=2188, type_of_upgrade=PTO_IF
Length of addendum_rows: 1
Length of original_rows: 1
Processing itemized: q_id=2188, type_of_upgrade=RNU
Length of addendum_rows: 1
Length of original_rows: 1
Processing total: q_id=2177, type_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_rows[col] = addendum_rows[col].replace("", pd.NA)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_rows[col] = addendum_rows[col].combine_first(original_rows[col].reset_index(drop=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_rows[col] = addendum_rows[col].fillna(""