In [88]:
import os
import re
import traceback
import pdfplumber
import PyPDF2
import pandas as pd

# ------------------- Configuration -------------------
BASE_DIRECTORY = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_ORIGINAL = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J_originals.csv"
OUTPUT_CSV_PATH_ADDENDUM = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J_addendums.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/scraping_cluster9_style_J_log.txt"
PROJECT_RANGE = range(1223, 1348)  # Original range #(1831, 2193)

# Read the CSV file containing processed projects (with q_id column)
processed_csv_path = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/all_clusters/costs_phase_1_all_clusters_total1.csv"  # UPDATE THIS PATH
processed_df = pd.read_csv(processed_csv_path)
# Convert q_id values to numeric then to int for filtering
processed_q_ids = pd.to_numeric(processed_df['q_id'], errors='coerce').dropna().astype(int).unique()
projects_to_process = sorted([q_id for q_id in PROJECT_RANGE if q_id not in processed_q_ids])

# ------------------- Global Tracking Variables -------------------
core_originals = pd.DataFrame()
core_addendums = pd.DataFrame()

scraped_projects = set()
skipped_projects = set()
missing_projects = set()
scraped_pdfs = []
skipped_pdfs = []
addendum_pdfs = []
original_pdfs = []
style_n_pdfs = []  # Not used in this version but kept for consistency

total_pdfs_accessed = 0
total_pdfs_scraped = 0
total_pdfs_skipped = 0

# ------------------- Helper Function for Logging -------------------
def log_msg(msg, log_file):
    """Prints a message to both the log file and console."""
    print(msg, file=log_file)
    print(msg)

# ------------------- Other Helper Functions -------------------
def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []
    for header in headers:
        if header is None:
            header = ""
        elif isinstance(header, str):
            header = header.lower()
            header = re.sub(r'\s+', ' ', header)
            header = re.sub(r'\(.*?\)', '', header)
            header = re.sub(r'[^a-zA-Z0-9\s]', '', header)
            header = header.strip()
        cleaned_headers.append(header)
    return cleaned_headers

def clean_string_cell(value):
    """Cleans string cells by removing newlines and trimming spaces."""
    if isinstance(value, str):
        return value.replace('\n', ' ').strip()
    elif value is None:
        return ""
    else:
        return str(value).replace('\n', ' ').strip()

def contains_phrase(row, phrase):
    """Checks if any cell in a row contains a specific phrase."""
    regex_pattern = re.sub(r"\s+", r"\\s*", phrase)
    pattern = re.compile(regex_pattern, flags=re.IGNORECASE)
    return row.astype(str).apply(lambda cell: bool(pattern.search(cell))).any()

def extract_specific_phrase(title):
    """
    Extracts a specific phrase from the table title based on predefined keywords.
    """
    phrases = [
        "PTO",
        "Reliability Network Upgrade",
        "Area Delivery Network Upgrade",
        "Local Delivery Network",
        "Other Potential Network Upgrade",
        "Area Delivery Network Upgrades",
        "Conditionally Assigned Network Upgrades",
        "Local Off-Peak Network Upgrade",
        "ADNU",
        "LDNU",
        "RNU"
    ]
    for phrase in phrases:
        if re.search(rf"\b{re.escape(phrase)}\b(?=\d|\W|$)", title, re.IGNORECASE):
            return phrase
    return title  # Fallback if no specific phrase is found

def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection"
    ]
    existing_desired = [col for col in desired_order if col in df.columns]
    remaining = [col for col in df.columns if col not in existing_desired]
    new_order = existing_desired + remaining
    return df[new_order]

def search_gps_coordinates(text, log_file):
    """Search for GPS coordinates using multiple patterns."""
    gps_coords = re.search(r"gps coordinates:\s*([\d\.\-]+),\s*([\d\.\-]+)", text, re.IGNORECASE)
    if gps_coords:
        log_msg(f"Found GPS coordinates: {gps_coords.groups()}", log_file)
        return gps_coords.groups()
    project_coords = re.search(r"latitude[:\s]*([\d\.\-]+)[^\d]+longitude[:\s]*([\d\.\-]+)", text, re.IGNORECASE)
    if project_coords:
        log_msg(f"Found project coordinates: {project_coords.groups()}", log_file)
        return project_coords.groups()
    gps_coords_directional = re.search(r"gps coordinates:\s*([\d\.\-]+)\s*[nNsS],\s*([\d\.\-]+)\s*[eEwW]", text, re.IGNORECASE)
    if gps_coords_directional:
        lat, lon = gps_coords_directional.groups()
        latitude = lat if "N" in text.upper() else f"-{lat}"
        longitude = lon if "E" in text.upper() else f"-{lon}"
        log_msg(f"Found directional GPS coordinates: {(latitude, longitude)}", log_file)
        return (latitude, longitude)
    log_msg("GPS coordinates not found.", log_file)
    return (None, None)

# ------------------- Appendix PDF Check -------------------
def is_appendix_pdf(pdf_path):
    """Returns True if the first page of the PDF contains 'Appendix A'."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                return False
            first_page_text = pdf.pages[0].extract_text() or ""
            return "Appendix A" in first_page_text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

# ------------------- Base Data & Table 1 Extraction (Appendix A Only) -------------------
def extract_table1(pdf_path, log_file):
    """
    Extracts the Point of Interconnection from Table 1 in the provided PDF.
    This function is intended to run only on the Appendix A PDF.
    Now it searches for pages containing "Table A.2", "Table B.2" or "Table C.2"
    and within the tables it looks for either "Point of Interconnection" or "POI".
    """
    print(f"\nProcessing {pdf_path} for Table 1 extraction...", file=log_file)
    point_of_interconnection = None
    # Modified to match either "Point of Interconnection" or "POI"
    poi_pattern = re.compile(r"(Point\s+of\s+Interconnection|POI)", re.IGNORECASE)
    table_settings_list = [
        {"horizontal_strategy": "text", "vertical_strategy": "lines", "snap_tolerance": 1},
        {"horizontal_strategy": "lines", "vertical_strategy": "lines", "snap_tolerance": 2}
    ]
    try:
        with pdfplumber.open(pdf_path) as pdf:
            table1_pages = []
            # Modified regex: look for "Table A.2", "Table B.2" or "Table C.2"
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"Table\s*[ABC]\.1\b", text, re.IGNORECASE):
                    table1_pages.append(i)
            if not table1_pages:
                print("No Table 1 found in the PDF.", file=log_file)
                return None
            first_page = table1_pages[0]
            last_page = table1_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 2
            print(f"Table 1 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)
            extraction_successful = False
            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table 1...", file=log_file)
                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"Attempt {attempt} with settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1}", file=log_file)
                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty.", file=log_file)
                            continue
                        for row_index, row in enumerate(tab, start=1):
                            for cell_index, cell in enumerate(row, start=1):
                                if cell and poi_pattern.search(cell):
                                    poi_col_index = cell_index  # 1-based index
                                    adjacent_col_index = poi_col_index + 1
                                    if adjacent_col_index <= len(row):
                                        poi_value = clean_string_cell(row[adjacent_col_index - 1])
                                        if poi_value:
                                            point_of_interconnection = poi_value
                                            print(f"Found POI: '{point_of_interconnection}' (Page {page_number + 1}, Table {table_index}, Row {row_index})", file=log_file)
                                            extraction_successful = True
                                            break
                                        else:
                                            print(f"POI label found but adjacent value empty (Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                            poi_value_parts = []
                                            current_row_idx = row_index - 1
                                            start_scan = max(0, current_row_idx - 2)
                                            end_scan = min(len(tab), current_row_idx + 3)
                                            for scan_row_index in range(start_scan, end_scan):
                                                if scan_row_index == current_row_idx:
                                                    continue
                                                scan_row = tab[scan_row_index]
                                                if adjacent_col_index - 1 < len(scan_row):
                                                    scan_cell = clean_string_cell(scan_row[adjacent_col_index - 1])
                                                    if scan_cell and not poi_pattern.search(scan_cell):
                                                        poi_value_parts.append(scan_cell)
                                            if poi_value_parts:
                                                point_of_interconnection = " ".join(poi_value_parts)
                                                print(f"Concatenated POI: '{point_of_interconnection}'", file=log_file)
                                                extraction_successful = True
                                                break
                                    else:
                                        print(f"POI label found but no adjacent column (Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                            if extraction_successful:
                                break
                        if extraction_successful:
                            break
                    if extraction_successful:
                        break
                if extraction_successful:
                    break
    except Exception as e:
        print(f"Error processing Table 1 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None
    if not extraction_successful:
        if point_of_interconnection is not None and point_of_interconnection != "":
            print("POI label found but no value.", file=log_file)
            return "Value Missing"
        else:
            print("POI not found in Table 1.", file=log_file)
            return None
    return point_of_interconnection


def extract_base_data(pdf_path, project_id, log_file):
    """
    Extracts base data from the Appendix A PDF.
    (This function is meant to run only on a PDF verified as an Appendix A PDF.)
    """
    if not is_appendix_pdf(pdf_path):
        log_msg(f"Skipping base extraction because {pdf_path} is not an Appendix A PDF.", log_file)
        return pd.DataFrame()
    log_msg("Extracting base data from Appendix A PDF...", log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
        text = clean_string_cell(text)
        #queue_id = re.search(r"q[\s_-]*(\d+)", text, re.IGNORECASE)
        #queue_id = queue_id.group(1) if queue_id else str(project_id)
        queue_id = str(project_id)
        log_msg(f"Extracted Queue ID: {queue_id}", log_file)
        #clusters = re.findall(r"queue[\s_-]*cluster[\s_-]*(\d+)", text, re.IGNORECASE)
        #if '14' in clusters:
        #    cluster_number = '14'
        #elif clusters:
        #    cluster_number = max(clusters, key=lambda x: int(x))
        #else:
        #   cluster_number = '14'
        clusters = re.findall(r"queue[\s_-]*cluster[\s_-]*(\d+)", text, re.IGNORECASE)
        cluster_number = '9'    
        log_msg(f"Extracted Cluster Number: {cluster_number}", log_file)
        deliverability_status = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = deliverability_status.group(1) if deliverability_status else None
        log_msg(f"Extracted Deliverability Status: {deliverability_status}", log_file)
        capacity = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if capacity:
            capacity = int(capacity.group(1))
        else:
            capacity2 = re.search(r"(\d+)\s*mw", text)
            capacity = int(capacity2.group(1)) if capacity2 else None
        log_msg(f"Extracted Capacity: {capacity}", log_file)
        poi_value = extract_table1(pdf_path, log_file)
        latitude, longitude = search_gps_coordinates(text, log_file)
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "latitude": [latitude],
            "longitude": [longitude],
            "capacity": [capacity],
            "point_of_interconnection": [poi_value]
        }
        log_msg("Base data extracted:", log_file)
        log_msg(str(base_data), log_file)
        return pd.DataFrame(base_data)
    except Exception as e:
        log_msg(f"Error extracting base data from {pdf_path}: {e}", log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

# ------------------- Attachment 2 Processing & Merging -------------------
# compile once at module scope for a tiny speed boost
_ATTACHMENT2_PATTERN = re.compile(r'Attachment\s*\W*\s*2', re.IGNORECASE)

def is_attachment2_pdf(pdf_path):
    """
    Returns True if the first page of the PDF contains something like
    'Attachment 2', 'Attachment#2', 'Attachment - 2', 'attachment@2', etc.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                return False
            text = pdf.pages[0].extract_text() or ""
            return bool(_ATTACHMENT2_PATTERN.search(text))
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False

_ADDENDUM_PATTERN = re.compile(r'(Addendum|Revision)\s*\W*\s*\d+', re.IGNORECASE)


def is_addendum_pdf(pdf_path):
    """
    Returns True if the first page of the PDF contains 'Addendum' or 'Revision', optionally
    followed by a separator (colon, -, @, or #) and a number (e.g., "Addendum #1", "Addendum: 2", or "Revision-1").
    Uses regex matching for robustness.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                return False
            text = pdf.pages[0].extract_text() or ""
            return bool(_ADDENDUM_PATTERN.search(text))
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False
    
    




def make_unique_headers(headers):
    """
    Appends a suffix to duplicate headers to make them unique.
    """
    seen = {}
    unique_headers = []
    for header in headers:
        if header in seen:
            seen[header] += 1
            unique_headers.append(f"{header}_{seen[header]}")
        else:
            seen[header] = 1
            unique_headers.append(header)
    return unique_headers

 

def extract_first_table(pdf_path, log_file):
    """
    Extracts the *first* non-empty table from the entire PDF.
    Then, on the *same page* where that first table was found, searches for:
      - A table that contains "Other Potential" (as before)
      - A table that contains "Area Delivery Network Upgrades"
    If the ADN table is not found on the same page, it checks the next page.
    Finally, it merges any extra tables found with the first table row-wise.
    """
    import pdfplumber
    import pandas as pd
    
    first_table_df = pd.DataFrame()
    found_page_index = None

    try:
        # 1) Find the *first* non-empty table in the PDF
        with pdfplumber.open(pdf_path) as pdf:
            for page_index, page in enumerate(pdf.pages):
                tables = page.extract_tables()
                for table in tables:
                    if table and any(any(cell and cell.strip() for cell in row) for row in table):
                        df = pd.DataFrame(table)
                        if not df.empty:
                            headers = df.iloc[0].tolist()
                            headers = make_unique_headers(headers)
                            df.columns = headers
                            df = df[1:].reset_index(drop=True)
                            # Remove duplicate column names (safeguard)
                            df = df.loc[:, ~df.columns.duplicated()]
                            first_table_df = df
                            found_page_index = page_index
                            break  # Stop looking at more tables on this page
                if found_page_index is not None:
                    break  # Stop after the first table is found

            # If no table was found at all, just return empty DataFrame
            if first_table_df.empty:
                return first_table_df

            # 2) On that page (and potentially the next page) look for extra tables:
            #    one containing "Other Potential" and one containing "Area Delivery Network Upgrades"
            other_table_df = pd.DataFrame()
            adn_table_df = pd.DataFrame()
            if found_page_index is not None:
                page = pdf.pages[found_page_index]
                tables = page.extract_tables()
                for table in tables:
                    if table and any(any(cell and cell.strip() for cell in row) for row in table):
                        # Check for "Other Potential" and "Area Delivery Network Upgrades"
                        found_other = any(
                            cell and "other potential" in cell.lower() 
                            for row in table for cell in row
                        )
                        found_adn = any(
                            cell and "area delivery network upgrades" in cell.lower() 
                            for row in table for cell in row
                        )
                        if found_other and other_table_df.empty:
                            df_other = pd.DataFrame(table)
                            if not df_other.empty:
                                headers = df_other.iloc[0].tolist()
                                headers = make_unique_headers(headers)
                                df_other.columns = headers
                                df_other = df_other[1:].reset_index(drop=True)
                                df_other = df_other.loc[:, ~df_other.columns.duplicated()]
                                other_table_df = df_other
                        if found_adn and adn_table_df.empty:
                            df_adn = pd.DataFrame(table)
                            if not df_adn.empty:
                                headers = df_adn.iloc[0].tolist()
                                headers = make_unique_headers(headers)
                                df_adn.columns = headers
                                df_adn = df_adn[1:].reset_index(drop=True)
                                df_adn = df_adn.loc[:, ~df_adn.columns.duplicated()]
                                adn_table_df = df_adn
                        if not other_table_df.empty and not adn_table_df.empty:
                            break

                # If ADN table not found on the same page, check the next page (if available)
                if adn_table_df.empty and found_page_index + 1 < len(pdf.pages):
                    next_page = pdf.pages[found_page_index + 1]
                    tables = next_page.extract_tables()
                    for table in tables:
                        if table and any(any(cell and cell.strip() for cell in row) for row in table):
                            found_adn = any(
                                cell and "area delivery network upgrades" in cell.lower()
                                for row in table for cell in row
                            )
                            if found_adn:
                                df_adn = pd.DataFrame(table)
                                if not df_adn.empty:
                                    headers = df_adn.iloc[0].tolist()
                                    headers = make_unique_headers(headers)
                                    df_adn.columns = headers
                                    df_adn = df_adn[1:].reset_index(drop=True)
                                    df_adn = df_adn.loc[:, ~df_adn.columns.duplicated()]
                                    adn_table_df = df_adn
                                    break

            # 3) Merge the tables if any extra tables were found
            # 3) Vertically stack each table — preserving separate prefixes — so values never share a row
            dfs = [first_table_df]

            if not other_table_df.empty:
                dfs.append(other_table_df.add_prefix("OtherPotential_"))

            if not adn_table_df.empty:
                dfs.append(adn_table_df.add_prefix("ADN_"))

            merged_df = pd.concat(dfs, ignore_index=True, sort=False)
            return merged_df


    except Exception as e:
        print(f"Error extracting table from {pdf_path}: {e}", file=log_file)
        return pd.DataFrame()



def update_base_data(existing_df, new_df):
    """
    For each column in existing_df (assumed to be a single-row DataFrame),
    if the value is missing (empty string, "None", or NA), update it with the corresponding
    value from new_df (if provided and not missing).
    Returns the updated DataFrame.
    """
    for col in existing_df.columns:
        existing_val = existing_df.at[0, col]
        # Use pd.isna() to check for NA values.
        if pd.isna(existing_val) or existing_val == "" or existing_val == "None":
            if col in new_df.columns:
                new_val = new_df.at[0, col]
                if not (pd.isna(new_val) or new_val == "" or new_val == "None"):
                    existing_df.at[0, col] = new_val
    return existing_df



def process_attachment2_for_project(project_id, log_file):
    """
    For the given project:
      1. Identify all original (non‑revision) Appendix A PDFs and any addendum (revision) Appendix A PDFs 
         in the project's "02_phase_1_study" folder.
      2. If an original exists, use the first original for base data extraction.
         Otherwise, if only an addendum exists, use that for base data extraction.
      3. If any base data column is missing, iterate through additional PDFs (if available) to update the missing columns.
      4. Then proceed to scrape Attachment 2 PDFs and merge the base data (duplicated for each row) with
         the extracted table (using the first row as header).
         (Attachment 2 PDFs will be routed as before based solely on whether each PDF is flagged as addendum.)
    """
    global total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped
    project_folder = os.path.join(BASE_DIRECTORY, str(project_id), "02_phase_1_study")
    if not os.path.exists(project_folder):
        log_msg(f"Project folder not found: {project_folder}", log_file)
        return pd.DataFrame(), pd.DataFrame()
    
    # Gather all original (non‑revision) Appendix A PDFs and any addendum PDFs
    original_appendix_pdfs = []
    addendum_appendix_pdf = None
    for f in os.listdir(project_folder):
        if not f.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(project_folder, f)
        if is_appendix_pdf(pdf_path):
            if is_addendum_pdf(pdf_path):
                if not addendum_appendix_pdf:
                    addendum_appendix_pdf = f
            else:
                original_appendix_pdfs.append(f)
    
    # Determine which file to use for base data extraction.
    if not original_appendix_pdfs:
        if addendum_appendix_pdf:
            log_msg(f"No original Appendix A PDF found for project {project_id}. Using addendum PDF for base data extraction.", log_file)
            base_pdf = addendum_appendix_pdf
            base_data_df = extract_base_data(os.path.join(project_folder, base_pdf), project_id, log_file)
        else:
            log_msg(f"No Appendix A PDF (original or addendum) found for project {project_id}.", log_file)
            return pd.DataFrame(), pd.DataFrame()
    else:
        base_pdf = original_appendix_pdfs[0]
        log_msg(f"Scraped base data from original Appendix A PDF: {os.path.join(project_folder, base_pdf)}", log_file)
        base_data_df = extract_base_data(os.path.join(project_folder, base_pdf), project_id, log_file)

    # Process addendum base data similarly.
    if addendum_appendix_pdf:
        addendum_base_pdf_path = os.path.join(project_folder, addendum_appendix_pdf)
        log_msg(f"Scraped base data from addendum Appendix A PDF: {addendum_base_pdf_path}", log_file)
        addendum_base_data_df = extract_base_data(addendum_base_pdf_path, project_id, log_file)
        if addendum_base_data_df.empty:
            addendum_base_data_df = base_data_df.copy()
    else:
        addendum_base_data_df = base_data_df.copy()

    # Check for missing values in base_data_df (assumed single-row DataFrame)
    missing = [col for col in base_data_df.columns if pd.isna(base_data_df.at[0, col]) or base_data_df.at[0, col] in ["", "None"]]
    if missing:
        log_msg(f"Missing base data columns in first Appendix A PDF: {missing}", log_file)
        # Iterate over additional original PDFs to update missing values.
        for other_pdf in original_appendix_pdfs[1:]:
            other_pdf_path = os.path.join(project_folder, other_pdf)
            log_msg(f"Attempting to update base data from: {other_pdf_path}", log_file)
            new_base_df = extract_base_data(other_pdf_path, project_id, log_file)
            base_data_df = update_base_data(base_data_df, new_base_df)
            missing = [col for col in base_data_df.columns if pd.isna(base_data_df.at[0, col]) or base_data_df.at[0, col] in ["", "None"]]
            if not missing:
                break
        if missing:
            log_msg(f"After update, still missing: {missing}", log_file)
        else:
            log_msg("Successfully updated all missing base data.", log_file)

    # Now process Attachment 2 PDFs.
    attachment_data_list = []      # Regular Attachment 2 PDFs
    attachment_addendum_list = []  # Addendum Attachment 2 PDFs

    for f in os.listdir(project_folder):
        if not f.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(project_folder, f)
        log_msg(f"Accessing PDF: {pdf_path}", log_file)
        total_pdfs_accessed += 1
        if is_attachment2_pdf(pdf_path):
            log_msg(f"Scraped this Attachment 2 PDF: {pdf_path}", log_file)
            table_df = extract_first_table(pdf_path, log_file)
            if table_df.empty:
                log_msg(f"--> No table found in {pdf_path}. Skipping.", log_file)
                skipped_pdfs.append(f)
                total_pdfs_skipped += 1
                continue

            # (If needed, rename common columns; if not, comment this block out)
            common_cols = set(base_data_df.columns) & set(table_df.columns)
            for col in common_cols:
                table_df.rename(columns={col: f"{col}_table"}, inplace=True)

            # Determine base data for this Attachment 2 PDF based solely on its own revision status.
            if is_addendum_pdf(pdf_path):
                log_msg(f"--> {pdf_path} flagged as addendum.", log_file)
                base_df = addendum_base_data_df
                addendum_pdfs.append(f)
            else:
                base_df = base_data_df
                original_pdfs.append(f)

            # Ensure unique column names
            base_df = base_df.loc[:, ~base_df.columns.duplicated()]
            table_df = table_df.loc[:, ~table_df.columns.duplicated()]

            # Duplicate base data for each row of the table and merge side-by-side.
            repeated_base = pd.concat([base_df] * len(table_df), ignore_index=True)
            merged_df = pd.concat([repeated_base, table_df], axis=1)
            if is_addendum_pdf(pdf_path):
                attachment_addendum_list.append(merged_df)
            else:
                attachment_data_list.append(merged_df)
            scraped_pdfs.append(f)
            total_pdfs_scraped += 1
        else:
            log_msg(f"--> {pdf_path} is not an Attachment 2 PDF. Skipping.", log_file)
            skipped_pdfs.append(f)
            total_pdfs_skipped += 1

    if not attachment_data_list and not attachment_addendum_list:
        skipped_projects.add(project_id)
    else:
        scraped_projects.add(project_id)

    project_attachment_df = pd.concat(attachment_data_list, ignore_index=True) if attachment_data_list else pd.DataFrame()
    project_attachment_addendum_df = pd.concat(attachment_addendum_list, ignore_index=True) if attachment_addendum_list else pd.DataFrame()
    return project_attachment_df, project_attachment_addendum_df



# ------------------- CSV Saving & Summary Functions -------------------
def save_to_csv(df, output_csv_path, data_type):
    """Cleans the DataFrame and saves it to a CSV file."""
    if df.empty:
        print(f"No data to save for {data_type}.")
        return
    df = df.applymap(clean_string_cell)
    df = df[~df.apply(lambda row: contains_phrase(row, "Type of Upgrade"), axis=1)]
    df = reorder_columns(df)
    print(f"\nColumns reordered for {data_type} as per specification.")
    if 'q_id' in df.columns:
        df['q_id'] = pd.to_numeric(df['q_id'], errors='coerce')
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

# ------------------- Main Processing Function -------------------
def process_pdfs_in_folder():
    """
    Processes all projects in ascending order (filtered via projects_to_process).
    For each project, it:
      - Checks for the project folder.
      - Processes Attachment 2 PDFs by merging base data (from Appendix A PDFs) with
        scraped table data (only the first nonempty table).
      - Aggregates results across projects.
    After processing, it saves the combined results to CSV files and prints a summary.
    """
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped
    all_attachment_data = []
    all_attachment_addendum_data = []

    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)
    with open(LOG_FILE_PATH, 'w') as log_file:
        log_msg(f"Projects to process: {projects_to_process}", log_file)
        for project_id in projects_to_process:
            project_folder = os.path.join(BASE_DIRECTORY, str(project_id))
            if not os.path.exists(project_folder):
                missing_projects.add(project_id)
                log_msg(f"Project folder not found: {project_folder}", log_file)
                continue
            log_msg(f"\n--- Processing project {project_id} ---", log_file)
            proj_attach_df, proj_attach_add_df = process_attachment2_for_project(project_id, log_file)
            if not proj_attach_df.empty:
                all_attachment_data.append(proj_attach_df)
            if not proj_attach_add_df.empty:
                all_attachment_addendum_data.append(proj_attach_add_df)
        
        if all_attachment_data:
            core_originals = pd.concat(all_attachment_data, ignore_index=True)
            save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
        else:
            log_msg("\nNo Attachment 2 data processed for regular PDFs.", log_file)
        
        if all_attachment_addendum_data:
            core_addendums = pd.concat(all_attachment_addendum_data, ignore_index=True)
            save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")
        else:
            log_msg("\nNo Attachment 2 data processed for addendum PDFs.", log_file)

        total_projects_processed = len(scraped_projects) + len(skipped_projects)
        log_msg("\n=== Scraping Summary ===", log_file)
        log_msg(f"Total Projects Processed: {total_projects_processed}", log_file)
        log_msg(f"Total Projects Scraped: {len(scraped_projects)}", log_file)
        log_msg(f"Total Projects Skipped: {len(skipped_projects)}", log_file)
        log_msg(f"Total Projects Missing: {len(missing_projects)}", log_file)
        log_msg(f"Total PDFs Accessed: {total_pdfs_accessed}", log_file)
        log_msg(f"Total PDFs Scraped: {total_pdfs_scraped}", log_file)
        log_msg(f"Total PDFs Skipped: {total_pdfs_skipped}", log_file)
        log_msg("\nList of Scraped Projects: " + str(sorted(scraped_projects)), log_file)
        log_msg("\nList of Skipped Projects: " + str(sorted(skipped_projects)), log_file)
        log_msg("\nList of Missing Projects: " + str(sorted(missing_projects)), log_file)
        log_msg("\nList of Scraped PDFs: " + str(scraped_pdfs), log_file)
        log_msg("\nList of Skipped PDFs: " + str(skipped_pdfs), log_file)
        log_msg("\nList of Addendum PDFs: " + str(addendum_pdfs), log_file)
        log_msg("\nList of Original PDFs: " + str(original_pdfs), log_file)
        log_msg("\nList of Style N PDFs (Skipped due to 'Network Upgrade Type'): " + str(style_n_pdfs), log_file)
        log_msg("\nTotal Number of Style N PDFs: " + str(len(style_n_pdfs)), log_file)
        log_msg("\nNumber of Original PDFs Scraped: " + str(len([pdf for pdf in scraped_pdfs if pdf in original_pdfs])), log_file)
        log_msg("Number of Addendum PDFs Scraped: " + str(len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs])), log_file)

# ------------------- Main -------------------
def main():
    process_pdfs_in_folder()

if __name__ == "__main__":
    main()


Projects to process: [1231, 1247, 1257, 1261, 1266, 1274, 1276, 1279, 1280, 1289, 1290, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346, 1347]

--- Processing project 1231 ---
No Appendix A PDF (original or addendum) found for project 1231.
Project folder not found: /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/1247

--- Processing project 1257 ---
No Appendix A PDF (original or addendum) found for project 1257.
Project folder not found: /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/1261
Project folder not found: /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/1266
Project folder not found: /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/1274
Project folder

  df = df.applymap(clean_string_cell)



Columns reordered for originals as per specification.

Data successfully saved to /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 9/03_raw/rawdata_cluster9_style_J_originals.csv

Columns reordered for addendums as per specification.

Data successfully saved to /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 9/03_raw/rawdata_cluster9_style_J_addendums.csv

=== Scraping Summary ===
Total Projects Processed: 44
Total Projects Scraped: 44
Total Projects Skipped: 0
Total Projects Missing: 17
Total PDFs Accessed: 182
Total PDFs Scraped: 64
Total PDFs Skipped: 118

List of Scraped Projects: [1295, 1296, 1297, 1299, 1300, 1301, 1302, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1338, 1339, 1341, 1344, 1345, 1347]

List of Skipped Projects: []

List of Missing Projects: [1247, 1261, 1266, 1274, 1276, 1

  df = df.applymap(clean_string_cell)


# Obtain COlumn names

In [1]:
import pandas as pd
import re
import unicodedata

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J_originals.csv', dtype={'estimated_time_to_construct': str})

df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')


def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []  # Initialize an empty list to hold the cleaned header names.
    for header in headers:  # Iterate over each header in the input.
        if header is None:
            header = ""  # If the header is None, set it to an empty string.
        elif isinstance(header, str):  # Otherwise, if the header is a string:
            #header = header.lower()  # Convert the header to lowercase.
            header = re.sub(r'\s+', ' ', header)  # Replace one or more whitespace characters with a single space.
            #header = re.sub(r'\(.*?\)', '', header)  # Remove any text within parentheses (non-greedy).
            header = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', header)  # Remove any character that is not a letter, number, or whitespace.
            header = header.strip()  # Remove any leading or trailing whitespace.
        cleaned_headers.append(header)  # Append the cleaned header to the list.
    return cleaned_headers  # Return the list of cleaned headers.



df.columns = clean_column_headers(df.columns)

def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]
print("After cleaning:", df.columns.tolist())


 


After cleaning: ['q_id', 'cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection', 'project_q1295tot789', 'unnamed_8', 'none_2', 'none_3', 'none_4', 'none_5', 'none_6', 'none_7', 'adn_project_q1295tot789', 'adn_none', 'adn_none_2', 'adn_none_3', 'adn_none_4', 'adn_none_5', 'adn_none_6', 'adn_none_7', 'element', 'interconnection_facilities_costs_x_1000_constant_dollar_2016', 'reliability_network_upgrades_costs_x_1000_constant_dollar_2016', 'delivery_network_upgrades_costs_x_1000_constant_dollar_2016', 'distribution_upgrades_costs_x_1000_constant_dollar_2016', 'total_estimated_costs_x_1000_constant_dollar_2016', 'total_estimated_costs_x_1000_escalated_constant_dollars_2022', 'estimated_time_to_construct_months_note_345_6', 'od_dollar_escalation_duration_months_note_345_6', 'adn_element', 'adn_interconnection_facilities_costs_x_1000_constant_dollar_2016', 'adn_reliability_network_upgrades_costs_x_1000_constant_dollar_2016', 'adn_delivery_network_upg

  df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 9/03_raw/rawdata_cluster9_style_J_originals.csv', dtype={'estimated_time_to_construct': str})


# Itemized and Total

In [91]:
import pandas as pd
import re
import unicodedata

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_itemized.csv', dtype={'estimated_time_to_construct': str})

df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')


def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []  # Initialize an empty list to hold the cleaned header names.
    for header in headers:  # Iterate over each header in the input.
        if header is None:
            header = ""  # If the header is None, set it to an empty string.
        elif isinstance(header, str):  # Otherwise, if the header is a string:
            #header = header.lower()  # Convert the header to lowercase.
            header = re.sub(r'\s+', ' ', header)  # Replace one or more whitespace characters with a single space.
            #header = re.sub(r'\(.*?\)', '', header)  # Remove any text within parentheses (non-greedy).
            header = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', header)  # Remove any character that is not a letter, number, or whitespace.
            header = header.strip()  # Remove any leading or trailing whitespace.
        cleaned_headers.append(header)  # Append the cleaned header to the list.
    return cleaned_headers  # Return the list of cleaned headers.



df.columns = clean_column_headers(df.columns)

def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]
print("After cleaning:", df.columns.tolist())


 


After cleaning: ['q_id', 'cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection', 'type_of_upgrade', 'upgrade', 'estimated_cost_x_1000', 'escalated_cost_x_1000', 'estimated_time_to_construct', 'item', 'max_time_to_construct']


## Originals 

In [92]:
import pandas as pd
import re
import unicodedata
import numpy as np
# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J_originals.csv', dtype={'estimated_time_to_construct': str})

#df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')




def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []  # Initialize an empty list to hold the cleaned header names.
    for header in headers:  # Iterate over each header in the input.
        if header is None:
            header = ""  # If the header is None, set it to an empty string.
        elif isinstance(header, str):  # Otherwise, if the header is a string:
            #header = header.lower()  # Convert the header to lowercase.
            header = re.sub(r'\s+', ' ', header)  # Replace one or more whitespace characters with a single space.
            #header = re.sub(r'\(.*?\)', '', header)  # Remove any text within parentheses (non-greedy).
            header = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', header)  # Remove any character that is not a letter, number, or whitespace.
            header = header.strip()  # Remove any leading or trailing whitespace.
        cleaned_headers.append(header)  # Append the cleaned header to the list.
    return cleaned_headers  # Return the list of cleaned headers.

df.columns = clean_column_headers(df.columns)

 



#STEP 2: NAMING CONVENTION
def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]




def move_dollar_values(df, source_column, target_column):
    """
    For each row in the DataFrame, if the value in `source_column` starts with a '$',
    move that value to `target_column` and clear the value in the source column.

    Parameters:
      df (pd.DataFrame): The input DataFrame.
      source_column (str): The column to check for values starting with '$'.
      target_column (str): The column to move the values into.

    Returns:
      pd.DataFrame: The modified DataFrame.
    """
    # Ensure target_column exists; if not, create it filled with empty strings.
    if target_column not in df.columns:
        df[target_column] = ""
    
    # Create a boolean mask for rows where the source column starts with '$'
    mask = df[source_column].astype(str).str.startswith('$', na=False)
    
    # Move the values: assign the source values to the target column where the mask is True.
    df.loc[mask, target_column] = df.loc[mask, source_column]
    
    # Clear the source column values for those rows (set to empty string)
    df.loc[mask, source_column] = ""
    
    return df

# Move values from 'unnamed_8' to a new column 'moved_value'
#df = move_dollar_values(df, 'none_5', 'total_estimated_costs_x_1000_escalated_constant_dollars_od_year')


#df = move_dollar_values(df, 'none_3','total_estimated_costs_x_1000_constant_dollar_2020')

def remove_dollar_values_and_fill_nan(df, column):
    """
    For each row in the DataFrame, if the value in the specified column starts with '$',
    set that value to NaN. Also, replace any empty strings in that column with NaN.

    Parameters:
      df (pd.DataFrame): The input DataFrame.
      column (str): The column to check and clean.

    Returns:
      pd.DataFrame: The modified DataFrame.
    """
    # Ensure the column is treated as string
    df[column] = df[column].astype(str)
    
    # Set values starting with '$' to NaN
    mask = df[column].str.startswith('$', na=False)
    df.loc[mask, column] = np.nan
    
    # Replace any remaining empty strings with NaN
    df[column] = df[column].replace("", np.nan)
    
    return df

df = remove_dollar_values_and_fill_nan(df, 'unnamed_8')


def insert_phrase_if_missing(df, target_columns, phrase="Area Delivery Network Upgrades"):
    """
    For each q_id group in df, for each column in target_columns:
      - Check the first row (by current order) that is non-empty (not NaN, not blank, not 'na').
      - If that cell does NOT contain the given phrase (case-insensitive),
        insert a new row above it (within that q_id group) that sets that column to phrase,
        leaving other columns as empty strings.
    
    Returns a new DataFrame with the added rows.
    """
    # Work on a copy and reset index; create a temporary "order" column for precise control.
    df = df.copy().reset_index(drop=True)
    df["order"] = df.index.astype(float)
    
    new_rows = []
    
    # Process each q_id group
    for qid, group in df.groupby("q_id", sort=False):
        # For each target column, find the first non-empty cell.
        for col in target_columns:
            # We'll iterate over the group's index in order.
            first_idx = None
            first_val = None
            for idx in group.index:
                val = group.loc[idx, col]
                # Convert to string and strip spaces.
                # Treat NaN, empty strings, or "na"/"NA" as empty.
                val_str = "" if pd.isna(val) else str(val).strip()
                if val_str == "" or val_str.lower() == "na":
                    continue
                first_idx = idx
                first_val = val_str
                break
            # If we found a non-empty value and it doesn't contain our phrase, insert a row.
            if first_idx is not None:
                if phrase.lower() not in first_val.lower():
                    # Create a new row with the same q_id; leave all columns blank except col.
                    new_row = {c: "" for c in df.columns if c != "order"}
                    new_row["q_id"] = qid
                    new_row[col] = phrase
                    # Set its order to be a bit before the first non-empty row.
                    new_order = group.loc[first_idx, "order"] - 0.1
                    new_row["order"] = new_order
                    new_rows.append(new_row)
    
    # If any rows were added, append them and sort by q_id then order.
    if new_rows:
        new_rows_df = pd.DataFrame(new_rows)
        df = pd.concat([df, new_rows_df], ignore_index=True)
        df = df.sort_values(by=["q_id", "order"]).reset_index(drop=True)
    
    # Drop the temporary order column.
    df = df.drop(columns=["order"])
    return df


df = insert_phrase_if_missing(df, ['adn_area_delivery_network_upgrades'], phrase="Area Delivery Network Upgrades")  


def move_numeric_values(df, source_column, target_column):
    """
    For each row in the DataFrame, if the value in `source_column` is purely numeric
    (i.e. contains only an optional negative sign, digits, and an optional decimal part),
    then move that value to `target_column` and clear the source cell.
    
    Parameters:
      df (pd.DataFrame): The input DataFrame.
      source_column (str): The column to check for numeric values.
      target_column (str): The column to move the numeric values into.
      
    Returns:
      pd.DataFrame: The modified DataFrame.
    """
    # Ensure target_column exists
    if target_column not in df.columns:
        df[target_column] = np.nan

    # Define a function to check if a value is purely numeric
    def is_pure_numeric(val):
        # If the value is already numeric (int, float) and not NaN, we consider it numeric.
        if isinstance(val, (int, float)) and not pd.isna(val):
            return True
        # Otherwise, if it's a string, strip whitespace and check using regex.
        if isinstance(val, str):
            val_str = val.strip()
            # Regex explanation:
            # ^           : start of string
            # -?          : optional minus sign
            # \d+         : one or more digits
            # (\.\d+)?    : optional decimal point followed by one or more digits
            # $           : end of string
            if re.fullmatch(r'-?\d+(\.\d+)?', val_str):
                return True
        return False

    # Create a boolean mask where the source_column values are pure numeric
    mask = df[source_column].apply(is_pure_numeric)

    # Move the values: assign the source values to the target column where the mask is True.
    df.loc[mask, target_column] = df.loc[mask, source_column]
    
    # Clear the source column values for those rows (set to NaN)
    df.loc[mask, source_column] = np.nan

    return df


df = move_numeric_values(df, 'otherpotential_none_3', 'otherpotential_none_5')
df = move_numeric_values(df, 'otherpotential_none_4', 'otherpotential_none_6')
df = move_numeric_values(df, 'otherpotential_none', 'otherpotential_none_3')
df = move_numeric_values(df, 'otherpotential_none_2', 'otherpotential_none_4')




 


def merge_columns(df):

    merge_columns_dict = {


  

        "one_time_costs_b": [
            "none_2",
            'adn_none_2',
        ],


 
 

 
    
        "type_of_upgrade": [
            
             'cost_category', 
           
            "cost_category_notes_1a_to_1f",
           
            'other_potential_cost', 
            'other_potential_network_cost',
            'area_delivery_network_upgrades', 
            'otherpotential_other_potential_network_upgrades_note_1h',
            'adn_area_delivery_network_upgrades',
            'project_q1295tot789',
            'project_q1296tot834', 
             'project_q1297tot794', 
              'project_q1299tot813', 
              'project_q1300tot822', 
              'project_q1301tot825', 
              'project_q1302tot830', 'project_q1305tot786_updated_as_of_2162017', 
              'project_q1305tot786', 'project_q1306tot792', 'project_q1307tot820',
              'other_potential_network_upgrades_note_1112', 'area_delivery_network_upgrades', 'project_q1309tot805', 'project_q1310tot803', 'project_q1311tot802', 'project_q1312tot812', 'project_q1313tot811', 
              'project_q1314tot810', 'project_q1315tot829', 'project_q1316_tot838_updated_as_of_2162017', 'project_q1316_tot838', 'project_q1317tot791', 'project_q1318tot790', 'project_q1319tot793', 'project_q1320tot807', 
              'project_q1321tot814', 'project_q1322tot795', 'project_q1323tot821', 'project_q1324tot819', 'project_q1325tot818', 'project_q1326tot817', 'project_q1327tot806', 'project_q1328tot815', 'project_q1329tot823', 
              'project', 'antelope_66kv_relay_coordination_study', 'moorpark_c_66kv', 'project_q1330_tot827', 'project_q1331tot828', 'project_q1332tot826', 'project_q1333tot824', 'project_q1334tot787',
                'antelope_66kv', 'project_q1335tot833', 'project_q1336_tot804', 'project_q1338tot801', 'project_q1339tot809', 
                'qc9_phase_ii_study_report_attachment_2_escalated_cost_and_time_to_construct_for_interconnection_facilities_reliability_network_upgrades_delivery_network_upgrades_and_distribution_upgrades_project_q1341tot796_updated_as_of11132017',
                  'project_q1341tot796', 'cost_category_notes_1a_to_1f',
                  'other_potential_network_upgrades_note_1h', 'project_q1344tot797', 'project_q1345tot836', 'project_q1347tot837',
                   'element', 
                   'adn_project_q1295tot789',
                   'adn_element',
                   'adn_project_q1296tot834', 'otherpotential_project_q1297tot794', 
                   'adn_project_q1297tot794', 'adn_project_q1299tot813', 'adn_project_q1300tot822', 'project_q1301tot825_updated_as_of_2132017', 'adn_project_q1301tot825_updated_as_of_2132017', 'adn_project_q1301tot825', 'project_q1302tot830_updated_as_of_2132017', 'adn_project_q1302tot830_updated_as_of_2132017', 'adn_project_q1302tot830', 'adn_project_q1305tot786_updated_as_of_2162017',
                     'adn_project_q1305tot786', 'adn_project_q1306tot792', 'adn_project_q1307tot820', 'otherpotential_other_potential_network_upgrades_note_1112', 
 
            'adn_project_q1309tot805', 'adn_project_q1310tot803', 'adn_project_q1311tot802', 'adn_project_q1312tot812', 'adn_project_q1313tot811', 'adn_project_q1314tot810',
              'adn_project_q1315tot829', 'adn_project_q1316_tot838_updated_as_of_2162017', 'adn_project_q1316_tot838', 'project_q1317tot791_updated_as_of_2132017', 
              'adn_project_q1317tot791_updated_as_of_2132017', 'adn_project_q1317tot791', 'project_q1318tot790_updated_as_of_2132017', 'adn_project_q1318tot790_updated_as_of_2132017', 
              'adn_project_q1318tot790', 'adn_project_q1319tot793', 'project_q1319tot793_updated_as_of_2132017', 'adn_project_q1319tot793_updated_as_of_2132017', 'adn_project_q1320tot807', 
              'project_q1320tot807_updated_as_of_2132017', 'adn_project_q1320tot807_updated_as_of_2132017', 'adn_project_q1321tot814', 'project_q1322tot795_updated_as_of_2132017', 
              'adn_project_q1322tot795_updated_as_of_2132017', 'adn_project_q1322tot795', 'project_q1323tot821_updated_as_of_2132017', 'adn_project_q1323tot821_updated_as_of_2132017', 'adn_project_q1323tot821',
                'adn_project_q1324tot819', 'project_q1324tot819_updated_as_of_2132017', 'adn_project_q1324tot819_updated_as_of_2132017', 'adn_project_q1325tot818', 'adn_project_q1326tot817', 
                'project_q1327tot806_updated_as_of_2132017', 'adn_project_q1327tot806_updated_as_of_2132017', 'adn_project_q1327tot806', 'project_q1328tot815_updated_as_of_2132017',
                'adn_project_q1328tot815_updated_as_of_2132017', 'adn_project_q1328tot815', 'project_q1329tot823_updated_as_of_2132017', 'adn_project_q1329tot823_updated_as_of_2132017',
                  'adn_project_q1329tot823', 'adn_project_q1330_tot827', 'adn_project_q1331tot828', 'project_q1331tot828_updated_as_of_2132017', 'adn_project_q1331tot828_updated_as_of_2132017', 
                  'project_q1332tot826_updated_as_of_2132017', 'adn_project_q1332tot826_updated_as_of_2132017', 'adn_project_q1332tot826', 'adn_project_q1333tot824', 'adn_project_q1334tot787', 
                  'adn_project_q1335tot833', 'otherpotential_project_q1336_tot804', 'adn_project_q1336_tot804', 'adn_project_q1338tot801', 'adn_project_q1339tot809', 'otherpotential_project_q1341tot796', 
            'adn_project_q1341tot796', 'otherpotential_project_q1344tot797', 'adn_project_q1344tot797', 'otherpotential_project_q1345tot836', 'adn_project_q1345tot836', 'otherpotential_project_q1347tot837', 'adn_project_q1347tot837',
            "unnamed_8",
           
         
            
 
             
        ],
        "escalated_cost_x_1000": [
            "total_escalated_costs_wo_itcc",
            "total_estimated_costs_x_1000_escalated_constant_dollars_od_year",
            "none_4",
             'otherpotential_none_4',
            'total_escalated_costs_in_1000s',
            'escalated_cost_in_1000s_note_8',
            'total_estimated_costs_x',
             'total_escalated_costs_in_1000s', 
             'total_escalated_costs_to_od_year_in_1000s',
              'total_estimated_costs_x_1000_escalated_constant_dollars_od_year',
              
              'adn_unnamed_5',
              'adn_none_4',
              'total_estimated_costs_x_1000_escalated_constant_dollars_2022', 
             
              'total_estimated_costs_x_1000_escalated_constant_dollars_od_year',
              'total_estimated_costs_x_1000_escalated_constant_dollars_2021',
              'adn_total_estimated_costs_x_1000_escalated_constant_dollars_2022',
              'adn_total_estimated_costs_x_1000_escalated_constant_dollars_od_year',
             
              
            

        ],
        "estimated_cost_x_1000": [
            "none_3",
            'total_costs_wo_itcc_cab',
            'total_estimated_costs_x_1000_constant_dollar_2017',
             'otherpotential_none_3',
            
            'adn_unnamed_2',
            'adn_none_3',
            'adn_total_estimated_costs_x_1000_constant_dollar_2017',
            'total_estimated_costs_x_1000_constant_dollar_2018',
        
            'total_estimated_costs_x_1000_constant_dollar_2016', 
            'total_estimated_costs_x_1000_escalated_constant_dollars_2021', 
            'total_estimated_costs_x_1000_constant_dollar_2017', 
            'adn_interconnection_facilities_costs_x_1000_constant_dollar_2016', 'adn_reliability_network_upgrades_costs_x_1000_constant_dollar_2016', 
            'adn_delivery_network_upgrades_costs_x_1000_constant_dollar_2016', 
            'adn_distribution_upgrades_costs_x_1000_constant_dollar_2016',
              'adn_total_estimated_costs_x_1000_constant_dollar_2016', 
             

           #     'interconnection_facilities_costs_x_1000_constant_dollar_2016', 'reliability_network_upgrades_costs_x_1000_constant_dollar_2016', 
            #'delivery_network_upgrades_costs_x_1000_constant_dollar_2016', 'distribution_upgrades_costs_x_1000_constant_dollar_2016', 
 
        ],
        "estimated_time_to_construct": [
            'estimated_time_for_licensing_permitting_construction_months',
            'estimated_time_to_construct_months',
            'estimated_time_to_construct_months2',
            'estimated_time_to_construct_months',
            'estimated_time_to_construct_months',
            'estimated_time_to_construct_months_note_1g',
            'estimated_time_to_construct_months_note_1g',
            'estimated_time_to_construct_months',
            "none_5",
            'estimated_time_to_construct_months',
            'upgrade_duration_months',
            'estimated_time_to_construct_months_note_12', 
            'adnu_duration_months',
            'estimated_time_to',
      
            'none_12',
            'estimated_duration_months',
            'otherpotential_none_5',
            'adn_unnamed_8', 
            'adn_none_5',
            'adn_estimated_time_to_construct_months', 
            'estimated_time_to_construct_months_note_345_6', 
            'estimated_time_to_construct_months_note_345_9_10',
            'estimated_time_to_construct_months_note_1g',
            'adn_estimated_time_to_construct_months_note_345_6', 
            'adn_estimated_time_to_construct_months_note_345_9_10',
            
        ],
        "description": ["description"],
        "capacity": [
            "capacity",
            "project size",
            "project mw",
            "mw at poi"
        ],
 
        "max_time_to_construct": [
            'maximum_escalation_duration_months',
            'od_dollar_escalation_duration_months',
            'od_dollar_escalation_duration_months',
            'od_dollar_escalation_duration_months_note1g',
            'od_dollar_escalation_duration_months_note1g',
            "none_6",
            'maximum_escalation_duration_months',
            'od_dollar_escalation',
            'maximum_project_duration_months',
            'otherpotential_none_6',
            'adn_unnamed_11', 
            'adn_none_6',
            
            'adn_od_dollar_escalation_duration_months',
            'od_dollar_escalation_duration_months_note_345_6',
            'od_dollar_escalation_duration_months_note_345_9_10', 
             'od_dollar_escalation_duration_months_note_1g',
             'adn_od_dollar_escalation_duration_months_note_345_6',
             'adn_od_dollar_escalation_duration_months_note_345_9_10',
            
        ]



    }

        # Identify unnamed columns
    unnamed_columns = [col for col in df.columns if pd.isna(col) or col.strip() == "" or col.startswith("Unnamed")]
    if unnamed_columns:
        merge_columns_dict["type_of_upgrade"].extend(unnamed_columns)

    for new_col, old_cols in merge_columns_dict.items():
        existing_cols = [col for col in old_cols if col in df.columns]
        if existing_cols:
            df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
            cols_to_drop = [col for col in existing_cols if col != new_col]
            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)

    return df

df = merge_columns(df)
df.drop([   'adn_none', 'adn_none_7','otherpotential_none_2','otherpotential_none', 'otherpotential_none_7',
         'one_time_costs_b','none_8', 'none_9', 'none_7' ,'none_10',  'interconnection_facilities_costs_x_1000_constant_dollar_2016', 'reliability_network_upgrades_costs_x_1000_constant_dollar_2016', 
            'delivery_network_upgrades_costs_x_1000_constant_dollar_2016', 'distribution_upgrades_costs_x_1000_constant_dollar_2016', ], axis=1, inplace=True)








 


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
 
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df        


df= reorder_columns(df)



df = df[~df.apply(lambda row: row.astype(str).isin(["Constant 2016 Dollar in $1000s (Estimate)", "Eastern","Note (h)"]).any(), axis=1)]
df = df[~df.apply(lambda row: any(str(cell).startswith("Project #:") for cell in row), axis=1)]
df = df[~df.apply(lambda row: any(str(cell).startswith("Location Constrained Resource Interconnection Facilities") for cell in row), axis=1)]



df = df[df['type_of_upgrade'].notna() & (df['type_of_upgrade'].astype(str).str.strip() != "")]

df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J.csv', index=False)


def process_upgrade_columns(df):
    """
    Given a DataFrame df with a column "type_of_upgrade" that contains both group headers and upgrade data,
    this function:
      1. Inserts a new column "upgrade" as a duplicate of "type_of_upgrade" (placed immediately after it).
      2. Renames rows in "type_of_upgrade" that contain specific phrases as follows:
           - If it contains "Interconnection Facilities", rename to "PTO_IF" (or "PTO_IF Total" if "Total" is present)
           - If it contains "Reliability Network Upgrade", rename to "RNU" (or "RNU Total" if "Total" is present)
           - If it contains "Local Delivery Network Upgrades", rename to "LDNU" (or "LDNU Total" if "Total" is present)
           - If it contains "Area Deliverability Network Upgrades", rename to "ADNU" (or "ADNU Total" if "Total" is present)
           - If it contains "Distribution Upgrades", leave it as is.
      3. Creates a temporary column that only holds the header values (from rows that were detected as header rows) and forward-fills it downward.
         The forward fill stops (i.e. does not fill into a row) if that row’s original "type_of_upgrade" contains any of the "total" indicators.
      4. Replaces "type_of_upgrade" with the forward-filled header values.
      5. Drops the rows that originally were header rows.
      6. This deletes any rows which are either Total or Subtotal or Total cost assigned, the reason is some proejcts have multiple pdfs thus we rather calculate the total in the end.
      
    Returns the updated DataFrame.
    """
    import pandas as pd
    
    # 1. Create a new column "upgrade" immediately after "type_of_upgrade"
    loc = df.columns.get_loc("type_of_upgrade")
    df.insert(loc+1, "upgrade", df["type_of_upgrade"])
    
    # 2. Define a helper to rename header rows.
    def rename_header(val):
        # If the cell contains any of these phrases, rename accordingly.
        # We'll check using the substring test (case-sensitive) per your request.
        if "Interconnection Facilities" in val:
            return "PTO_IF" + (" Total" if "Total" in val else "")
        elif "Reliability Network Upgrade" in val:
            return "RNU" + (" Total" if "Total" in val else "")
        elif "Local Delivery Network Upgrades" in val:
            return "LDNU" + (" Total" if "Total" in val else "")
        elif "Area Deliverability Network Upgrades" in val:
            return "ADNU" + (" Total" if "Total" in val else "")
        elif "Distribution Upgrades" in val:
            return val  # leave unchanged
        elif "Conditional Assigned Network Upgrades" in val:
            return  ("Total " if "Total" in val else "") + "CANU" 
        elif "Non-Allocated IRNU" in val:
            return  ("Total " if "Total" in val else "") + "Non-Allocated IRNU"
        elif "Area Delivery Network Upgrades" in val:
            return "ADNU" + (" Total" if "Total" in val else "")
        else:
            return val
    
    # 3. Identify header rows. We consider a row to be a header row if its "type_of_upgrade" cell 
    # contains any of the target phrases.
    target_phrases = [
        "Interconnection Facilities",
        "Reliability Network Upgrade",
        "Local Delivery Network Upgrades",
        "Area Deliverability Network Upgrades",
        "Distribution Upgrades",
        "Conditional Assigned Network Upgrades",
        "Non-Allocated IRNU",
        "Area Delivery Network Upgrades",

    ]
    # Create a boolean mask for header rows.
    header_mask = df["type_of_upgrade"].apply(lambda x: any(phrase in x for phrase in target_phrases))
    
    # Apply renaming to the header rows.
    df.loc[header_mask, "type_of_upgrade"] = df.loc[header_mask, "type_of_upgrade"].apply(rename_header)
    
    # 4. Create a temporary column 'header_temp' that holds only the header rows, then forward fill it.
    df["header_temp"] = df["type_of_upgrade"].where(header_mask)
    df["header_temp"] = df["header_temp"].ffill()
    
    # We want to stop the forward fill if we encounter a row that indicates totals.
    # Define a simple function that returns True if a cell contains "Total" or "Subtotal" or "Total cost assigned".
    def is_total_indicator(val):
        return ("Total" in val) or ("Subtotal" in val) or ("Total cost assigned" in val)
    
    # For rows that themselves are total indicators in the "upgrade" column, do not forward-fill (set header_temp to NaN)
    df.loc[df["upgrade"].apply(lambda x: is_total_indicator(x)), "header_temp"] = None
    
    # Now, replace the "type_of_upgrade" column with the forward-filled header
    df["type_of_upgrade"] = df["header_temp"]
    df.drop("header_temp", axis=1, inplace=True)
    
    # 5. Finally, drop the rows that were header rows (i.e. where header_mask is True)
    df = df[~header_mask].reset_index(drop=True)
    
    # Also, drop any rows that have an empty "type_of_upgrade"
    df = df[df["type_of_upgrade"].notna() & (df["type_of_upgrade"].str.strip() != "")]
    
    return df

 

df = process_upgrade_columns(df)


df = df[~df.apply(lambda row: any(str(cell).startswith("Total Escalated Costs w/o ITCC") for cell in row), axis=1)] 




 
mappings = {
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "PTO's Interconnection Facilities": "PTO_IF",
 "RNUs, Estimated Costs, and Estimated Time to Construct Summary": "RNU",
 "Non-Allocated IRNU": "RNU",
 "Total Non-Allocated IRNU": "Total RNU",
 }

if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )




df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)
   


 
   
 
 


 

 

    
# Step 7: Remove $ signs and convert to numeric
import re

def clean_currency(value):
    """
    Cleans a string by explicitly removing $, *, (Note 2), and similar patterns,
    then converts it to a numeric value.
    """
    if isinstance(value, str):
        # Explicitly remove $, *, and any "(Note ...)"
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
        value = value.replace(',', '').strip()  # Remove commas and extra spaces
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA  # Return NaN for invalid entries


# Clean the specific columns
for col in ['estimated_cost_x_1000', 'escalated_cost_x_1000']:
    if col in df.columns:
        df[col] = df[col].apply(clean_currency)


##################################################################################################################
df = df.drop_duplicates(subset=['q_id','type_of_upgrade','upgrade', 'estimated_cost_x_1000', 'escalated_cost_x_1000'])   

df = df[~df['upgrade'].astype(str).isin([
    "CANUIRNU", 
    "CANUGRNU", 
    "SCD",
    "CANU-LDNU",
    "IRNUs",
    "GRNUs",
    "Maximum Cost Responsibility (Network Upgrades)",
    "Network Upgrade",


])]



# Create Total rows
new_rows = []
columns_to_sum = ['estimated_cost_x_1000', 'escalated_cost_x_1000']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    unique_upgrades = group['type_of_upgrade'].dropna().unique()
    for upgrade in unique_upgrades:
        if pd.isna(upgrade):
            continue
        
        rows = group[group['type_of_upgrade'] == upgrade]

                # Check if a Total row already exists for this (q_id, upgrade)
        total_exists = group[
            (group['type_of_upgrade'] == upgrade) & (group['item'] == 'no')
        ].shape[0] > 0
        
        if total_exists:
             
            continue


 
        
        
        if not total_exists:
            # If only one row exists, duplicate it as the total row
            if len(rows) == 1:

                total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings

                # Populate the necessary fields
                total_row['q_id'] = q_id
                total_row['type_of_upgrade'] = f"Total {upgrade}"
                total_row['item'] = 'no'

                # Populate specified columns from the existing row
                first_row = rows.iloc[0]
                for col in columns_to_populate:
                    if col in df.columns:
                        total_row[col] = first_row[col]

                # Sum the numeric columns (single row, so it remains the same)
                for col in columns_to_sum:
                    if col in rows.columns:
                        total_row[col] = rows[col].sum()
                    else:
                        total_row[col] = 0  # Default to 0 if column is missing

                new_rows.append(total_row)



 
            
            # If multiple rows exist, sum numeric columns and create a total row
            elif len(rows) > 1:
                total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings

                # Populate the necessary fields
                total_row['q_id'] = q_id
                total_row['type_of_upgrade'] = f"Total {upgrade}"
                total_row['item'] = 'no'

                # Populate the specified columns from the first row in the group
                first_row = rows.iloc[0]
                for col in columns_to_populate:
                    if col in df.columns:
                        total_row[col] = first_row[col]

                # Sum the numeric columns
                for col in columns_to_sum:
                    if col in rows.columns:
                        total_row[col] = rows[col].sum()
                    else:
                        total_row[col] = 0  # Default to 0 if column is missing

                new_rows.append(total_row)
 



      

if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    for col in df.columns:
        if col not in total_rows_df.columns:
            total_rows_df[col] = None
    total_rows_df = total_rows_df[df.columns]
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)

# Update 'item' column based on Total in type_of_upgrade or cost_allocation_factor
df['item'] = df.apply(
    lambda row: 'no' if (
        'Total' in str(row.get('type_of_upgrade', '')) or 
        'Total' in str(row.get('cost_allocation_factor', ''))
    ) else 'yes',
    axis=1
)


# Step 8: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    item_index = cols.index('item')
    type_index = cols.index('type_of_upgrade')
    if item_index < type_index:
        cols.insert(type_index + 1, cols.pop(item_index))
    else:
        cols.insert(type_index + 1, cols.pop(item_index))
    df = df[cols]

 



def clean_estimated_time(value):
    if isinstance(value, str):
        value = re.sub(r'(\d+(?:-\w+)*)\s+\w+.*$', r'\1', value, flags=re.IGNORECASE).strip()
    return value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

 
mappings = {
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "PTO's Interconnection Facilities": "PTO_IF",
 "RNUs, Estimated Costs, and Estimated Time to Construct Summary": "RNU",
'Total PTO_IF': 'PTO_IF',
'PTO_IF Total': 'PTO_IF',
 'Total RNU': 'RNU',
 'RNU Total': 'RNU',
 'Total LDNU': 'LDNU',
 'Total OPNU' : 'OPNU',
 'Total CANU': 'CANU',
 'Total LOPNU': 'LOPNU',
 'Total ADNU': 'ADNU',
 'LDNU Total': 'LDNU',
 'Total Distribution Upgrades': 'Distribution Upgrades',
 'Distribution Upgrades Total': 'Distribution Upgrades',
 'Total Potential Distribution Upgrades': 'Potential Distribution Upgrades',
}

if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )



 

if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

numeric_columns = [
    
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
     
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
     
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df        


df= reorder_columns(df)
df = df[~df['upgrade'].astype(str).isin([
    "CANUIRNU", 
    "CANUGRNU", 
    "SCD",
    "CANU-LDNU",
    "IRNUs",
    "GRNUs",
    "Maximum Cost Responsibility (Network Upgrades)",
    "Network Upgrade",
    "Plan of Service",


])]
df = remove_dollar_values_and_fill_nan(df, 'max_time_to_construct')
# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
 
    #itemized_df = itemized_df.drop_duplicates(subset=['q_id','type_of_upgrade','upgrade']) 
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_itemized.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    existing_totals_columns = [col for col in totals_columns if col in df.columns]
    totals_df = df[df['item'] == 'no'].drop(columns=existing_totals_columns, errors='ignore')
    # Define the cost columns.
    cost_cols = ['estimated_cost_x_1000', 'escalated_cost_x_1000']

    # Build an aggregation dictionary:
    # For columns not in grouping or cost_cols, we assume they are identical and take the first value.
    agg_dict = {col: 'first' for col in totals_df.columns 
                if col not in ['q_id', 'type_of_upgrade'] + cost_cols}

    # For the cost columns, we want to sum them.
    agg_dict.update({col: 'sum' for col in cost_cols})

    

    # Group by both q_id and type_of_upgrade using the aggregation dictionary.
    totals_df = totals_df.groupby(['q_id', 'type_of_upgrade'], as_index=False).agg(agg_dict)
    totals_df = reorder_columns(totals_df)
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_total.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_1_cluster_14_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_1_cluster_14_total.csv'.")



if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())



  df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 9/03_raw/rawdata_cluster9_style_J_originals.csv', dtype={'estimated_time_to_construct': str})


Itemized rows saved to 'costs_phase_1_cluster_14_itemized.csv'.
Filtered Total rows saved to 'costs_phase_1_cluster_14_total.csv'.
['PTO_IF' 'RNU' 'ADNU' 'Distribution Upgrades' 'LDNU']
[1295 1296 1297 1299 1300 1301 1302 1305 1306 1307 1308 1309 1310 1311
 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1325 1326
 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1338 1339 1341 1344
 1345 1347]
[9]


  df[col] = df[col].fillna(0)


# Addendum

# Column Names

In [2]:
import pandas as pd
import re
import unicodedata

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J_addendums.csv', dtype={'estimated_time_to_construct': str})

df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')


def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []  # Initialize an empty list to hold the cleaned header names.
    for header in headers:  # Iterate over each header in the input.
        if header is None:
            header = ""  # If the header is None, set it to an empty string.
        elif isinstance(header, str):  # Otherwise, if the header is a string:
            #header = header.lower()  # Convert the header to lowercase.
            header = re.sub(r'\s+', ' ', header)  # Replace one or more whitespace characters with a single space.
            #header = re.sub(r'\(.*?\)', '', header)  # Remove any text within parentheses (non-greedy).
            header = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', header)  # Remove any character that is not a letter, number, or whitespace.
            header = header.strip()  # Remove any leading or trailing whitespace.
        cleaned_headers.append(header)  # Append the cleaned header to the list.
    return cleaned_headers  # Return the list of cleaned headers.



df.columns = clean_column_headers(df.columns)

def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]
print("After cleaning:", df.columns.tolist())


 


After cleaning: ['q_id', 'cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection', 'project_q1306tot792_updated_as_of_2222017', 'unnamed_8', 'none_2', 'none_3', 'none_4', 'none_5', 'none_6', 'none_7', 'adn_project_q1306tot792_updated_as_of_2222017', 'adn_none', 'adn_none_2', 'adn_none_3', 'adn_none_4', 'adn_none_5', 'adn_none_6', 'adn_none_7', 'project_q1307tot820_updated_as_of_2222017', 'adn_project_q1307tot820_updated_as_of_2222017', 'project_q1308tot808_updated_as_of_2222017', 'adn_project_q1308tot808_updated_as_of_2222017', 'project_q1309tot805_updated_as_of_2222017', 'adn_project_q1309tot805_updated_as_of_2222017', 'project_q1310tot803_updated_as_of_2222017', 'adn_project_q1310tot803_updated_as_of_2222017', 'project_q1312tot812_updated_as_of_2222017', 'adn_project_q1312tot812_updated_as_of_2222017', 'project_q1313tot811_updated_as_of_2222017', 'adn_project_q1313tot811_updated_as_of_2222017', 'project_q1314tot810_updated_as_of_2222017', 'adn_

In [3]:
import pandas as pd
import re
import unicodedata

# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_itemized_addendums.csv', dtype={'estimated_time_to_construct': str})

df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')


def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []  # Initialize an empty list to hold the cleaned header names.
    for header in headers:  # Iterate over each header in the input.
        if header is None:
            header = ""  # If the header is None, set it to an empty string.
        elif isinstance(header, str):  # Otherwise, if the header is a string:
            #header = header.lower()  # Convert the header to lowercase.
            header = re.sub(r'\s+', ' ', header)  # Replace one or more whitespace characters with a single space.
            #header = re.sub(r'\(.*?\)', '', header)  # Remove any text within parentheses (non-greedy).
            header = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', header)  # Remove any character that is not a letter, number, or whitespace.
            header = header.strip()  # Remove any leading or trailing whitespace.
        cleaned_headers.append(header)  # Append the cleaned header to the list.
    return cleaned_headers  # Return the list of cleaned headers.



df.columns = clean_column_headers(df.columns)

def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]
print("After cleaning:", df.columns.tolist())


After cleaning: ['q_id', 'cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection', 'type_of_upgrade', 'upgrade', 'estimated_cost_x_1000', 'escalated_cost_x_1000', 'estimated_time_to_construct', 'item', 'max_time_to_construct']


# Itmeized and Total

In [10]:
import pandas as pd
import re
import unicodedata
import numpy as np
# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J_addendums.csv', dtype={'estimated_time_to_construct': str})

#df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')




def clean_column_headers(headers):
    """Cleans column headers by normalizing and removing unwanted characters."""
    cleaned_headers = []  # Initialize an empty list to hold the cleaned header names.
    for header in headers:  # Iterate over each header in the input.
        if header is None:
            header = ""  # If the header is None, set it to an empty string.
        elif isinstance(header, str):  # Otherwise, if the header is a string:
            #header = header.lower()  # Convert the header to lowercase.
            header = re.sub(r'\s+', ' ', header)  # Replace one or more whitespace characters with a single space.
            #header = re.sub(r'\(.*?\)', '', header)  # Remove any text within parentheses (non-greedy).
            header = re.sub(r'[^a-zA-Z0-9\s()/+=_]', '', header)  # Remove any character that is not a letter, number, or whitespace.
            header = header.strip()  # Remove any leading or trailing whitespace.
        cleaned_headers.append(header)  # Append the cleaned header to the list.
    return cleaned_headers  # Return the list of cleaned headers.

df.columns = clean_column_headers(df.columns)

 



#STEP 2: NAMING CONVENTION
def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]




def move_dollar_values(df, source_column, target_column):
    """
    For each row in the DataFrame, if the value in `source_column` starts with a '$',
    move that value to `target_column` and clear the value in the source column.

    Parameters:
      df (pd.DataFrame): The input DataFrame.
      source_column (str): The column to check for values starting with '$'.
      target_column (str): The column to move the values into.

    Returns:
      pd.DataFrame: The modified DataFrame.
    """
    # Ensure target_column exists; if not, create it filled with empty strings.
    if target_column not in df.columns:
        df[target_column] = ""
    
    # Create a boolean mask for rows where the source column starts with '$'
    mask = df[source_column].astype(str).str.startswith('$', na=False)
    
    # Move the values: assign the source values to the target column where the mask is True.
    df.loc[mask, target_column] = df.loc[mask, source_column]
    
    # Clear the source column values for those rows (set to empty string)
    df.loc[mask, source_column] = ""
    
    return df

# Move values from 'unnamed_8' to a new column 'moved_value'
#df = move_dollar_values(df, 'none_5', 'total_estimated_costs_x_1000_escalated_constant_dollars_od_year')


#df = move_dollar_values(df, 'none_3','total_estimated_costs_x_1000_constant_dollar_2020')

def remove_dollar_values_and_fill_nan(df, column):
    """
    For each row in the DataFrame, if the value in the specified column starts with '$',
    set that value to NaN. Also, replace any empty strings in that column with NaN.

    Parameters:
      df (pd.DataFrame): The input DataFrame.
      column (str): The column to check and clean.

    Returns:
      pd.DataFrame: The modified DataFrame.
    """
    # Ensure the column is treated as string
    df[column] = df[column].astype(str)
    
    # Set values starting with '$' to NaN
    mask = df[column].str.startswith('$', na=False)
    df.loc[mask, column] = np.nan
    
    # Replace any remaining empty strings with NaN
    df[column] = df[column].replace("", np.nan)
    
    return df

df = remove_dollar_values_and_fill_nan(df, 'unnamed_8')



def insert_phrase_if_missing(df, target_columns, phrase="Area Delivery Network Upgrades"):
    """
    For each q_id group in df, for each column in target_columns:
      - Check the first row (by current order) that is non-empty (not NaN, not blank, not 'na').
      - If that cell does NOT contain the given phrase (case-insensitive),
        insert a new row above it (within that q_id group) that sets that column to phrase,
        leaving other columns as empty strings.
    
    Returns a new DataFrame with the added rows.
    """
    # Work on a copy and reset index; create a temporary "order" column for precise control.
    df = df.copy().reset_index(drop=True)
    df["order"] = df.index.astype(float)
    
    new_rows = []
    
    # Process each q_id group
    for qid, group in df.groupby("q_id", sort=False):
        # For each target column, find the first non-empty cell.
        for col in target_columns:
            # We'll iterate over the group's index in order.
            first_idx = None
            first_val = None
            for idx in group.index:
                val = group.loc[idx, col]
                # Convert to string and strip spaces.
                # Treat NaN, empty strings, or "na"/"NA" as empty.
                val_str = "" if pd.isna(val) else str(val).strip()
                if val_str == "" or val_str.lower() == "na":
                    continue
                first_idx = idx
                first_val = val_str
                break
            # If we found a non-empty value and it doesn't contain our phrase, insert a row.
            if first_idx is not None:
                if phrase.lower() not in first_val.lower():
                    # Create a new row with the same q_id; leave all columns blank except col.
                    new_row = {c: "" for c in df.columns if c != "order"}
                    new_row["q_id"] = qid
                    new_row[col] = phrase
                    # Set its order to be a bit before the first non-empty row.
                    new_order = group.loc[first_idx, "order"] - 0.1
                    new_row["order"] = new_order
                    new_rows.append(new_row)
    
    # If any rows were added, append them and sort by q_id then order.
    if new_rows:
        new_rows_df = pd.DataFrame(new_rows)
        df = pd.concat([df, new_rows_df], ignore_index=True)
        df = df.sort_values(by=["q_id", "order"]).reset_index(drop=True)
    
    # Drop the temporary order column.
    df = df.drop(columns=["order"])
    return df


df = insert_phrase_if_missing(df, ['adn_area_delivery_network_upgrades'], phrase="Area Delivery Network Upgrades")  



 


def merge_columns(df):

    merge_columns_dict = {


  

        "one_time_costs_b": [
            "none_2",
             
        ],


 
 

 
    
        "type_of_upgrade": [
            
             'cost_category', 
           
            "cost_category_notes_1a_to_1f",
           
            'other_potential_cost', 
            'other_potential_network_cost',
            'area_delivery_network_upgrades', 
            'otherpotential_other_potential_network_upgrades_note_1h',
            'adn_area_delivery_network_upgrades',
         'project_q1306tot792_updated_as_of_2222017',
         'project_q1307tot820_updated_as_of_2222017', 'project_q1309tot805_updated_as_of_2222017', 
         'project_q1310tot803_updated_as_of_2222017', 'project_q1312tot812_updated_as_of_2222017', 
         'project_q1313tot811_updated_as_of_2222017', 'project_q1314tot810_updated_as_of_2222017', 'project_q1315tot829_updated_as_of_2222017', 
          'element',
          'other_potential_network_upgrades_note_1112', 'area_delivery_network_upgrades'
          'adn_project_q1306tot792_updated_as_of_2222017',   'adn_project_q1307tot820_updated_as_of_2222017', 'project_q1308tot808_updated_as_of_2222017', 
          'adn_project_q1308tot808_updated_as_of_2222017', 'adn_project_q1309tot805_updated_as_of_2222017', 'adn_project_q1310tot803_updated_as_of_2222017', 'adn_project_q1312tot812_updated_as_of_2222017', 
          'adn_project_q1313tot811_updated_as_of_2222017',
          'adn_project_q1314tot810_updated_as_of_2222017', 'adn_project_q1315tot829_updated_as_of_2222017', 'otherpotential_other_potential_network_upgrades_note_1112',
          'project_q1307tot820_updated_as_of_2222017', 'adn_project_q1307tot820_updated_as_of_2222017', 'project_q1308tot808_updated_as_of_2222017', 'adn_project_q1308tot808_updated_as_of_2222017',
            'project_q1309tot805_updated_as_of_2222017', 'adn_project_q1309tot805_updated_as_of_2222017', 'project_q1310tot803_updated_as_of_2222017', 'adn_project_q1310tot803_updated_as_of_2222017', 
            'project_q1312tot812_updated_as_of_2222017', 'adn_project_q1312tot812_updated_as_of_2222017', 'project_q1313tot811_updated_as_of_2222017', 'adn_project_q1313tot811_updated_as_of_2222017', 
            'project_q1314tot810_updated_as_of_2222017', 
          'adn_project_q1314tot810_updated_as_of_2222017', 'project_q1315tot829_updated_as_of_2222017', 'adn_project_q1315tot829_updated_as_of_2222017', 'element', 
          'otherpotential_other_potential_network_upgrades_note_1112',
          'adn_project_q1306tot792_updated_as_of_2222017',
          
          'unnamed_8'
 
            
           
         
            
 
             
        ],
        "escalated_cost_x_1000": [
            "total_escalated_costs_wo_itcc",
            "total_estimated_costs_x_1000_escalated_constant_dollars_od_year",
            "none_4",
            'total_escalated_costs_in_1000s',
            'escalated_cost_in_1000s_note_8',
            'total_estimated_costs_x',
             'total_escalated_costs_in_1000s', 
             'total_escalated_costs_to_od_year_in_1000s',
              'total_estimated_costs_x_1000_escalated_constant_dollars_od_year',
              'otherpotential_none_2',
              'adn_unnamed_5',
              'adn_none_4',
              'total_estimated_costs_x_1000_escalated_constant_dollars_2022', 
             
              'total_estimated_costs_x_1000_escalated_constant_dollars_od_year',
              'adn_none_4', 
              
            

        ],
        "estimated_cost_x_1000": [
            "none_3",
            'total_costs_wo_itcc_cab',
            'total_estimated_costs_x_1000_constant_dollar_2017',
            'otherpotential_none',
            'adn_unnamed_2',
            'adn_none_3',
            
            'total_estimated_costs_x_1000_constant_dollar_2018',
            'interconnection_facilities_costs_x_1000_constant_dollar_2016', 'reliability_network_upgrades_costs_x_1000_constant_dollar_2016', 
            'delivery_network_upgrades_costs_x_1000_constant_dollar_2016', 'distribution_upgrades_costs_x_1000_constant_dollar_2016', 
            'total_estimated_costs_x_1000_constant_dollar_2016', 
            'total_estimated_costs_x_1000_escalated_constant_dollars_2021', 
            'total_estimated_costs_x_1000_constant_dollar_2017', 
            'adn_none_3', 
           
 
        ],
        "estimated_time_to_construct": [
            'estimated_time_for_licensing_permitting_construction_months',
            'estimated_time_to_construct_months',
            'estimated_time_to_construct_months2',
            'estimated_time_to_construct_months',
            'estimated_time_to_construct_months',
            'estimated_time_to_construct_months_note_1g',
            'estimated_time_to_construct_months_note_1g',
            'estimated_time_to_construct_months',
            "none_5",
            'estimated_time_to_construct_months',
            'upgrade_duration_months',
            'estimated_time_to_construct_months_note_12', 
            'adnu_duration_months',
            'estimated_time_to',
      
            'none_12',
            'estimated_duration_months',
            'otherpotential_none_3',
            'adn_unnamed_8', 
            'adn_none_5',
            'adn_estimated_time_to_construct_months', 
            'estimated_time_to_construct_months_note_345_6', 
            'estimated_time_to_construct_months_note_345_9_10',
            'estimated_time_to_construct_months_note_1g',
           
             'adn_none_5', 
             
            
        ],
        "description": ["description"],
        "capacity": [
            "capacity",
            "project size",
            "project mw",
            "mw at poi"
        ],
 
        "max_time_to_construct": [
            'maximum_escalation_duration_months',
            'od_dollar_escalation_duration_months',
            'od_dollar_escalation_duration_months',
            'od_dollar_escalation_duration_months_note1g',
            'od_dollar_escalation_duration_months_note1g',
            "none_6",
            'maximum_escalation_duration_months',
            'od_dollar_escalation',
            'maximum_project_duration_months',
            'otherpotential_none_4',
            'adn_unnamed_11', 
            'adn_none_6',
            
            'adn_od_dollar_escalation_duration_months',
            'od_dollar_escalation_duration_months_note_345_6',
            'od_dollar_escalation_duration_months_note_345_9_10', 
             'od_dollar_escalation_duration_months_note_1g',
             
             'adn_none_6', 
             
            
        ]



    }

        # Identify unnamed columns
    unnamed_columns = [col for col in df.columns if pd.isna(col) or col.strip() == "" or col.startswith("Unnamed")]
    if unnamed_columns:
        merge_columns_dict["type_of_upgrade"].extend(unnamed_columns)

    for new_col, old_cols in merge_columns_dict.items():
        existing_cols = [col for col in old_cols if col in df.columns]
        if existing_cols:
            df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
            cols_to_drop = [col for col in existing_cols if col != new_col]
            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)

    return df

df = merge_columns(df)
df.drop([   'adn_none', 'adn_none_2', 'adn_none_7', 
         'one_time_costs_b','none_8', 'none_9', 'none_7' ,'none_10',  'adn_total_estimated_costs_x_1000_constant_dollar_2017', 'adn_total_estimated_costs_x_1000_escalated_constant_dollars_od_year',
         'adn_od_dollar_escalation_duration_months_note_345_9_10','adn_estimated_time_to_construct_months_note_345_9_10', 
], axis=1, inplace=True)








 


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
 
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df        


df= reorder_columns(df)



df = df[~df.apply(lambda row: row.astype(str).isin(["Total Escalated Costs w/o ITCC", "Constant 2017 Dollar in $1000s (Estimate)", "Eastern","Note (h)"]).any(), axis=1)]
df = df[~df.apply(lambda row: any(str(cell).startswith("Project #:") for cell in row), axis=1)]


df = df[df['type_of_upgrade'].notna() & (df['type_of_upgrade'].astype(str).str.strip() != "")]

 
def process_upgrade_columns(df):
    """
    Given a DataFrame df with a column "type_of_upgrade" that contains both group headers and upgrade data,
    this function:
      1. Inserts a new column "upgrade" as a duplicate of "type_of_upgrade" (placed immediately after it).
      2. Renames rows in "type_of_upgrade" that contain specific phrases as follows:
           - If it contains "Interconnection Facilities", rename to "PTO_IF" (or "PTO_IF Total" if "Total" is present)
           - If it contains "Reliability Network Upgrade", rename to "RNU" (or "RNU Total" if "Total" is present)
           - If it contains "Local Delivery Network Upgrades", rename to "LDNU" (or "LDNU Total" if "Total" is present)
           - If it contains "Area Deliverability Network Upgrades", rename to "ADNU" (or "ADNU Total" if "Total" is present)
           - If it contains "Distribution Upgrades", leave it as is.
      3. Creates a temporary column that only holds the header values (from rows that were detected as header rows) and forward-fills it downward.
         The forward fill stops (i.e. does not fill into a row) if that row’s original "type_of_upgrade" contains any of the "total" indicators.
      4. Replaces "type_of_upgrade" with the forward-filled header values.
      5. Drops the rows that originally were header rows.
      6. This deletes any rows which are either Total or Subtotal or Total cost assigned, the reason is some proejcts have multiple pdfs thus we rather calculate the total in the end.
      
    Returns the updated DataFrame.
    """
    import pandas as pd
    
    # 1. Create a new column "upgrade" immediately after "type_of_upgrade"
    loc = df.columns.get_loc("type_of_upgrade")
    df.insert(loc+1, "upgrade", df["type_of_upgrade"])
    
    # 2. Define a helper to rename header rows.
    def rename_header(val):
        # If the cell contains any of these phrases, rename accordingly.
        # We'll check using the substring test (case-sensitive) per your request.
        if "Interconnection Facilities" in val:
            return "PTO_IF" + (" Total" if "Total" in val else "")
        elif "Reliability Network Upgrade" in val:
            return "RNU" + (" Total" if "Total" in val else "")
        elif "Local Delivery Network Upgrades" in val:
            return "LDNU" + (" Total" if "Total" in val else "")
        elif "Area Deliverability Network Upgrades" in val:
            return "ADNU" + (" Total" if "Total" in val else "")
        elif "Distribution Upgrades" in val:
            return val  # leave unchanged
        elif "Conditional Assigned Network Upgrades" in val:
            return  ("Total " if "Total" in val else "") + "CANU" 
        elif "Non-Allocated IRNU" in val:
            return  ("Total " if "Total" in val else "") + "Non-Allocated IRNU"
        elif "Area Delivery Network Upgrades" in val:
            return "ADNU" + (" Total" if "Total" in val else "")
        else:
            return val
    
    # 3. Identify header rows. We consider a row to be a header row if its "type_of_upgrade" cell 
    # contains any of the target phrases.
    target_phrases = [
        "Interconnection Facilities",
        "Reliability Network Upgrade",
        "Local Delivery Network Upgrades",
        "Area Deliverability Network Upgrades",
        "Distribution Upgrades",
        "Conditional Assigned Network Upgrades",
        "Non-Allocated IRNU",
        "Area Delivery Network Upgrades",

    ]
    # Create a boolean mask for header rows.
    header_mask = df["type_of_upgrade"].apply(lambda x: any(phrase in x for phrase in target_phrases))
    
    # Apply renaming to the header rows.
    df.loc[header_mask, "type_of_upgrade"] = df.loc[header_mask, "type_of_upgrade"].apply(rename_header)
    
    # 4. Create a temporary column 'header_temp' that holds only the header rows, then forward fill it.
    df["header_temp"] = df["type_of_upgrade"].where(header_mask)
    df["header_temp"] = df["header_temp"].ffill()
    
    # We want to stop the forward fill if we encounter a row that indicates totals.
    # Define a simple function that returns True if a cell contains "Total" or "Subtotal" or "Total cost assigned".
    def is_total_indicator(val):
        return ("Total" in val) or ("Subtotal" in val) or ("Total cost assigned" in val)
    
    # For rows that themselves are total indicators in the "upgrade" column, do not forward-fill (set header_temp to NaN)
    df.loc[df["upgrade"].apply(lambda x: is_total_indicator(x)), "header_temp"] = None
    
    # Now, replace the "type_of_upgrade" column with the forward-filled header
    df["type_of_upgrade"] = df["header_temp"]
    df.drop("header_temp", axis=1, inplace=True)
    
    # 5. Finally, drop the rows that were header rows (i.e. where header_mask is True)
    df = df[~header_mask].reset_index(drop=True)
    
    # Also, drop any rows that have an empty "type_of_upgrade"
    df = df[df["type_of_upgrade"].notna() & (df["type_of_upgrade"].str.strip() != "")]
    
    return df

 

df = process_upgrade_columns(df)


 




 
mappings = {
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "PTO's Interconnection Facilities": "PTO_IF",
 "RNUs, Estimated Costs, and Estimated Time to Construct Summary": "RNU",
 "Non-Allocated IRNU": "RNU",
 "Total Non-Allocated IRNU": "Total RNU",
 }

if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )




df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)
   


 
   
 
 


 

 

    
# Step 7: Remove $ signs and convert to numeric
import re

def clean_currency(value):
    """
    Cleans a string by explicitly removing $, *, (Note 2), and similar patterns,
    then converts it to a numeric value.
    """
    if isinstance(value, str):
        # Explicitly remove $, *, and any "(Note ...)"
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
        value = value.replace(',', '').strip()  # Remove commas and extra spaces
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA  # Return NaN for invalid entries


# Clean the specific columns
for col in ['estimated_cost_x_1000', 'escalated_cost_x_1000']:
    if col in df.columns:
        df[col] = df[col].apply(clean_currency)


##################################################################################################################
df = df.drop_duplicates(subset=['q_id','type_of_upgrade','upgrade', 'estimated_cost_x_1000', 'escalated_cost_x_1000'])   
df = df[~df['upgrade'].astype(str).isin([
    "CANUIRNU", 
    "CANUGRNU", 
    "SCD",
    "CANU-LDNU",
    "IRNUs",
    "GRNUs",
    "Maximum Cost Responsibility (Network Upgrades)",
    "Network Upgrade",


])]

df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/03_raw/rawdata_cluster9_style_J_add.csv', index=False)

# Create Total rows
new_rows = []
columns_to_sum = ['estimated_cost_x_1000', 'escalated_cost_x_1000']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    unique_upgrades = group['type_of_upgrade'].dropna().unique()
    for upgrade in unique_upgrades:
        if pd.isna(upgrade):
            continue
        
        rows = group[group['type_of_upgrade'] == upgrade]

                # Check if a Total row already exists for this (q_id, upgrade)
        total_exists = group[
            (group['type_of_upgrade'] == upgrade) & (group['item'] == 'no')
        ].shape[0] > 0
        
        if total_exists:
             
            continue


 
        
        
        if not total_exists:
            # If only one row exists, duplicate it as the total row
            if len(rows) == 1:

                total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings

                # Populate the necessary fields
                total_row['q_id'] = q_id
                total_row['type_of_upgrade'] = f"Total {upgrade}"
                total_row['item'] = 'no'

                # Populate specified columns from the existing row
                first_row = rows.iloc[0]
                for col in columns_to_populate:
                    if col in df.columns:
                        total_row[col] = first_row[col]

                # Sum the numeric columns (single row, so it remains the same)
                for col in columns_to_sum:
                    if col in rows.columns:
                        total_row[col] = rows[col].sum()
                    else:
                        total_row[col] = 0  # Default to 0 if column is missing

                new_rows.append(total_row)



 
            
            # If multiple rows exist, sum numeric columns and create a total row
            elif len(rows) > 1:
                total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings

                # Populate the necessary fields
                total_row['q_id'] = q_id
                total_row['type_of_upgrade'] = f"Total {upgrade}"
                total_row['item'] = 'no'

                # Populate the specified columns from the first row in the group
                first_row = rows.iloc[0]
                for col in columns_to_populate:
                    if col in df.columns:
                        total_row[col] = first_row[col]

                # Sum the numeric columns
                for col in columns_to_sum:
                    if col in rows.columns:
                        total_row[col] = rows[col].sum()
                    else:
                        total_row[col] = 0  # Default to 0 if column is missing

                new_rows.append(total_row)
 



      

if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    for col in df.columns:
        if col not in total_rows_df.columns:
            total_rows_df[col] = None
    total_rows_df = total_rows_df[df.columns]
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)

# Update 'item' column based on Total in type_of_upgrade or cost_allocation_factor
df['item'] = df.apply(
    lambda row: 'no' if (
        'Total' in str(row.get('type_of_upgrade', '')) or 
        'Total' in str(row.get('cost_allocation_factor', ''))
    ) else 'yes',
    axis=1
)


# Step 8: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    item_index = cols.index('item')
    type_index = cols.index('type_of_upgrade')
    if item_index < type_index:
        cols.insert(type_index + 1, cols.pop(item_index))
    else:
        cols.insert(type_index + 1, cols.pop(item_index))
    df = df[cols]

 



def clean_estimated_time(value):
    if isinstance(value, str):
        value = re.sub(r'(\d+(?:-\w+)*)\s+\w+.*$', r'\1', value, flags=re.IGNORECASE).strip()
    return value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

 
mappings = {
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "PTO's Interconnection Facilities": "PTO_IF",
 "RNUs, Estimated Costs, and Estimated Time to Construct Summary": "RNU",
'Total PTO_IF': 'PTO_IF',
'PTO_IF Total': 'PTO_IF',
 'Total RNU': 'RNU',
 'RNU Total': 'RNU',
 'Total LDNU': 'LDNU',
 'Total OPNU' : 'OPNU',
 'Total CANU': 'CANU',
 'Total LOPNU': 'LOPNU',
 'Total ADNU': 'ADNU',
 'LDNU Total': 'LDNU',
 'Total Distribution Upgrades': 'Distribution Upgrades',
 'Distribution Upgrades Total': 'Distribution Upgrades',
 'Total Potential Distribution Upgrades': 'Potential Distribution Upgrades',
}

if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )



 

if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

numeric_columns = [
    
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
     
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
     
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df        


df= reorder_columns(df)
df = df[~df['upgrade'].astype(str).isin([
    "CANUIRNU", 
    "CANUGRNU", 
    "SCD",
    "CANU-LDNU",
    "IRNUs",
    "GRNUs",
    "Maximum Cost Responsibility (Network Upgrades)",
    "Network Upgrade",
    "Plan of Service",


])]
df = remove_dollar_values_and_fill_nan(df, 'max_time_to_construct')
# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
 
    #itemized_df = itemized_df.drop_duplicates(subset=['q_id','type_of_upgrade','upgrade']) 
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_itemized_addendums.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    existing_totals_columns = [col for col in totals_columns if col in df.columns]
    totals_df = df[df['item'] == 'no'].drop(columns=existing_totals_columns, errors='ignore')
    # Define the cost columns.
    cost_cols = ['estimated_cost_x_1000', 'escalated_cost_x_1000']

    # Build an aggregation dictionary:
    # For columns not in grouping or cost_cols, we assume they are identical and take the first value.
    agg_dict = {col: 'first' for col in totals_df.columns 
                if col not in ['q_id', 'type_of_upgrade'] + cost_cols}

    # For the cost columns, we want to sum them.
    agg_dict.update({col: 'sum' for col in cost_cols})

    

    # Group by both q_id and type_of_upgrade using the aggregation dictionary.
    totals_df = totals_df.groupby(['q_id', 'type_of_upgrade'], as_index=False).agg(agg_dict)
    totals_df = reorder_columns(totals_df)
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_total_addendums.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_1_cluster_14_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_1_cluster_14_total.csv'.")



if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())



Itemized rows saved to 'costs_phase_1_cluster_14_itemized.csv'.
Filtered Total rows saved to 'costs_phase_1_cluster_14_total.csv'.
['PTO_IF' 'RNU' 'ADNU' 'LDNU' 'Distribution Upgrades']
[1306 1307 1308 1309 1310 1312 1313 1314 1315 1344]
[9]


  df[col] = df[col].fillna(0)


# Merge Original and addendum

In [11]:
import pandas as pd
import numpy as np

def load_data(file_path, char_columns):
    """
    Load a CSV file and ensure specific columns are treated as character, others as numeric.
    """
 # Get columns available in the dataset
    available_columns = pd.read_csv(file_path, nrows=0).columns
    
    # Restrict to char_columns that are present in the dataset
    char_columns_in_dataset = [col for col in char_columns if col in available_columns]
    
    # Load the dataset, treating char_columns_in_dataset as strings
    df = pd.read_csv(
        file_path,
        dtype={col: str for col in char_columns_in_dataset},
        na_values=[],  # Disable automatic NaN interpretation
        keep_default_na=False  # Prevent treating "None" as NaN
    )

    
    
    
    # Convert all other columns to numeric
    #for col in df.columns:
    #    if col not in char_columns:
    #        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
    
    return df

def save_data(df, file_path, char_columns):
    """
    Save a dataframe to a CSV file, ensuring specific columns are treated as character.
    """
    for col in char_columns:
        if col in df.columns:
            df[col] = df[col].astype(str)
    df.to_csv(file_path, index=False)

def merge_with_addendums(itemized, itemized_addendums, total, total_addendums):
    # Add an 'original' column to the datasets
    itemized['original'] = "yes"
    total['original'] = "yes"
    
    # Preserve the original row order
    itemized['row_order'] = pd.to_numeric(itemized.index, errors="coerce")
    total['row_order'] = pd.to_numeric(total.index, errors="coerce")
    
    # Ensure q_id is numeric for comparison
    itemized['q_id'] = pd.to_numeric(itemized['q_id'], errors="coerce")
    itemized_addendums['q_id'] = pd.to_numeric(itemized_addendums['q_id'], errors="coerce")
    total['q_id'] = pd.to_numeric(total['q_id'], errors="coerce")
    total_addendums['q_id'] = pd.to_numeric(total_addendums['q_id'], errors="coerce")
    
    # Columns for conditional replacement
    conditional_columns = ["req_deliverability", "latitude", "longitude", "capacity", "point_of_interconnection"]
    
    # --- Process itemized data (unchanged) ---
    updated_itemized_rows = []
    for q_id in itemized_addendums['q_id'].unique():
        for upgrade_type in itemized_addendums['type_of_upgrade'].unique():
            addendum_rows = itemized_addendums[
                (itemized_addendums['q_id'] == q_id) &
                (itemized_addendums['type_of_upgrade'] == upgrade_type)
            ]
            if not addendum_rows.empty:
                mask = (itemized['q_id'] == q_id) & (itemized['type_of_upgrade'] == upgrade_type)
                original_rows = itemized[mask]
                print(f"Processing itemized: q_id={q_id}, type_of_upgrade={upgrade_type}")
                print(f"Length of addendum_rows: {len(addendum_rows)}")
                print(f"Length of original_rows: {len(original_rows)}")
                # For specified columns, replace if addendum values are non-empty
                for col in conditional_columns:
                    if col in addendum_rows.columns and col in original_rows.columns:
                        addendum_rows[col] = addendum_rows[col].replace("", pd.NA)
                        addendum_rows[col] = addendum_rows[col].combine_first(original_rows[col].reset_index(drop=True))
                        addendum_rows[col] = addendum_rows[col].fillna("")
                # Align lengths
                original_rows = original_rows.reset_index(drop=True)
                addendum_rows = addendum_rows.reset_index(drop=True)
                if len(addendum_rows) > len(original_rows):
                    extra_rows = pd.DataFrame({col: pd.NA for col in original_rows.columns},
                                              index=range(len(addendum_rows) - len(original_rows)))
                    original_rows = pd.concat([original_rows, extra_rows], ignore_index=True)
                elif len(addendum_rows) < len(original_rows):
                    original_rows = original_rows.iloc[:len(addendum_rows)].reset_index(drop=True)
                itemized.loc[mask, 'original'] = "no"
                updated_itemized_rows.append(
                    addendum_rows.assign(original="no", row_order=original_rows['row_order'].values[:len(addendum_rows)])
                )
                itemized = itemized[~mask]
    if updated_itemized_rows:
        updated_itemized = pd.concat([itemized] + updated_itemized_rows, ignore_index=True)
    else:
        updated_itemized = itemized.copy()
    updated_itemized["row_order"] = pd.to_numeric(updated_itemized["row_order"], errors="coerce").fillna(-1).astype(int)
    updated_itemized = updated_itemized.sort_values(by="row_order").drop(columns=["row_order"]).reset_index(drop=True)
    
    # --- Process total data ---
    updated_total_rows = []
    for q_id in total_addendums['q_id'].unique():
        for upgrade_type in total_addendums['type_of_upgrade'].unique():
            addendum_row = total_addendums[
                (total_addendums['q_id'] == q_id) &
                (total_addendums['type_of_upgrade'] == upgrade_type)
            ]
            if not addendum_row.empty:
                mask = (total['q_id'] == q_id) & (total['type_of_upgrade'] == upgrade_type)
                original_row = total[mask]
                print(f"Processing total: q_id={q_id}, type_of_upgrade={upgrade_type}")
                # If no matching original row exists, create a default row_order column
                if original_row.empty:
                    original_row = pd.DataFrame({'row_order': [pd.NA] * len(addendum_row)}, index=addendum_row.index)
                else:
                    original_row = original_row.reset_index(drop=True)
                addendum_row = addendum_row.reset_index(drop=True)
                if len(addendum_row) > len(original_row):
                    extra_rows = pd.DataFrame({col: pd.NA for col in original_row.columns},
                                              index=range(len(addendum_row) - len(original_row)))
                    original_row = pd.concat([original_row, extra_rows], ignore_index=True)
                elif len(addendum_row) < len(original_row):
                    original_row = original_row.iloc[:len(addendum_row)].reset_index(drop=True)
                for col in conditional_columns:
                    if col in addendum_row.columns and col in original_row.columns:
                        addendum_row[col] = addendum_row[col].replace("", pd.NA)
                        addendum_row[col] = addendum_row[col].combine_first(original_row[col].reset_index(drop=True))
                        addendum_row[col] = addendum_row[col].fillna("")
                total.loc[mask, 'original'] = "no"
                updated_total_rows.append(
                    addendum_row.assign(original="no", row_order=original_row['row_order'].values[:len(addendum_row)])
                )
                total = total[~mask]
    if updated_total_rows:
        updated_total = pd.concat([total] + updated_total_rows, ignore_index=True)
    else:
        updated_total = total.copy()
    updated_total["row_order"] = pd.to_numeric(updated_total["row_order"], errors="coerce").fillna(-1).astype(int)
    updated_total = updated_total.sort_values(by="row_order").drop(columns=["row_order"]).reset_index(drop=True)
    
    # Fill missing columns with zeros in the updated datasets
    for col in set(itemized.columns) - set(updated_itemized.columns):
        updated_itemized[col] = 0
    for col in set(total.columns) - set(updated_total.columns):
        updated_total[col] = 0

    # Move the 'original' column to the last position
    updated_itemized = updated_itemized[[col for col in updated_itemized.columns if col != 'original'] + ['original']]
    updated_total = updated_total[[col for col in updated_total.columns if col != 'original'] + ['original']]
    
    if "row_order" in updated_itemized.columns:
        updated_itemized = updated_itemized.drop(columns=["row_order"]).reset_index(drop=True)
    if "row_order" in updated_total.columns:
        updated_total = updated_total.drop(columns=["row_order"]).reset_index(drop=True)
    
    return updated_itemized, updated_total


# Define the character columns
char_columns = [
    "req_deliverability", "point_of_interconnection", "type_of_upgrade",
    "upgrade", "description", "estimated_time_to_construct", "original", "item"
]




itemized = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_itemized.csv", char_columns)
itemized_addendums = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_itemized_addendums.csv", char_columns)
total = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_total.csv", char_columns)
total_addendums = load_data("/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/02_intermediate/costs_phase_1_cluster_9_style_J_total_addendums.csv", char_columns)


updated_itemized, updated_total = merge_with_addendums(itemized, itemized_addendums, total, total_addendums)

# Drop the specified columns from the updated datasets
columns_to_drop = [ "upgrade_classification","estimated", "caiso_queue", "project_type", "dependent_system_upgrade"]

# For the itemized dataset
updated_itemized = updated_itemized.drop(columns=[col for col in columns_to_drop if col in updated_itemized.columns], errors='ignore')

# For the total dataset
updated_total = updated_total.drop(columns=[col for col in columns_to_drop if col in updated_total.columns], errors='ignore')



# List of columns to process with ffill and bfill
columns_to_fill = ["point_of_interconnection", "latitude", "longitude", "req_deliverability", "capacity"]

# Replace empty strings with NaN for the specified columns
for col in columns_to_fill:
    updated_itemized[col] = updated_itemized[col].replace('', np.nan)
    updated_total[col] = updated_total[col].replace('', np.nan)

# Sort by q_id while maintaining other column order (stable sorting)
updated_itemized = updated_itemized.sort_values(by=["q_id"], kind="stable").reset_index(drop=True)
updated_total = updated_total.sort_values(by=["q_id"], kind="stable").reset_index(drop=True)

# Apply forward-fill and backward-fill for the specified columns within each q_id group
for col in columns_to_fill:
    updated_itemized[col] = (
        updated_itemized.groupby("q_id")[col]
        .apply(lambda group: group.ffill().bfill())
        .reset_index(drop=True)
    )
    updated_total[col] = (
        updated_total.groupby("q_id")[col]
        .apply(lambda group: group.ffill().bfill())
        .reset_index(drop=True)
    )

# Replace NaN back with empty strings for consistency
for col in columns_to_fill:
    updated_itemized[col] = updated_itemized[col].replace(np.nan, '')
    updated_total[col] = updated_total[col].replace(np.nan, '')

 

 







# Save the updated datasets
save_data(updated_itemized, "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/01_clean/costs_phase_1_cluster_9_style_J_itemized_updated.csv", char_columns)
save_data(updated_total, "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 9/01_clean/costs_phase_1_cluster_9_style_J_total_updated.csv", char_columns)



 


Processing itemized: q_id=1306, type_of_upgrade=PTO_IF
Length of addendum_rows: 13
Length of original_rows: 14
Processing itemized: q_id=1306, type_of_upgrade=RNU
Length of addendum_rows: 4
Length of original_rows: 4
Processing itemized: q_id=1306, type_of_upgrade=ADNU
Length of addendum_rows: 3
Length of original_rows: 3
Processing itemized: q_id=1307, type_of_upgrade=PTO_IF
Length of addendum_rows: 13
Length of original_rows: 14
Processing itemized: q_id=1307, type_of_upgrade=RNU
Length of addendum_rows: 5
Length of original_rows: 5
Processing itemized: q_id=1307, type_of_upgrade=ADNU
Length of addendum_rows: 4
Length of original_rows: 3
Processing itemized: q_id=1308, type_of_upgrade=PTO_IF
Length of addendum_rows: 13
Length of original_rows: 0
Processing itemized: q_id=1308, type_of_upgrade=RNU
Length of addendum_rows: 5
Length of original_rows: 0
Processing itemized: q_id=1308, type_of_upgrade=ADNU
Length of addendum_rows: 5
Length of original_rows: 1
Processing itemized: q_id=130

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_rows[col] = addendum_rows[col].replace("", pd.NA)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_rows[col] = addendum_rows[col].combine_first(original_rows[col].reset_index(drop=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_rows[col] = addendum_rows[col].fillna(""