In [10]:
import os
import pdfplumber
import pandas as pd
import re
import PyPDF2
import traceback
import inflect

# Define paths and project range
BASE_DIRECTORY = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data"
OUTPUT_CSV_PATH_ORIGINAL = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/03_raw/rawdata_cluster1_4_style_R_originals.csv"
OUTPUT_CSV_PATH_ADDENDUM = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/03_raw/rawdata_cluster1_4_style_R_addendums.csv"
LOG_FILE_PATH = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/03_raw/scraping_cluster1_4_style_R_log.txt"
PROJECT_RANGE = range(488, 859)  # Example range for q_ids in Clusters 4

# Initialize DataFrames
core_originals = pd.DataFrame()
core_addendums = pd.DataFrame()

# Initialize tracking variables
scraped_projects = set()
skipped_projects = set()
missing_projects = set()
scraped_pdfs = []
skipped_pdfs = []
addendum_pdfs = []
original_pdfs = []
total_pdfs_accessed = 0
total_pdfs_scraped = 0
total_pdfs_skipped = 0

def clean_column_headers(headers):
    """Cleans column headers by normalizing, removing unwanted characters, and singularizing words."""
    p = inflect.engine()
    cleaned_headers = []
    for header in headers:
        if header is None:
            header = ""
        elif isinstance(header, str):
            header = header.lower()
            header = re.sub(r'\s+', ' ', header)
            header = re.sub(r'\(.*?\)', '', header)
            header = re.sub(r'[^a-zA-Z0-9\s]', '', header)
            header = header.strip()
            # Correct any mis‐spellings of “type of upgrade”
            header = re.sub(r'\btype of upgr\s*ade\b', 'type of upgrade', header)
            words = header.split()
            singular_words = [p.singular_noun(word) if p.singular_noun(word) else word for word in words]
            header = " ".join(singular_words)
        cleaned_headers.append(header)
    return cleaned_headers

def clean_string_cell(value):
    """Cleans string cells by removing newlines and trimming spaces."""
    if isinstance(value, str):
        return value.replace('\n', ' ').strip()
    return value

def contains_phrase(row, phrase):
    """Checks if any cell in a row contains a specific phrase."""
    regex_pattern = re.sub(r"\s+", r"\\s*", phrase)
    pattern = re.compile(regex_pattern, flags=re.IGNORECASE)
    return row.astype(str).apply(lambda cell: bool(pattern.search(cell))).any()

def extract_specific_phrase(title):
    """
    Extracts a specific phrase from the table title based on predefined keywords.
    """
    phrases = [
        "PTO",
        "Reliability Network Upgrade",
        "Area Delivery Network Upgrade",
        "Local Delivery Network",
        "ADNU",
        "LDNU",
        "RNU"
    ]
    for phrase in phrases:
        if re.search(rf"\b{re.escape(phrase)}\b", title, re.IGNORECASE):
            return phrase
    return title  # Fallback if none found

def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type of upgrade",
        "upgrade",
        "description",
        "cost allocation factor"
    ]
    existing_desired = [col for col in desired_order if col in df.columns]
    remaining = [col for col in df.columns if col not in existing_desired]
    new_order = existing_desired + remaining
    df = df[new_order]
    return df

def search_gps_coordinates(text, log_file):
    """Search for GPS coordinates using multiple patterns."""
    gps_coords = re.search(r"gps coordinates:\s*([\d\.\-]+),\s*([\d\.\-]+)", text, re.IGNORECASE)
    if gps_coords:
        print(f"Found GPS coordinates: {gps_coords.groups()}", file=log_file)
        return gps_coords.groups()
    project_coords = re.search(r"latitude[:\s]*([\d\.\-]+)[^\d]+longitude[:\s]*([\d\.\-]+)", text, re.IGNORECASE)
    if project_coords:
        print(f"Found project coordinates: {project_coords.groups()}", file=log_file)
        return project_coords.groups()
    gps_coords_directional = re.search(
        r"gps coordinates:\s*([\d\.\-]+)\s*[nNsS],\s*([\d\.\-]+)\s*[eEwW]", text, re.IGNORECASE)
    if gps_coords_directional:
        lat, lon = gps_coords_directional.groups()
        latitude = lat if "N" in text.upper() else f"-{lat}"
        longitude = lon if "E" in text.upper() else f"-{lon}"
        print(f"Found directional GPS coordinates: {(latitude, longitude)}", file=log_file)
        return (latitude, longitude)
    print("GPS coordinates not found.", file=log_file)
    return (None, None)

def adjust_rows_length(data_rows, headers):
    """
    Ensures each row in data_rows has exactly len(headers) columns.
    If a row is too short, it is padded with empty strings.
    If too long, it is truncated.
    """
    col_count = len(headers)
    for i in range(len(data_rows)):
        row = data_rows[i]
        if len(row) > col_count:
            data_rows[i] = row[:col_count]
        elif len(row) < col_count:
            data_rows[i].extend([""] * (col_count - len(row)))

def extract_table2(pdf_path, log_file):
    """
    Extracts the Point of Interconnection from Table 1 in the provided PDF.
    """
    print(f"\nProcessing {pdf_path} for Table 1 extraction...", file=log_file)
    point_of_interconnection = None
    poi_pattern = re.compile(r"Point\s+of\s+Interconnection", re.IGNORECASE)
    table_settings_list = [
        {"horizontal_strategy": "text", "vertical_strategy": "lines", "snap_tolerance": 1},
        {"horizontal_strategy": "lines", "vertical_strategy": "lines", "snap_tolerance": 2},
    ]
    try:
        with pdfplumber.open(pdf_path) as pdf:
            table1_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"Table\s*(?:2|B\.1)\b", text, re.IGNORECASE):
                    table1_pages.append(i)
            if not table1_pages:
                print("No Table 2 found in the PDF.", file=log_file)
                return None
            first_page = table1_pages[0]
            last_page = table1_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 1
            print(f"Table 2 starts on page {scrape_start + 1} and ends on page {scrape_end + 1}", file=log_file)
            extraction_successful = False
            for page_number in range(scrape_start, min(scrape_end + 1, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1} for Table 2...", file=log_file)
                for attempt, table_settings in enumerate(table_settings_list, start=1):
                    print(f"\nAttempt {attempt} with table settings: {table_settings}", file=log_file)
                    tables = page.find_tables(table_settings=table_settings)
                    print(f"Found {len(tables)} table(s) on page {page_number + 1} with current settings.", file=log_file)
                    for table_index, table in enumerate(tables, start=1):
                        tab = table.extract()
                        if not tab:
                            print(f"Table {table_index} on page {page_number + 1} is empty. Skipping.", file=log_file)
                            continue
                        print(f"\n--- Table {table_index} on Page {page_number + 1} ---", file=log_file)
                        for row_num, row in enumerate(tab, start=1):
                            print(f"Row {row_num}: {row}", file=log_file)
                        for row_index, row in enumerate(tab, start=1):
                            for cell_index, cell in enumerate(row, start=1):
                                if cell and poi_pattern.search(cell):
                                    poi_col_index = cell_index
                                    adjacent_col_index = poi_col_index + 1
                                    if adjacent_col_index <= len(row):
                                        poi_value = clean_string_cell(row[adjacent_col_index - 1])
                                        if poi_value:
                                            point_of_interconnection = poi_value
                                            print(f"\nFound Point of Interconnection: '{point_of_interconnection}' (Page {page_number + 1}, Table {table_index}, Row {row_index})", file=log_file)
                                            extraction_successful = True
                                            break
                                        else:
                                            print(f"\nPoint of Interconnection label found but adjacent value is empty (Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                            poi_value_parts = []
                                            current_row_idx = row_index - 1
                                            start_scan = max(0, current_row_idx - 2)
                                            end_scan = min(len(tab), current_row_idx + 3)
                                            print(f"Scanning rows {start_scan + 1} to {end_scan} for POI value parts.", file=log_file)
                                            for scan_row_index in range(start_scan, end_scan):
                                                if scan_row_index == current_row_idx:
                                                    continue
                                                scan_row = tab[scan_row_index]
                                                if adjacent_col_index - 1 < len(scan_row):
                                                    scan_cell = clean_string_cell(scan_row[adjacent_col_index - 1])
                                                    if scan_cell and not poi_pattern.search(scan_cell):
                                                        poi_value_parts.append(scan_cell)
                                                        print(f"Found POI part in row {scan_row_index + 1}: '{scan_cell}'", file=log_file)
                                                    elif poi_pattern.search(scan_cell):
                                                        print(f"Encountered another POI label in row {scan_row_index + 1}. Skipping this row.", file=log_file)
                                                        continue
                                            if poi_value_parts:
                                                point_of_interconnection = " ".join(poi_value_parts)
                                                print(f"\nConcatenated Point of Interconnection: '{point_of_interconnection}' (Page {page_number + 1}, Table {table_index})", file=log_file)
                                                extraction_successful = True
                                                break
                                            else:
                                                print(f"\nNo POI value found in the surrounding rows (Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                                    else:
                                        print(f"\nPoint of Interconnection label found but no adjacent column (Page {page_number + 1}, Table {table_index}, Row {row_index}).", file=log_file)
                            if extraction_successful:
                                break
                        if extraction_successful:
                            break
                    if extraction_successful:
                        break
                if extraction_successful:
                    break
    except Exception as e:
        print(f"Error processing Table 2 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return None
    if not extraction_successful:
        if point_of_interconnection is not None and point_of_interconnection != "":
            print("Point of Interconnection label found but no adjacent value.", file=log_file)
            return "Value Missing"
        else:
            print("Point of Interconnection not found in Table 2.", file=log_file)
            return None
    return point_of_interconnection

def fix_column_names(columns):
    """
    Renames duplicate and empty column names.
    Duplicate names are suffixed with _1, _2, etc.
    Empty or whitespace-only names are replaced with unnamed_1, unnamed_2, etc.
    """
    new_cols = []
    counts = {}
    unnamed_count = 1
    for col in columns:
        # Treat empty or whitespace-only names as unnamed.
        if not col or col.strip() == "":
            new_col = f"unnamed_{unnamed_count}"
            unnamed_count += 1
        else:
            new_col = col.strip()
        if new_col in counts:
            new_col_with_suffix = f"{new_col}_{counts[new_col]}"
            counts[new_col] += 1
            new_cols.append(new_col_with_suffix)
        else:
            counts[new_col] = 1
            new_cols.append(new_col)
    return new_cols

def post_process_columns(df, log_file):
    """
    Post-processes DataFrame column names:
      1. For any column named 'unnamed_#' (or empty), look at its first non-empty cell.
         If that cell is not a dollar amount (i.e. does not match /^\$\s*\d+(?:,\d{3})*(?:\.\d+)?$/)
         and it contains 2 or 3 words, then rename the column to that value (after cleaning).
         If a column already exists with that name, merge the data from the renamed column into the
         existing column and drop the renamed column.
      2. If a column is named "Needed For", then rename it to "description" (merging with an existing
         description column if necessary).
    """
    # Process unnamed columns.
    for col in list(df.columns):
        if col.lower().startswith("unnamed_") or col.strip() == "":
            # Find the first non-empty cell in this column.
            first_non_empty = None
            for val in df[col]:
                cell_val = ""
                if isinstance(val, str):
                    cell_val = val.strip()
                elif val is not None:
                    cell_val = str(val).strip()
                if cell_val:
                    first_non_empty = cell_val
                    break
            if first_non_empty:
                # Check if the value is a dollar amount.
                if not re.match(r"^\$\s*\d+(?:,\d{3})*(?:\.\d+)?$", first_non_empty):
                    words = first_non_empty.split()
                    if 2 <= len(words) <= 3:
                        # Clean the candidate name.
                        new_name = clean_column_headers([first_non_empty])[0]
                        log_file.write(f"Renaming column '{col}' to '{new_name}' based on first non-empty value '{first_non_empty}'.\n")
                        if new_name in df.columns and new_name != col:
                            # Merge the two columns: fill empty cells in existing new_name from the renamed col.
                            for idx in df.index:
                                existing_val = df.at[idx, new_name]
                                candidate_val = df.at[idx, col]
                                if (pd.isna(existing_val) or existing_val == "") and (not pd.isna(candidate_val) and candidate_val != ""):
                                    df.at[idx, new_name] = candidate_val
                            df.drop(columns=[col], inplace=True)
                        else:
                            df.rename(columns={col: new_name}, inplace=True)
    # Process "Needed For" column: rename or merge it into "description".
    if "Needed For" in df.columns:
        if "description" in df.columns:
            log_file.write("Merging 'Needed For' column into existing 'description' column.\n")
            for idx in df.index:
                desc_val = df.at[idx, "description"]
                needed_for_val = df.at[idx, "Needed For"]
                if (pd.isna(desc_val) or desc_val == "") and (not pd.isna(needed_for_val) and needed_for_val != ""):
                    df.at[idx, "description"] = needed_for_val
            df.drop(columns=["Needed For"], inplace=True)
        else:
            log_file.write("Renaming 'Needed For' column to 'description'.\n")
            df.rename(columns={"Needed For": "description"}, inplace=True)
    return df

def extract_table3(pdf_path, log_file, is_addendum=False):
    """
    Extracts Table 3 data   the provided PDF.
     
      2. Renaming of duplicate/empty columns (using fix_column_names) and then post-processing
         unnamed columns as described.
    """
    print(f"\nProcessing {pdf_path} for Table 10", file=log_file)
    extracted_tables = []
    specific_phrase = None
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Identify pages that contain either Table 3 patterns or Attachment 1/Attachment 2.
            table3_pages = []
            for i, page in enumerate(pdf.pages):
                text = page.extract_text() or ""
                if re.search(r"Table\s*11[-.]([1-3])\b", text, re.IGNORECASE):
                    #re.search(r"Attachment\s*[12]", text, re.IGNORECASE)):
                    table3_pages.append(i)
            if not table3_pages:
                print("No Table 10  found in the PDF.", file=log_file)
                return pd.DataFrame()
            first_page = table3_pages[0]
            last_page = table3_pages[-1]
            scrape_start = first_page
            scrape_end = last_page + 1
            print(f"Candidate pages start on {scrape_start + 1} and end on {scrape_end}", file=log_file)
            # Process each page that might contain table data.
            for page_number in range(scrape_start, min(scrape_end, len(pdf.pages))):
                page = pdf.pages[page_number]
                print(f"\nScraping tables on page {page_number + 1}...", file=log_file)
                # This variable keeps track of the bottom y-coordinate of the previous table on the page.
                previous_table_bottom = None
                tables = page.find_tables(table_settings={
                    "horizontal_strategy": "lines",
                    "vertical_strategy": "lines",
                })
                for table_index, table in enumerate(tables):
                    tab = table.extract()
                    if not tab:
                        print(f"Table {table_index + 1} on page {page_number + 1} is empty. Skipping.", file=log_file)
                        continue
                    table_bbox = table.bbox  # (x0, top, x1, bottom)
                    # Define the title region for the table: above the table bounding box.
                    title_bbox = (0, 0, page.width, table_bbox[1])
                    title_text = page.within_bbox(title_bbox).extract_text() or ""
                    table_title = None
                    if title_text:
                        title_lines = title_text.split('\n')[::-1]
                        for line in title_lines:
                            line = line.strip()
                            match = re.search(r"(Modification\s+of\s+)?Table\s*11[-.]?\d*[:\-\s]*(.*)", line, re.IGNORECASE)
                            if match:
                                table_title = match.group(2).strip()
                                break
                        
                  
                    # Extract the specific phrase using the refined table title.
                    if table_title:
                        specific_phrase = extract_specific_phrase(table_title)
                        print(f"New table detected: '{specific_phrase}' on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        headers = clean_column_headers(tab[0])
                        # Rename header 'type' to 'type of upgrade' if needed.
                        if "type" in headers and "type of upgrade" not in headers:
                            headers = [("type of upgrade" if h == "type" else h) for h in headers]
                        if "need for" in headers:
                            headers = [("description" if h == "need for" else h) for h in headers]  
                    
                        # Apply the duplicate/empty column fixing.
                        headers = fix_column_names(headers)
                        data_rows = tab[1:]
                        try:
                            df_new = pd.DataFrame(data_rows, columns=headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for new table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue

                        if "allocated" in df_new.columns:
                            df_new.drop(columns=["allocated"], inplace=True)
                            print(f"Dropped 'Max of' column in table on page {page_number + 1}, table {table_index}.", file=log_file)

                        if "cost rate x " in df_new.columns:
                            df_new.drop(columns=["cost rate x "], inplace=True)
                            print(f"Dropped 'cost rate x' column in table on page {page_number + 1}, table {table_index}.", file=log_file)

                        if "cost rate" in df_new.columns:
                            df_new.drop(columns=["cost rate"], inplace=True)
                            print(f"Dropped 'cost rate x' column in table on page {page_number + 1}, table {table_index}.", file=log_file) 

                        if "3339615 9" in df_new.columns:
                            df_new.drop(columns=["3339615 9"], inplace=True)
                            print(f"Dropped '3339615 9' column in table on page {page_number + 1}, table {table_index}.", file=log_file)     
                            
                        if "6 steady state reliability and posttransient voltage stability" in df_new.columns:
                            df_new.drop(columns=["6 steady state reliability and posttransient voltage stability"], inplace=True)
                            print(f"Dropped '6 steady state reliability and posttransient voltage stability' column in table on page {page_number + 1}, table {table_index}.", file=log_file)  



                        if "escalated" in df_new.columns:
                            df_new.drop(columns=["escalated"], inplace=True)
                            print(f"Dropped 'cost rate x' column in table on page {page_number + 1}, table {table_index}.", file=log_file)    


                        # Also, if the DataFrame has a column named "type" (and not already "type of upgrade"), rename it.
                        if 'type' in df_new.columns and 'type of upgrade' not in df_new.columns:
                            df_new.rename(columns={'type': 'type of upgrade'}, inplace=True)
                        # Special handling for ADNU tables if needed.
                        if re.search(r"Area\s*Delivery\s*Upgrades", specific_phrase, re.IGNORECASE):
                            print("Detected 'Area Delivery Network Upgrade' table (new).", file=log_file)
                            if "adnu" in df_new.columns:
                                if "type of upgrade" not in df_new.columns:
                                    adnu_values = df_new["adnu"].dropna().astype(str).tolist()
                                    grouped_adnu = " ".join(adnu_values)
                                    other_columns = df_new.drop(columns=["adnu"]).iloc[0].to_dict()
                                    df_grouped = pd.DataFrame({
                                        "upgrade": [grouped_adnu],
                                        "type of upgrade": [specific_phrase]
                                    })
                                    for col, value in other_columns.items():
                                        df_grouped[col] = value
                                    print("Grouped all 'adnu' rows into a single 'upgrade' row for new ADNU table.", file=log_file)
                                    df_new = df_grouped
                                else:
                                    if "upgrade" in df_new.columns:
                                        df_new.drop(columns=['adnu'], inplace=True)
                                        print("Dropped 'adnu' column to avoid duplicate 'upgrade'.", file=log_file)
                                    else:
                                        df_new.rename(columns={'adnu': 'upgrade'}, inplace=True)
                                        print("Renamed 'adnu' to 'upgrade' in new ADNU table.", file=log_file)
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new ADNU table.", file=log_file)
                            else:
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    df_new.at[0, "type of upgrade"] = specific_phrase
                                    print("Replaced None in 'type of upgrade' for new ADNU table.", file=log_file)
                        else:
                            if "type of upgrade" not in df_new.columns:
                                df_new["type of upgrade"] = specific_phrase
                                print("Added 'type of upgrade' to all rows in new non-ADNU table.", file=log_file)
                            else:
                                first_row = df_new.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print("Replacing None in 'type of upgrade' for new non-ADNU table.", file=log_file)
                                    df_new.at[0, "type of upgrade"] = specific_phrase
                        # Fix duplicate and unnamed columns in the new table.
                        df_new.columns = fix_column_names(df_new.columns.tolist())
                        # Now apply the post-processing of column names:
                        df_new = post_process_columns(df_new, log_file)
                        extracted_tables.append(df_new)
                    else:
                        # Continuation table branch.
                        if specific_phrase is None:
                            print(f"No previous table title found for continuation on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue
                        print(f"Continuation table detected on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        data_rows = tab
                        # Use the number of columns from the last extracted table as expected.
                        expected_columns = len(extracted_tables[-1].columns) if extracted_tables else None
                        if expected_columns is None:
                            print(f"No existing table to continue with on page {page_number + 1}, table {table_index + 1}. Skipping.", file=log_file)
                            continue
                        expected_headers = extracted_tables[-1].columns.tolist()
                        header_keywords = ["type of upgrade", "adnu"]
                        first_continuation_row = data_rows[0] if data_rows else []
                        is_header_row = any(
                            re.search(rf"\b{kw}\b", str(cell), re.IGNORECASE) for kw in header_keywords for cell in first_continuation_row
                        )
                        if is_header_row:
                            print(f"Detected header row in continuation table on page {page_number + 1}, table {table_index + 1}.", file=log_file)
                            data_rows = data_rows[1:]
                        # Ensure every row has the same length as expected_headers.
                        adjust_rows_length(data_rows, expected_headers)
                        try:
                            df_continuation = pd.DataFrame(data_rows, columns=expected_headers)
                        except ValueError as ve:
                            print(f"Error creating DataFrame for continuation table on page {page_number + 1}, table {table_index + 1}: {ve}", file=log_file)
                            continue
                        # Rename column 'type' if needed.
                        if 'type' in df_continuation.columns and 'type of upgrade' not in df_continuation.columns:
                            df_continuation.rename(columns={'type': 'type of upgrade'}, inplace=True)
                        if "need for" in df_continuation.columns:
                            df_continuation.rename(columns={"need for": "description"}, inplace=True)
                        if re.search(r"Area\s*Delivery\s*Upgrades", specific_phrase, re.IGNORECASE):
                            if "type of upgrade" in df_continuation.columns:
                                first_row = df_continuation.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print(f"Replacing None in 'type of upgrade' for continuation ADNU table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation ADNU table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        else:
                            if "type of upgrade" in df_continuation.columns:
                                first_row = df_continuation.iloc[0]
                                if pd.isna(first_row["type of upgrade"]) or first_row["type of upgrade"] == "":
                                    print(f"Replacing None in 'type of upgrade' for continuation table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                                    df_continuation.at[0, "type of upgrade"] = specific_phrase
                            else:
                                df_continuation["type of upgrade"] = specific_phrase
                                print(f"'type of upgrade' column added with value '{specific_phrase}' for continuation table on page {page_number + 1}, table {table_index + 1}", file=log_file)
                        # Fix duplicate and unnamed columns in the continuation table.
                        df_continuation.columns = fix_column_names(df_continuation.columns.tolist())
                        # Post-process the columns in the continuation table.
                        df_continuation = post_process_columns(df_continuation, log_file)
                        # Concatenate the continuation table with the previous extracted table.
                        extracted_tables[-1] = pd.concat([extracted_tables[-1], df_continuation], ignore_index=True, sort=False)
                    # Update the previous_table_bottom for the page using the current table's bbox.
                    previous_table_bottom = table_bbox[3]
    except Exception as e:
        print(f"Error processing Table 10 in {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()
    if extracted_tables:
        all_columns = set()
        for df in extracted_tables:
            all_columns.update(df.columns.tolist())
        standardized_tables = []
        for df in extracted_tables:
            standardized_df = df.reindex(columns=all_columns)
            standardized_tables.append(standardized_df)
        print("\nConcatenating all extracted Table 10/Attachment data...", file=log_file)
        try:
            table3_data = pd.concat(standardized_tables, ignore_index=True, sort=False)
            print(f"Successfully concatenated {len(standardized_tables)} tables.", file=log_file)
        except Exception as e:
            print(f"Error concatenating tables: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            table3_data = pd.DataFrame()
    else:
        print("No Table 10/Attachment data extracted.", file=log_file)
        table3_data = pd.DataFrame()
    return table3_data


def extract_table3_and_replace_none(pdf_path, project_id, log_file, is_addendum=False):
    """Extracts Table 10 data and merges with base data."""
    base_data = extract_base_data(pdf_path, project_id, log_file)
    table3_data = extract_table3(pdf_path, log_file, is_addendum)
    if table3_data.empty:
        return base_data
    else:
        overlapping_columns = base_data.columns.intersection(table3_data.columns).difference(['point_of_interconnection'])
        table3_data = table3_data.drop(columns=overlapping_columns, errors='ignore')
        base_data_repeated = pd.concat([base_data] * len(table3_data), ignore_index=True)
        try:

                        # Concatenate base data with Table 8 data along columns
            merged_df = pd.concat([base_data_repeated, table3_data], axis=1, sort=False)
           # if "q_id" in merged_df.columns and "type of upgrade" in merged_df.columns and "upgrade" in merged_df.columns:
           #     merged_df["is_duplicate"] = merged_df.duplicated(subset=["q_id", "type of upgrade", "upgrade"], keep="first")
            #    merged_df = merged_df[merged_df["is_duplicate"] == False].drop(columns=["is_duplicate"])
            #    print(f"Removed duplicate rows based on 'q_id', 'type of upgrade', and 'upgrade'.", file=log_file)


            if "q_id" in merged_df.columns and "type of upgrade" in merged_df.columns and "upgrade" in merged_df.columns:
                # Identify rows where 'type of upgrade' and 'upgrade' are not empty
                non_empty_rows = merged_df[
                    merged_df["type of upgrade"].notna() & merged_df["upgrade"].notna() &
                    (merged_df["type of upgrade"].str.strip() != "") & (merged_df["upgrade"].str.strip() != "")
                ]

                # Group by q_id, type of upgrade, and upgrade, keeping the first occurrence
                grouped_df = non_empty_rows.groupby(["q_id", "type of upgrade", "upgrade"], as_index=False).first()

                # Get the original order of the rows in merged_df before filtering
                merged_df["original_index"] = merged_df.index

                # Combine unique grouped rows with originally empty rows
                final_df = pd.concat([
                    grouped_df,
                    merged_df[merged_df["type of upgrade"].isna() | (merged_df["type of upgrade"].str.strip() == "") |
                            merged_df["upgrade"].isna() | (merged_df["upgrade"].str.strip() == "")]
                ], ignore_index=True, sort=False)

                # Restore the original order of the rows based on the saved index
                final_df.sort_values(by="original_index", inplace=True)
                final_df.drop(columns=["original_index"], inplace=True)
                merged_df = final_df

                print(f"Removed duplicate rows based on 'q_id', 'type of upgrade', and 'upgrade', excluding empty rows while preserving order.", file=log_file)

            merged_df = pd.concat([base_data_repeated, table3_data], axis=1, sort=False)
            if 'point_of_interconnection' not in merged_df.columns:
                merged_df['point_of_interconnection'] = base_data['point_of_interconnection'].iloc[0]
                print(f"Added 'point_of_interconnection' to merged data for {pdf_path}.", file=log_file)
            print(f"Merged base data with Table 3 data for {pdf_path}.", file=log_file)
            return merged_df
        except Exception as e:
            print(f"Error merging base data with Table 3 data for {pdf_path}: {e}", file=log_file)
            print(traceback.format_exc(), file=log_file)
            return base_data

def check_has_table3(pdf_path):
    """Checks if the PDF contains Table 3 or Attachment 1/Attachment 2."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text() or ""
                if re.search(r"(Modification\s+of\s+)?Table\s*11[-.]?\d*", text, re.IGNORECASE):
                    #re.search(r"Attachment\s*[12]", text, re.IGNORECASE))
                    return True
    except Exception as e:
        return False
    return False

def is_addendum(pdf_path):
    """Checks if the PDF is an addendum by searching 'Addendum' on the first page."""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if len(pdf.pages) == 0:
                return False
            first_page = pdf.pages[0]
            text = first_page.extract_text() or ""
            return "Addendum" in text
    except Exception as e:
        return False

def extract_base_data(pdf_path, project_id, log_file):
    """Extract base data from the PDF and return as a DataFrame."""
    print("Extracting base data from PDF...", file=log_file)
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            text = ""
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text
        text = clean_string_cell(text)
        #queue_id = re.search(r"q[\s_-]*(\d+)", text, re.IGNORECASE)
        queue_id = str(project_id) #queue_id.group(1) if queue_id else 
        print(f"Extracted Queue ID: {queue_id}", file=log_file)
        cluster_number = re.search(r"queue[\s_-]*cluster[\s_-]*(\d+)", text, re.IGNORECASE)
        cluster_number = cluster_number.group(1) if cluster_number else None
        print(f"Extracted Cluster Number: {cluster_number}", file=log_file)
        deliverability_status = re.search(r"(\w+)\s*capacity deliverability status", text, re.IGNORECASE)
        deliverability_status = deliverability_status.group(1) if deliverability_status else None
        print(f"Extracted Deliverability Status: {deliverability_status}", file=log_file)
        capacity = re.search(r"total rated output of (\d+)\s*mw", text, re.IGNORECASE)
        if capacity:
            capacity = int(capacity.group(1))
        else:
            capacity2 = re.search(r"(\d+)\s*mw", text)
            capacity = int(capacity2.group(1)) if capacity2 else None
        print(f"Extracted Capacity: {capacity}", file=log_file)
        point_of_interconnection = extract_table2(pdf_path, log_file)
        latitude, longitude = search_gps_coordinates(text, log_file)
        base_data = {
            "q_id": [queue_id],
            "cluster": [cluster_number],
            "req_deliverability": [deliverability_status],
            "latitude": [latitude],
            "longitude": [longitude],
            "capacity": [capacity],
            "point_of_interconnection": [point_of_interconnection]
        }
        print("Base data extracted:", file=log_file)
        print(base_data, file=log_file)
        return pd.DataFrame(base_data)
    except Exception as e:
        print(f"Error extracting base data from {pdf_path}: {e}", file=log_file)
        print(traceback.format_exc(), file=log_file)
        return pd.DataFrame()

def save_to_csv(df, output_csv_path, data_type):
    """Cleans the DataFrame and saves it to a CSV file."""
    if df.empty:
        print(f"No data to save for {data_type}.")
        return
    df = df.map(clean_string_cell)
    df = df[~df.apply(lambda row: contains_phrase(row, "Type of Upgrade"), axis=1)]
    df = reorder_columns(df)
    print(f"\nColumns reordered for {data_type} as per specification.")
    #if 'q_id' in df.columns:
    #    df['q_id'] = pd.to_numeric(df['q_id'], errors='coerce')
    try:
        df.to_csv(output_csv_path, index=False)
        print(f"\nData successfully saved to {output_csv_path}")
    except Exception as e:
        print(f"Error saving {data_type} data to CSV: {e}")
        print(traceback.format_exc())

'''
def process_pdfs_in_folder():
    """Processes all PDFs in the specified project range and directory."""
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)
    with open(LOG_FILE_PATH, 'w') as log_file:
        for project_id in PROJECT_RANGE:
            project_path = os.path.join(BASE_DIRECTORY, str(project_id), "02_phase_1_study")
            if not os.path.exists(project_path):
                missing_projects.add(project_id)
                print(f"Project path does not exist: {project_path}", file=log_file)
                continue
            project_scraped = False
            base_data_extracted = False
            base_data = pd.DataFrame()
            for pdf_name in os.listdir(project_path):
                if pdf_name.endswith(".pdf"):
                    pdf_path = os.path.join(project_path, pdf_name)
                    total_pdfs_accessed += 1
                    is_add = is_addendum(pdf_path)
                    if is_add:
                        addendum_pdfs.append(pdf_name)
                        print(f"Accessing Addendum PDF: {pdf_name} from Project {project_id}", file=log_file)
                    else:
                        original_pdfs.append(pdf_name)
                        print(f"Accessing Original PDF: {pdf_name} from Project {project_id}", file=log_file)
                    try:
                        has_table3 = check_has_table3(pdf_path)
                        if not has_table3:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 3 or Attachment data)", file=log_file)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 3 or Attachment data)")
                            total_pdfs_skipped += 1
                            continue
                        if not is_add and not base_data_extracted:
                            base_data = extract_base_data(pdf_path, project_id, log_file)
                            base_data_extracted = True
                            print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)
                        if is_add and base_data_extracted:
                            table3_data = extract_table3(pdf_path, log_file, is_addendum=is_add)
                            if not table3_data.empty:
                                merged_df = pd.concat([base_data] * len(table3_data), ignore_index=True)
                                merged_df = pd.concat([merged_df, table3_data], axis=1, sort=False)
                                core_addendums = pd.concat([core_addendums, merged_df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped Addendum PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                        else:
                            df = extract_table3_and_replace_none(pdf_path, project_id, log_file, is_addendum=is_add)
                            if not df.empty:
                                if is_add:
                                    core_addendums = pd.concat([core_addendums, df], ignore_index=True)
                                else:
                                    core_originals = pd.concat([core_originals, df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                    except Exception as e:
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                        print(traceback.format_exc(), file=log_file)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                        total_pdfs_skipped += 1
            if not project_scraped and os.path.exists(project_path):
                skipped_projects.add(project_id)
    save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
    save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")
    total_projects_processed = len(scraped_projects) + len(skipped_projects)
    print("\n=== Scraping Summary ===")
    print(f"Total Projects Processed: {total_projects_processed}")
    print(f"Total Projects Scraped: {len(scraped_projects)}")
    print(f"Total Projects Skipped: {len(skipped_projects)}")
    print(f"Total Projects Missing: {len(missing_projects)}")
    print(f"Total PDFs Accessed: {total_pdfs_accessed}")
    print(f"Total PDFs Scraped: {total_pdfs_scraped}")
    print(f"Total PDFs Skipped: {total_pdfs_skipped}")
    print("\nList of Scraped Projects:")
    print(sorted(scraped_projects))
    print("\nList of Skipped Projects:")
    print(sorted(skipped_projects))
    print("\nList of Missing Projects:")
    print(sorted(missing_projects))
    print("\nList of Scraped PDFs:")
    print(scraped_pdfs)
    print("\nList of Skipped PDFs:")
    print(skipped_pdfs)
    print("\nList of Addendum PDFs:")
    print(addendum_pdfs)
    print("\nList of Original PDFs:")
    print(original_pdfs)
    print("\nNumber of Original PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in original_pdfs]))
    print("Number of Addendum PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs]))
'''

def process_pdfs_in_folder():
    """
    Processes all PDFs in the directories within BASE_DIRECTORY whose numeric prefix is in PROJECT_RANGE.
    This allows folders like '641' and '641AA' (if 641 is in the PROJECT_RANGE) to be processed,
    and uses the full folder name (e.g. '641AA') as the project id (q_id).
    """
    global core_originals, core_addendums, total_pdfs_accessed, total_pdfs_scraped, total_pdfs_skipped
    os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)
 


    with open(LOG_FILE_PATH, 'w') as log_file:
        # List all subdirectories in BASE_DIRECTORY that have a numeric prefix.
        folders = [
            folder for folder in os.listdir(BASE_DIRECTORY)
            if os.path.isdir(os.path.join(BASE_DIRECTORY, folder)) and re.match(r'^(\d+)', folder)
        ]
    


        def sort_key(folder):
            match = re.match(r'^(\d+)', folder)
            if match:
                numeric = int(match.group(1))
                return (numeric, folder)
            return (float('inf'), folder)

        # Sort the folders in ascending order.
        sorted_folders = sorted(folders, key=sort_key)

        # Process each folder in sorted order.
        for folder in sorted_folders:
            folder_path = os.path.join(BASE_DIRECTORY, folder)
            match = re.match(r'^(\d+)', folder)
            if not match:
                continue  # Skip if there is no numeric prefix.
            numeric_part = int(match.group(1))
            # Process the folder only if its numeric part is in the desired range.
            if numeric_part not in PROJECT_RANGE:
                continue

            # Use the full folder name as the project identifier (q_id).
            project_id = folder  # e.g., "641AA" or "641"
            project_path = os.path.join(folder_path, "02_phase_1_study")
            if not os.path.exists(project_path):
                missing_projects.add(project_id)
                print(f"Project path does not exist: {project_path}", file=log_file)
                continue

            project_scraped = False
            base_data_extracted = False
            base_data = pd.DataFrame()
            for pdf_name in os.listdir(project_path):
                if pdf_name.endswith(".pdf"):
                    pdf_path = os.path.join(project_path, pdf_name)
                    total_pdfs_accessed += 1
                    is_add = is_addendum(pdf_path)
                    if is_add:
                        addendum_pdfs.append(pdf_name)
                        print(f"Accessing Addendum PDF: {pdf_name} from Project {project_id}", file=log_file)
                    else:
                        original_pdfs.append(pdf_name)
                        print(f"Accessing Original PDF: {pdf_name} from Project {project_id}", file=log_file)
                    try:
                        has_table3 = check_has_table3(pdf_path)
                        if not has_table3:
                            skipped_pdfs.append(pdf_name)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 3 or Attachment data)", file=log_file)
                            print(f"Skipped PDF: {pdf_name} from Project {project_id} (No Table 3 or Attachment data)")
                            total_pdfs_skipped += 1
                            continue
                        if not is_add and not base_data_extracted:
                            base_data = extract_base_data(pdf_path, project_id, log_file)
                            base_data_extracted = True
                            print(f"Extracted base data from original PDF: {pdf_name}", file=log_file)
                        if is_add and base_data_extracted:
                            table3_data = extract_table3(pdf_path, log_file, is_addendum=is_add)
                            if not table3_data.empty:
                                merged_df = pd.concat([base_data] * len(table3_data), ignore_index=True)
                                merged_df = pd.concat([merged_df, table3_data], axis=1, sort=False)
                                core_addendums = pd.concat([core_addendums, merged_df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped Addendum PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                print(f"Skipped Addendum PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                        else:
                            df = extract_table3_and_replace_none(pdf_path, project_id, log_file, is_addendum=is_add)
                            if not df.empty:
                                if is_add:
                                    core_addendums = pd.concat([core_addendums, df], ignore_index=True)
                                else:
                                    core_originals = pd.concat([core_originals, df], ignore_index=True)
                                scraped_pdfs.append(pdf_name)
                                scraped_projects.add(project_id)
                                project_scraped = True
                                total_pdfs_scraped += 1
                                print(f"Scraped PDF: {pdf_name} from Project {project_id}")
                            else:
                                skipped_pdfs.append(pdf_name)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)", file=log_file)
                                print(f"Skipped PDF: {pdf_name} from Project {project_id} (Empty Data)")
                                total_pdfs_skipped += 1
                    except Exception as e:
                        skipped_pdfs.append(pdf_name)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}", file=log_file)
                        print(traceback.format_exc(), file=log_file)
                        print(f"Skipped PDF: {pdf_name} from Project {project_id} due to an error: {e}")
                        total_pdfs_skipped += 1
            if not project_scraped and os.path.exists(project_path):
                skipped_projects.add(project_id)
    # Save results and print summary as before.
    save_to_csv(core_originals, OUTPUT_CSV_PATH_ORIGINAL, "originals")
    save_to_csv(core_addendums, OUTPUT_CSV_PATH_ADDENDUM, "addendums")
    total_projects_processed = len(scraped_projects) + len(skipped_projects)
    print("\n=== Scraping Summary ===")
    print(f"Total Projects Processed: {total_projects_processed}")
    print(f"Total Projects Scraped: {len(scraped_projects)}")
    print(f"Total Projects Skipped: {len(skipped_projects)}")
    print(f"Total Projects Missing: {len(missing_projects)}")
    print(f"Total PDFs Accessed: {total_pdfs_accessed}")
    print(f"Total PDFs Scraped: {total_pdfs_scraped}")
    print(f"Total PDFs Skipped: {total_pdfs_skipped}")
    print("\nList of Scraped Projects:")
    print(sorted(scraped_projects))
    print("\nList of Skipped Projects:")
    print(sorted(skipped_projects))
    print("\nList of Missing Projects:")
    print(sorted(missing_projects))
    print("\nList of Scraped PDFs:")
    print(scraped_pdfs)
    print("\nList of Skipped PDFs:")
    print(skipped_pdfs)
    print("\nList of Addendum PDFs:")
    print(addendum_pdfs)
    print("\nList of Original PDFs:")
    print(original_pdfs)
    print("\nNumber of Original PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in original_pdfs]))
    print("Number of Addendum PDFs Scraped:", len([pdf for pdf in scraped_pdfs if pdf in addendum_pdfs]))


 

def main():
    """Main function to execute the PDF scraping process."""
    process_pdfs_in_folder()

if __name__ == "__main__":
    main()


Scraped PDF: Appendix A - C493_06-25-2010_Final.pdf from Project 493
Skipped PDF: Q495_ Cluster 1_Report_Appendix A_final.pdf from Project 495 (No Table 3 or Attachment data)
Scraped PDF: Appendix A - C510_06-25-2010_Final.pdf from Project 510
Skipped PDF: QC4PI-SCE-Northern-Appendix A-DS-Q521 Columbia 1.pdf from Project 521 (No Table 3 or Attachment data)
Skipped PDF: QC4PI-SCE-Northern-Appendix A-DS-Q522 Columbia 2.pdf from Project 522 (No Table 3 or Attachment data)
Skipped PDF: 10AS666541-Appendix_A__Q557_Cluster2_Phase_I_study_report_rev1.pdf from Project 557 (No Table 3 or Attachment data)
Skipped PDF: Appendix A - Q557 Cluster2 Phase I study report.pdf from Project 557 (No Table 3 or Attachment data)
Skipped PDF: 10AS665861-Appendix_A__Q559_Cluster2_Phase_I_study_report_rev1.pdf from Project 559 (No Table 3 or Attachment data)
Skipped PDF: Appendix A - Q559 Cluster2 Phase I study report.pdf from Project 559 (No Table 3 or Attachment data)
Skipped PDF: Appendix A - Q560 Cluster2 

In [5]:
import os

# Update this to the path of your main folder
base_folder = r'/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data'  

# Set your numeric range
range_start = 488
range_end = 859

# Counter for folders with empty '02_phase_1_study'
empty_count = 0

# List to hold paths that meet the condition (optional)
empty_folders = []

# Loop through each item in the main folder
for item in os.listdir(base_folder):
    item_path = os.path.join(base_folder, item)
    if os.path.isdir(item_path):
        # Try to interpret the folder name as a number
        try:
            folder_number = int(item)
        except ValueError:
            continue  # Skip folders that don't have a numeric name

        # Check if the folder's number is in the desired range
        if range_start <= folder_number <= range_end:
            # Build the path for the '02_phase_1_study' subfolder
            phase1_path = os.path.join(item_path, "02_phase_1_study")
            if os.path.isdir(phase1_path):
                # Check if the subfolder is empty
                if not os.listdir(phase1_path):
                    empty_count += 1
                    empty_folders.append(item_path)
            else:
                # If you want to log folders missing the subfolder, you can print or handle that here.
                print(f"Folder {item_path} does not have a '02_phase_1_study' subfolder.")

print(f"\nTotal folders (from {range_start} to {range_end}) with an empty '02_phase_1_study' subfolder: {empty_count}")

# (Optional) List the folders that met the condition
if empty_folders:
    print("\nFolders with empty '02_phase_1_study':")
    for folder in empty_folders:
        print(folder)



Total folders (from 488 to 859) with an empty '02_phase_1_study' subfolder: 86

Folders with empty '02_phase_1_study':
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/764
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/790
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/739
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/706
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/730
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/502
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/791
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/736
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/503
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/700
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/686
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/681
/Users/vk365/Dropbox/Interconnections_data/data/pdf_

# counts projects with missing phase 1 and classifies them by cluster

In [9]:
import os
import re
import pandas as pd

# =====================================================
# User Settings and File Paths (update these as needed)
# =====================================================

# Base folder containing all project folders
base_folder = r'/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/03_data'

# Numeric range to consider (based on the numeric prefix of folder names)
range_start = 154   # (Update these values if needed)
range_end   = 2192

# Path to your CSV file (which must have columns "q_id" and "cluster_number")
phase_status_csv = r'/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/phase_status.csv'

# =====================================================
# Read CSV and Build Cluster Mapping
# =====================================================

# Read the CSV ensuring that the q_id is treated as a string
phase_status = pd.read_csv(phase_status_csv, dtype={'q_id': str})
# Create a mapping from q_id to cluster_number
cluster_mapping = dict(zip(phase_status['q_id'], phase_status['cluster_number']))

# =====================================================
# Initialize Data Structures for Results
# =====================================================

empty_phase1_count = 0          # Count of projects with an empty "02_phase_1_study"
empty_phase1_folders = []       # List of folder paths with an empty "02_phase_1_study" subfolder

# Dictionaries to group projects by cluster:
# 1. Projects missing Phase 1 (i.e. the folder is missing or exists but is empty)
missing_phase1_by_cluster = {}  

# 2. Projects that have an empty Phase 1 folder but have non-empty "03_phase_2_study" or "05_reassesment"
empty_phase1_with_following_by_cluster = {}  

# List to hold all project folder names (for final sorted list)
project_folders = []

# =====================================================
# Gather and Sort Candidate Folders
# =====================================================

# We use a list of tuples (folder_name, numeric_prefix) so we can sort by the numeric value
folder_candidates = []
for item in os.listdir(base_folder):
    item_path = os.path.join(base_folder, item)
    if os.path.isdir(item_path):
        # Use regex to extract the numeric prefix (works for names like "641" or "641AA")
        m = re.match(r'^(\d+)', item)
        if m:
            numeric_part = int(m.group(1))
            if range_start <= numeric_part <= range_end:
                folder_candidates.append((item, numeric_part))

# Sort candidates in ascending order first by the numeric prefix, then alphabetically by the full folder name
folder_candidates.sort(key=lambda x: (x[1], x[0]))

# =====================================================
# Process Each Candidate Folder
# =====================================================

for folder_name, num in folder_candidates:
    # Add folder name to final list (for reporting later)
    project_folders.append(folder_name)
    
    folder_path = os.path.join(base_folder, folder_name)
    # Build subfolder paths
    phase1_path = os.path.join(folder_path, "02_phase_1_study")
    phase2_path = os.path.join(folder_path, "03_phase_2_study")
    reassessment_path = os.path.join(folder_path, "05_reassesment")
    
    # Determine the cluster for this project using the CSV mapping; default to "Unknown" if not found.
    cluster = cluster_mapping.get(folder_name, "Unknown")
    
    # Check for "missing" Phase 1:
    # Here, we define "missing" as: either the Phase 1 folder does not exist OR it exists but is empty.
    phase1_missing = False
    if not os.path.isdir(phase1_path):
        phase1_missing = True
    else:
        # The folder exists; check if it is empty.
        if not os.listdir(phase1_path):
            phase1_missing = True
            empty_phase1_count += 1
            empty_phase1_folders.append(folder_path)
    
    if phase1_missing:
        missing_phase1_by_cluster.setdefault(cluster, []).append(folder_name)
    
        # Additionally, if the folder exists (but is empty) and at least one of the follow-up folders is non-empty,
        # then add it to the separate grouping.
        if os.path.isdir(phase1_path):  # only check follow-ups if the folder exists (even though it's empty)
            phase2_non_empty = os.path.isdir(phase2_path) and bool(os.listdir(phase2_path))
            reassessment_non_empty = os.path.isdir(reassessment_path) and bool(os.listdir(reassessment_path))
            if phase2_non_empty or reassessment_non_empty:
                empty_phase1_with_following_by_cluster.setdefault(cluster, []).append(folder_name)

# =====================================================
# Output the Results
# =====================================================

print(f"\nTotal projects (from {range_start} to {range_end}) with a missing or empty '02_phase_1_study' folder: {len(sum(missing_phase1_by_cluster.values(), []))}")

if empty_phase1_folders:
    print("\nFolders with an empty '02_phase_1_study' subfolder (full paths):")
    for folder in empty_phase1_folders:
        print(folder)

print("\n=== Projects Missing '02_phase_1_study' (Missing or Empty) Grouped by Cluster ===")
if missing_phase1_by_cluster:
    for clust, folders in missing_phase1_by_cluster.items():
        print(f"\nCluster {clust}:")
        for folder in folders:
            print(f"  {folder}")
else:
    print("No projects found missing the '02_phase_1_study' folder.")

print("\n=== Projects with Empty '02_phase_1_study' but with Non-Empty '03_phase_2_study' or '05_reassesment' (Grouped by Cluster) ===")
if empty_phase1_with_following_by_cluster:
    for clust, folders in empty_phase1_with_following_by_cluster.items():
        print(f"\nCluster {clust}:")
        for folder in folders:
            print(f"  {folder}")
else:
    print("No projects found with empty '02_phase_1_study' but non-empty subsequent phases.")

 

print("\n=== All Project Folder Names (Ascending Order) ===")
# Build a list of just the folder names
project_folders = [folder for folder, num in folder_candidates]

# Print the list in a format you can copy and paste
# Print the list with 10 items per line
print("Project folder names (ascending order):")
line_length = 10
for i in range(0, len(project_folders), line_length):
    line_items = project_folders[i:i+line_length]
    # Join each item wrapped in quotes and separated by a comma and space
    line = ", ".join(f"'{item}'" for item in line_items)
    print(line)

# =====================================================
# Print Total Counts by Cluster
# =====================================================

# Get all clusters that appear in either dictionary.
all_clusters = set(missing_phase1_by_cluster.keys()).union(set(empty_phase1_with_following_by_cluster.keys()))

print("\n=== Total Count by Cluster ===")
for clust in sorted(all_clusters, key=lambda x: str(x)):
    count_missing = len(missing_phase1_by_cluster.get(clust, []))
    count_following = len(empty_phase1_with_following_by_cluster.get(clust, []))
    print(f"Cluster {clust}:")
    print(f"  Missing or Empty '02_phase_1_study': {count_missing}")
    print(f"  Empty '02_phase_1_study' with Follow-up Data: {count_following}")



Total projects (from 154 to 2192) with a missing or empty '02_phase_1_study' folder: 121

Folders with an empty '02_phase_1_study' subfolder (full paths):
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/417
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/488
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/490
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/494
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/502
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/503
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/509
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/512
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/555
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/579
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/585
/Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/data/640
/Users/vk365/Dro

#creating itemized and total datasets

In [None]:
# 667 extracted phase 2 by mistake

In [58]:
import pandas as pd
import re
import unicodedata
import numpy as np
# Load the CSV file
df = pd.read_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/03_raw/rawdata_cluster1_4_style_R_originals.csv', dtype={'estimated_time_to_construct': str})

#df['q_id'] = df['q_id'].astype('Int64')
df['cluster'] = df['cluster'].astype('Int64')


######################################################################################################################################
########################################
# STEP 0: CREATE DESCRIPTION COLUMN FROM COST ALLOCATION FACTOR


def move_non_numeric_text(value):
    """Move non-numeric, non-percentage text from cost allocation factor to description.
       If a value is moved, return None for cost allocation factor."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return value  # Keep numeric or percentage values
        return None  # Clear the value if it's text (moved to description)
    return value  # Return as is for non-string values


def extract_non_numeric_text(value):
    """Extract non-numeric, non-percentage text from the cost allocation factor column."""
    if isinstance(value, str):  # Ensure it's a string before processing
        if re.fullmatch(r"[\d,.]+%?", value):  # Check if it's numeric or a percentage
            return None
        return value.strip()  # Return text entries as is
    return None  # Return None for non-string values



def clean_total_entries(value):
    """If the value starts with 'Total', remove numbers, commas, and percentage signs, keeping only 'Total'."""
    if isinstance(value, str) and value.startswith("Total"):
        return "Total"  # Keep only "Total"
    return value  # Leave other values unchanged

import re
import pandas as pd

def extract_cost_allocation(df, source_col, target_col="cost_allocation_factor"):
    """
    Extracts percentage values from a specified source column and moves them into a target column.
    
    - A percentage value is defined as a string that, when stripped of whitespace,
      fully matches a pattern of digits (with optional commas or periods) followed by a percent sign.
    - If a cell in the source column matches this pattern, its value is placed into the target column,
      and the source column cell is cleared (set to an empty string).
    - If the cell does not match a percentage pattern, it is left untouched in the source column.
    
    Parameters:
      df         : pandas DataFrame.
      source_col : string, the name of the column to scan for percentage values.
      target_col : string, the name of the column to store the extracted percentage values.
                   Defaults to "cost_allocation_factor".
    
    Returns:
      The DataFrame with the updated columns.
    """
    # Define a regex pattern to match a percentage value (e.g., "78.25%").
    # The pattern allows digits, commas, and periods, followed immediately by a "%" (ignoring leading/trailing spaces).
    pattern = r"^\s*[\d,\.]+%\s*$"
    
    def extract_percentage(text):
        # If text matches the percentage pattern, return the stripped text; otherwise, return None.
        if isinstance(text, str) and re.fullmatch(pattern, text):
            return text.strip()
        return None

    def clear_percentage(text):
        # If text matches the percentage pattern, clear it (return an empty string).
        # Otherwise, return the text stripped of surrounding whitespace.
        if isinstance(text, str) and re.fullmatch(pattern, text):
            return ""
        if isinstance(text, str):
            return text.strip()
        return text

    # Create (or overwrite) the target column with extracted percentage values from the source column.
    df[target_col] = df[source_col].apply(extract_percentage)
    # In the source column, remove any percentage values (leaving other text intact).
    df[source_col] = df[source_col].apply(clear_percentage)
    
    return df

 

 

def filter_numeric_costs(df, col):
    """
    For a given DataFrame and column name, this function extracts the numeric cost from each cell,
    converting values with an optional '$' sign (and possible commas) to floats.
    If a valid numeric cost cannot be extracted, the cell is set to NaN.
    
    Parameters:
      df  : pandas DataFrame.
      col : string, the name of the column to process.
      
    Returns:
      The original DataFrame with the specified column converted to numeric values (or NaN if conversion fails).
    """
    def extract_numeric(value):
        value_str = str(value)
        # This regex matches an optional '$', optional spaces, and a number with commas and an optional decimal part.
        match = re.search(r'\$?\s*([\d,]+(?:\.\d+)?)', value_str)
        if match:
            num_str = match.group(1).replace(',', '')
            try:
                return float(num_str)
            except ValueError:
                return np.nan
        return np.nan

    # Apply the extraction function to the specified column.
    df[col] = df[col].apply(extract_numeric)
    return df



def extract_months_values(df, col):
    """
    For a given DataFrame and column name, this function extracts text patterns matching
    durations expressed in months (e.g., "32 months", "43 Month", "23 Months", "22Months", or "21-29months").
    If a valid pattern is found, it returns the matched text; otherwise, it returns an empty string.
    
    Parameters:
        df  : pandas DataFrame.
        col : string, the name of the column to process.
        
    Returns:
        The DataFrame with the specified column updated.
    """
    def extract_months(text):
        text = str(text)
        # Pattern explanation:
        #   \d+          : one or more digits
        #   (?:-\d+)?    : optionally, a hyphen followed by one or more digits (to capture ranges like 21-29)
        #   \s*          : optional whitespace
        #   [Mm]onths?   : "month" or "months" (case insensitive for the first letter)
        pattern = r'(\d+(?:-\d+)?\s*[Mm]onths?)'
        match = re.search(pattern, text)
        return match.group(1) if match else ""
    
    df[col] = df[col].apply(extract_months)
    return df

def move_months_values(df, source_col, target_col):
    """
    For a given DataFrame, this function extracts text patterns matching durations expressed in months
    (e.g., "32 months", "43 Month", "23 Months", "22Months", or "21-29months") from the source column,
    moves the extracted text to the target column, and removes it from the source column.
    
    Parameters:
        df         : pandas DataFrame.
        source_col : string, the name of the column to extract the month text from.
        target_col : string, the name of the column where the extracted month text will be moved.
        
    Returns:
        The updated DataFrame with the month values moved.
    """
    # Pattern explanation:
    #   \d+          : one or more digits
    #   (?:-\d+)?    : optionally, a hyphen and one or more digits (to capture ranges like 21-29)
    #   \s*          : optional whitespace
    #   [Mm]onths?   : "month" or "months" (case insensitive for the first letter)
    pattern = r'(\d+(?:-\d+)?\s*[Mm]onths?)'
    
    def process_text(text):
        text = str(text)
        match = re.search(pattern, text)
        if match:
            extracted = match.group(1)
            # Remove the extracted text from the source text and clean up extra spaces
            updated_text = re.sub(pattern, "", text).strip()
            return extracted, updated_text
        else:
            return "", text

    # Prepare lists to store the extracted month text and the updated source text
    extracted_vals = []
    updated_source_vals = []
    
    for val in df[source_col]:
        ext, updated = process_text(val)
        extracted_vals.append(ext)
        updated_source_vals.append(updated)
    
    # Create/update the target column with the extracted month text
    df[target_col] = extracted_vals
    # Replace the source column values with the text after removal of the month text
    df[source_col] = updated_source_vals
    
    return df



# Filter numeric costs in 'estimated_cost_x_1000' and 'escalated_cost_x_1000' columns
df = filter_numeric_costs(df, 'unnamed_11')

df = filter_numeric_costs(df, 'unnamed_10')

df = filter_numeric_costs(df, 'estimated')

df = filter_numeric_costs(df, 'estimated cost x')


df = extract_months_values(df, 'estimated_1')
#df = move_months_values(df, 'unnamed_13', 'estimated time to construct')





df['cost_allocation_factor']= None


df = extract_cost_allocation(df, "unnamed_8", "cost_allocation_factor")

df = extract_cost_allocation(df, "unnamed_9", "cost_allocation_factor")

# Create the 'description' column from 'cost allocation factor'
#if 'unnamed_9' in df.columns:
 #  df['unnamed_9'] = df['unnamed_9'].apply(extract_non_numeric_text)
  # df['unnamed_9'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values


#if 'unnamed_8' in df.columns:
 #  df['unnamed_8'] = df['unnamed_8'].apply(extract_non_numeric_text)
  # df['unnamed_8'] = df['cost_allocation_factor'].apply(move_non_numeric_text)  # Clear moved values   




######################################################################################################################################
########################################
#STEP 1 MERGE COLUMNS

def merge_columns(df):
    merge_columns_dict = {

        "upgrade": [
            "upgrade",
             "unnamed_4",
             "eastern area sp",
            ],

        "capacity": [
            "capacity",
            "MW",
            
        ],   

        "description": ["description",
                         "unnamed_6" , "unnamed_7"],

        "estimated_time_to_construct": [ 
            
            "estimated time to construct", "6 month", "unnamed_17",
                                         "84 month", "24 month", "48 month", "estimated_1","60 month","12 month", 
                                         "3648 month"
                                         ],

        "type_of_upgrade_2": ["unnamed_1", "delivery network upgrade", "reliability network upgrade",],



        "estimated_cost_x_1000": [
            

            "estimated cost x 1000",
            "estimated_cost",
            "estimated cost",
            "estimated",
            "estimated cost x",
            
            "estimated cost x 1000 constant dollar_1",
            "estimated cost x 1000 constant dollar",
            "estimated cost x 1000 constant",
            "estimated cost x 1000 constant dollar_1",
            "estimated cost x",
            "allocated_cost",
            "assigned cost",
            "allocated cost",
            "sum of allocated constant cost",
            "unnamed_13",
            "unnamed_11",
            "unnamed_10",

             
        ],    


        "escalated_cost_x_1000": [
            "escalated costs x 1000",
            
            "escalated cost x 1000 constant dollar",
            "estimated cost x 1000 escalated",
            "allocated cost escalated",
            "estimated cost x 1000 escalated without itcca",
            "escalated cost x 1000",
            "sum of allocated escalated cost",
            "assigned cost escalated",
            
             

        ],

         

        "total_estimated_cost_x_1000": [
            "total nu cost",
            "total cost constant"
        ],
        "total_estimated_cost_x_1000_escalated": [
            "total estimated cost x 1000 escalalted",
            "total estimated cost x 1000 escalated"
        ],
       
         

        

        
        "cost_allocation_factor": [
            "cost_allocation_factor",
            "cost allocation factor",
            "cost allocation",
            "cost allocatio n factor",
            "cost allocati on factor",
            "project allocation",
            "percent allocation",
           

        ],
       
    }

    # Identify unnamed columns
    unnamed_columns = [col for col in df.columns if pd.isna(col) or col.strip() == "" or col.startswith("Unnamed")]
    if unnamed_columns:
        merge_columns_dict["description"].extend(unnamed_columns)

    for new_col, old_cols in merge_columns_dict.items():
        existing_cols = [col for col in old_cols if col in df.columns]
        if existing_cols:
            df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
            cols_to_drop = [col for col in existing_cols if col != new_col]
            if cols_to_drop:
                df.drop(columns=cols_to_drop, inplace=True)

    return df

df = merge_columns(df)


######################################################################################################################################
########################################
# STEP 2: REMOVE DOLLAR SIGNED VALUES FROM 'estimated_time_to_construct'
######## Other clean up

def remove_dollar_values(value):
    """Remove dollar amounts (e.g., $3625.89, $3300) from 'estimated_time_to_construct'."""
    if isinstance(value, str) and re.search(r"^\$\d+(\.\d{1,2})?$", value.strip()):
        return None  # Replace with None if it's a dollar-signed number
    return value.strip() if isinstance(value, str) else value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(remove_dollar_values)


## Remove ranodm number in Total row:    
# Apply cleaning function to "upgrade" column after merging
#if 'upgrade' in df.columns:
 #   df['upgrade'] = df['upgrade'].apply(clean_total_entries)


 
 

    
######################################################################################################################################
########################################
# STEP 3: DROP UNNEEDED COLUMNS

#df.drop(['unnamed_3', 'unnamed_15', 'unnamed_18', 'unnamed_16', 'estimated cost x 1000 escalated with itcca'], axis=1, inplace=True, errors='ignore')

df.drop(['unnamed_3','unnamed_5', 'unnamed_8', 'unnamed_9','unnamed_12', 'unnamed_14', 'unnamed_15', 'unnamed_16','unnamed_18','unnamed_19' , 'see prior study','type of'], axis=1, inplace=True)



######################################################################################################################################
########################################
#STEP 4: NAMING CONVENTION
def convert_to_snake_case(column_name):
    column_name = column_name.strip().lower()
    column_name = re.sub(r'[\s\-]+', '_', column_name)
    column_name = re.sub(r'[^\w]', '', column_name)
    return column_name

def clean_string_cell(value):
    if isinstance(value, str):
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = value.replace('\n', ' ').strip()
    return value

df = df.map(clean_string_cell)
df.columns = [convert_to_snake_case(col) for col in df.columns]



# Convert estimated_time_to_construct to integer (remove decimals) and keep NaNs as empty
#df['estimated_time_to_construct'] = pd.to_numeric(df['estimated_time_to_construct'], errors='coerce').apply(lambda x: int(x) if pd.notna(x) else None)


 
def process_dataframe(df):
    """
    Processes the DataFrame as follows:
    
    1. Drops any rows where any of these columns are empty or blank:
       - 'upgrade', 'description', 'cost_allocation_factor',
         'estimated_time_to_construct', 'type_of_upgrade_2', 'estimated_cost_x_1000'
    
    2. For each remaining row, if the value in 'type_of_upgrade' starts with
       'SCE', 'SDG&E', or 'PG&E' (or is empty after stripping),
       then the value in 'type_of_upgrade_2' is replaced with the value from 'type_of_upgrade'.
       
    Parameters:
        df: pandas DataFrame.
        
    Returns:
        A cleaned DataFrame with the above processing applied.
    """
    # Define the required columns
    required_cols = [
        "upgrade", "description", "cost_allocation_factor",
        "estimated_time_to_construct", "type_of_upgrade_2", "estimated_cost_x_1000"
    ]
    
       # Make a copy of the DataFrame to avoid modifying the original
    df_clean = df.copy()
    
    # Replace NaN with empty strings for checking emptiness
    df_clean[required_cols] = df_clean[required_cols].fillna("")

    # Convert all required columns to strings and strip whitespace
    df_clean[required_cols] = df_clean[required_cols].applymap(lambda x: str(x).strip())
    
    
 # Drop rows where all required columns are empty
    df_clean = df_clean[~(df_clean[required_cols].apply(lambda row: all(row == ""), axis=1))]
    
 
    
    # Define a function to update type_of_upgrade_2 if needed.
    def update_type(row):
        # Get the value from type_of_upgrade (converted to string and stripped)
        val = str(row.get("type_of_upgrade", "")).strip()
        # If the value is empty or starts with SCE, SDG&E, or PG&E, then update type_of_upgrade_2
        if val == "" or re.match(r'^(SCE|SDG&E|PG&E)', val):
            row["type_of_upgrade"] = row["type_of_upgrade_2"]
        return row


    # Apply the function row-wise
    df_clean = df_clean.apply(update_type, axis=1)
    
    return df_clean


df = process_dataframe(df)


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "type_of_upgrade_2",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_cost_x_1000",
        "escalated_cost_x_1000",
        "total_estimated_cost_x_1000",
        "total_estimated_cost_x_1000_escalated",
        "estimated_time_to_construct",
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df     



df = reorder_columns(df)

df= df[df['q_id']!= '667']
 

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].replace("", np.nan).ffill() 


df= df[df['type_of_upgrade']!= '12. Local Furnishing Bonds']
df= df[df['type_of_upgrade']!= '(when applicable):']




df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/03_raw/cluster_1_4_style_R.csv', index=False)

######################################################################################################################################
########################################
#STEP 5: REMOVING TOTAL ROW, AS THE PDFS GIVE TOTAL NETWORK COST RATHER THAN BY RNU, LDNU AS WE HAD BEFORE
# Remove rows where upgrade is "Total" (case-insensitive)

df['tot'] = df.apply(
    lambda row: 'yes' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) 
        ) else 'no',
    axis=1
) 

# Now extract ONLY "Total" rows with a foolproof match
total_rows_df = df[df['tot'] == 'yes']

total_rows_df = total_rows_df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()
total_rows_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/02_intermediate/costs_phase_1_cluster_1_4_style_R_total_network.csv', index=False) 
df = df[df['type_of_upgrade'].str.strip().str.lower() != 'total']
df = df[df['type_of_upgrade'].str.strip().str.lower() != 'total cost']
 
df.drop('tot', axis=1, inplace= True)






######################################################################################################################################
########################################
# STEP 6: Move upgrade phrases like IRNU from upgrade column to a new column upgrade_classificatio and also replace type_of_upgrade with LDNU, CANU



# Define the list of phrases for upgrade classification
upgrade_phrases = ["IRNU", "GRNU", "CANU-D", "IRNU-A", "LDNU", "CANU-GR", "PNU", "CANU"]

 



#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 14/03_raw/cluster_14_style_Q.csv', index=False)  


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )


mappings = {
    "PTO’s Interconnection Facilities (Note 2)": "PTO_IF",
    "PTO’s Interconnectio n Facilities (Note 2)": "PTO_IF",
    "PTOs Interconnection Facilities": "PTO_IF",
    "PTOs Interconnectio n Facilities": "PTO_IF",
 "PTO": 'PTO_IF',
 "PNU": "OPNU",
 "Delivery Network Upgrades": "LDNU",
 "Delivery Network": "ADNU",
 "Plan of Service Reliability Network Upgrades": "RNU",
 "Distribution Upgrades": "LDNU",
 "PG&E Reliability Network Upgrades": "RNU",
 "SDG&E Delivery Network Upgrades": "LDNU",
 "SCE Delivery Upgrades": "LDNU",
 "SCE Distribution Upgrades": "LDNU",
 "SCE Reliability Network Upgrades for Short Circuit duty": "RNU",
 "SCE Network Upgrades": "RNU",
 "Plan of Service Distribution Upgrades": "LDNU",
 "PG&E Delivery Network Upgrades": "LDNU",
 "SCE Delivery Network Upgrades": "LDNU",
 "Upgrades, Estimated Costs, and Estimated Time to Construct Summary for C565 - Continued": "LDNU",
 "Upgrades, Estimated Costs, and Estimated Time to Construct Summary for C565 -": "LDNU",
 "Reliability Network Upgrades to Physically Interconnect": "RNU",

 "Reliability Network Upgrades": "RNU",
    "Local Delivery Network Upgrades": "LDNU",
    "Area Deliverability Upgrades": "ADNU",
    "Escalated Cost and Time to Construct for Interconnection Facilities, Reliability Network Upgrades, and Delivery Network Upgrades": "LDNU",
    "Distribution": "ADNU",
'Total PTO_IF': 'PTO_IF',
 'Total RNU': 'RNU',
 'Total LDNU': 'LDNU',
 'Total OPNU' : 'OPNU',
 'Total CANU': 'CANU',
 'Total LOPNU': 'LOPNU',
 'Total ADNU': 'ADNU',
}





if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].ffill()    






######################################################################################################################################
########################################
#STEP 7: Stable sort type of upgrade

def stable_sort_by_type_of_upgrade(df):
    """Performs a stable sort within each q_id to order type_of_upgrade while preserving row order in other columns."""
    
    # Define the custom sorting order for type_of_upgrade
    type_order = {"PTO_IF": 1, "RNU": 2, "LDNU": 3, "PNU": 4, "ADNU": 5}

    # Assign a numerical sorting key; use a high number if type_of_upgrade is missing
    df['sort_key'] = df['type_of_upgrade'].map(lambda x: type_order.get(x, 99))

    # Perform a stable sort by q_id first, then by type_of_upgrade using the custom order
    df = df.sort_values(by=['q_id', 'sort_key'], kind='stable').drop(columns=['sort_key'])

    return df

# Apply stable sorting
  


df = df.groupby(['q_id', 'type_of_upgrade', 'upgrade'], as_index=False).first()


df= reorder_columns(df)

#df = stable_sort_by_type_of_upgrade(df)  
#df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 14/03_raw/cluster_14_style_Q.csv', index=False)
######################################################################################################################################
########################################
# STEP 8: Remove $ signs and convert to numeric
 

def clean_currency(value):
    """
    Cleans a string by explicitly removing $, *, (Note 2), and similar patterns,
    then converts it to a numeric value.
    """
    if isinstance(value, str):
        # Explicitly remove $, *, and any "(Note ...)"
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
        value = value.replace(',', '').strip()  # Remove commas and extra spaces
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA  # Return NaN for invalid entries
    




# Clean the specific columns
for col in ['estimated_cost_x_1000',]:
    if col in df.columns:
        df[col] = df[col].apply(clean_currency)






######################################################################################################################################
########################################
# STEP 9: Create Total rows


df['item'] = df.apply(
    lambda row: 'no' if (
        (pd.notna(row.get('type_of_upgrade')) and 'Total' in str(row['type_of_upgrade'])) or
        (pd.notna(row.get('cost_allocation_factor')) and 'Total' in str(row['cost_allocation_factor']))
    ) else 'yes',
    axis=1
)


  

# Create Total rows
new_rows = []
columns_to_sum = ['estimated_cost_x_1000']
columns_to_populate = ['cluster', 'req_deliverability', 'latitude', 'longitude', 'capacity', 'point_of_interconnection']

for q_id, group in df.groupby('q_id', as_index=False):
    print(f"\nProcessing q_id: {q_id}")  # Debug print
    unique_upgrades = group['type_of_upgrade'].dropna().unique()
    for upgrade in unique_upgrades:
        if pd.isna(upgrade):
            continue
        
        rows = group[group['type_of_upgrade'] == upgrade]

        # Debug: Print current group
        print(f"\nChecking Upgrade: {upgrade}, Total Rows Present?:", 
              ( (group['item'] == 'no')).any())

        # Check if a Total row already exists for this (q_id, upgrade)
        total_exists = ((group['item'] == 'no')).any()
        
        if total_exists:
            print(f"Skipping Total row for {upgrade} (already exists).")
            continue
        
        total_row = {col: '' for col in df.columns}  # Initialize all columns as empty strings
        total_row['q_id'] = q_id
        total_row['type_of_upgrade'] = f"Total {upgrade}"
        total_row['item'] = 'no'

        # Populate specified columns from the existing row
        first_row = rows.iloc[0]
        for col in columns_to_populate:
            if col in df.columns:
                total_row[col] = first_row[col]

        # Sum the numeric columns
        for col in columns_to_sum:
            if col in rows.columns:
                total_row[col] = rows[col].sum()
            else:
                total_row[col] = 0  # Default to 0 if column is missing

        print(f"Creating Total row for {upgrade}")  # Debug print
        new_rows.append(total_row)

# Convert list to DataFrame and append
if new_rows:
    total_rows_df = pd.DataFrame(new_rows)
    print("\nNew Total Rows Created:\n", total_rows_df)  # Debug print
    df = pd.concat([df, total_rows_df], ignore_index=True)

df.reset_index(drop=True, inplace=True)


if 'type_of_upgrade' in df.columns:
  
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: mappings.get(x, x) if isinstance(x, str) else x
    )


df = stable_sort_by_type_of_upgrade(df)

 


#: Move 'item' column next to 'type_of_upgrade'
if 'item' in df.columns and 'type_of_upgrade' in df.columns:
    cols = df.columns.tolist()
    item_index = cols.index('item')
    type_index = cols.index('type_of_upgrade')
    if item_index < type_index:
        cols.insert(type_index + 1, cols.pop(item_index))
    else:
        cols.insert(type_index + 1, cols.pop(item_index))
    df = df[cols]




#  Remove "Total" values from cost_allocation_factor if they appear in type_of_upgrade
if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if (
            pd.notna(row['type_of_upgrade']) and 'Total' in str(row['type_of_upgrade'])
        ) else row.get('cost_allocation_factor'),
        axis=1
    )
    

if 'cost_allocation_factor' in df.columns and 'type_of_upgrade' in df.columns:
    df['cost_allocation_factor'] = df.apply(
        lambda row: None if 'Total' in str(row.get('cost_allocation_factor', '')) else row.get('cost_allocation_factor'),
        axis=1
    )






def clean_estimated_time(value):
    """
    Removes the word 'month' or 'months' (case insensitive) from the value.
    Leaves behind any numbers or number ranges (e.g. "6", "6-12").
    """
    if isinstance(value, str):
        # Remove 'month' or 'months' (case-insensitive), optionally with spaces around them.
        cleaned_value = re.sub(r'(?i)\s*months?\s*', '', value)
        
        return cleaned_value.strip()
    return value





# Then apply it to your column, for example with Pandas:
df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)


def clean_estimated_time(value):
    if isinstance(value, str):
         
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
    return value

if 'estimated_time_to_construct' in df.columns:
    df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_estimated_time)

if 'type_of_upgrade' in df.columns:
    df['type_of_upgrade'] = df['type_of_upgrade'].apply(
        lambda x: re.sub(r'\s*\(note \d+\)', '', x, flags=re.IGNORECASE).strip() if isinstance(x, str) else x
    )

if 'type_of_upgrade' in df.columns:
    previous_type_of_upgrade = None
    for i in range(len(df)):
        if df.at[i, 'type_of_upgrade'] == 'Total':
            if previous_type_of_upgrade is not None:
                df.at[i, 'type_of_upgrade'] = previous_type_of_upgrade
        else:
            previous_type_of_upgrade = df.at[i, 'type_of_upgrade']

numeric_columns = [
    'cost_allocation_factor',
    'estimated_cost_x_1000',
    'estimated_time_to_construct',
    'total_estimated_cost_x_1000_escalated',
    'adnu_cost_rate_x_1000',
    'escalated_cost_x_1000',
    'estimated_cost_x_1000_escalated_without_itcca',
    'adnu_cost_rate_x_1000_escalated'
]
non_numeric_columns = ['type_of_upgrade', 'upgrade', 'description']

for col in non_numeric_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 'None' if pd.isna(x) else x)

for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace('-', pd.NA)
        df[col] = df[col].fillna(0)

if 'original_order' in df.columns and 'q_id' in df.columns:
    df['original_order'] = df.index
    df = df.sort_values(by=['q_id', 'original_order'], ascending=[True, True])
    df = df.drop(columns=['original_order'])


if 'upgrade' in df.columns:
    df['upgrade'] = df['upgrade'].ffill()      


df.drop('type_of_upgrade_2', axis=1, inplace=True, errors='ignore') 

#df= reorder_columns(df)


# Save itemized and totals separately
if 'item' in df.columns:
    itemized_df = df[df['item'] == 'yes']
    itemized_df = itemized_df.drop_duplicates(subset=['q_id', 'type_of_upgrade', 'upgrade'])
    itemized_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/02_intermediate/costs_phase_1_cluster_1_4_style_R_itemized.csv', index=False)

    totals_columns = ['upgrade', 'description', 'cost_allocation_factor', 'estimated_time_to_construct']
    totals_df = df[df['item'] == 'no'].drop(columns=totals_columns, errors='ignore')
    #totals_df = totals_df.drop_duplicates(subset=['q_id', 'type_of_upgrade', 'upgrade'])
    totals_df.to_csv('/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/02_intermediate/costs_phase_1_cluster_1_4_style_R_total.csv', index=False)

print(f"Itemized rows saved to 'costs_phase_1_cluster_5_style_D_itemized.csv'.")
print(f"Filtered Total rows saved to 'costs_phase_1_cluster_5_style_D_total.csv'.")



if 'type_of_upgrade' in df.columns:
    print(df['type_of_upgrade'].unique())

if 'q_id' in df.columns:
    print(df['q_id'].unique())

if 'cluster' in df.columns:
    print(df['cluster'].unique())


#df.to_csv('Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 14/03_raw/rawdata_cluster14_style_Q.csv')
#


  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]



Processing q_id: 493

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 510

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 561

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU, Total Rows Present?: False
Creating Total row for RNU

Processing q_id: 565

Checking Upgrade: LDNU, Total Rows Present?: False
Creating Total row for LDNU

Checking Upgrade: PTO_IF, Total Rows Present?: False
Creating Total row for PTO_IF

Checking Upgrade: RNU

  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df[new_col] = df[existing_cols].bfill(axis=1).iloc[:, 0]
  df_clean[required_cols] = df_clean[required_cols].applymap(lambda x: str(x).strip())
  df[col] = df[col].fillna(0)


# Checking Scraped data

In [63]:
import pandas as pd
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# ---------------------- Configuration ---------------------- #

# Paths to the CSV files
ITEMIZED_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/02_intermediate/costs_phase_1_cluster_1_4_style_R_itemized.csv'
TOTALS_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/02_intermediate/costs_phase_1_cluster_1_4_style_R_total.csv'

# Columns in totals_df that hold the reported total costs
TOTALS_ESTIMATED_COLUMN = 'estimated_cost_x_1000'
TOTALS_ESCALATED_COLUMN = 'estimated_cost_x_1000'

# Upgrade types to check
REQUIRED_UPGRADES = ['PTO_IF', 'RNU', 'LDNU', 'ADNU']

# Output paths
MISMATCHES_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/mismatches.csv'
MATCHED_QIDS_CSV_PATH = '/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_1_cost_data/Cluster 3/matched_qids.csv'

# ---------------------- Load Data ---------------------- #

def load_csv(path, dataset_name):
    """
    Loads a CSV file into a pandas DataFrame.
    """
    try:
        df = pd.read_csv(path)
        print(f"Loaded {dataset_name} from {path}")
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {path}")
        exit(1)
    except Exception as e:
        print(f"Error loading {dataset_name}: {e}")
        exit(1)

# Load datasets
itemized_df = load_csv(ITEMIZED_CSV_PATH, "itemized data")
totals_df = load_csv(TOTALS_CSV_PATH, "totals data")

# ---------------------- Data Cleaning ---------------------- #

def clean_text(df, column):
    """
    Cleans text data by stripping leading/trailing spaces and converting to uppercase.
    """
    if column in df.columns:
        df[column] = df[column].astype(str).str.strip().str.upper()
    else:
        print(f"Warning: '{column}' column is missing in the dataset. Filling with 'UNKNOWN'.")
        df[column] = 'UNKNOWN'
    return df

# Clean 'type_of_upgrade' and 'point_of_interconnection' in both datasets
itemized_df = clean_text(itemized_df, 'type_of_upgrade')
itemized_df = clean_text(itemized_df, 'point_of_interconnection')

totals_df = clean_text(totals_df, 'type_of_upgrade')
totals_df = clean_text(totals_df, 'point_of_interconnection')

# ---------------------- Data Preparation ---------------------- #

# Ensure necessary columns exist in itemized_df
required_itemized_columns = ['q_id', 'type_of_upgrade', 'point_of_interconnection', 'estimated_cost_x_1000', 'escalated_cost_x_1000']
for col in required_itemized_columns:
    if col not in itemized_df.columns:
        print(f"Warning: '{col}' column is missing in the itemized dataset.")
        if col in ['q_id', 'type_of_upgrade', 'point_of_interconnection']:
            itemized_df[col] = 'UNKNOWN'
        else:
            itemized_df[col] = 0

# Ensure necessary columns exist in totals_df
required_totals_columns = ['q_id', 'type_of_upgrade', 'point_of_interconnection', TOTALS_ESTIMATED_COLUMN, TOTALS_ESCALATED_COLUMN]
for col in required_totals_columns:
    if col not in totals_df.columns:
        print(f"Error: '{col}' column is missing in the totals dataset. Cannot proceed.")
        exit(1)

# Convert cost columns to numeric, coercing errors to NaN and filling with 0
cost_columns_itemized = ['estimated_cost_x_1000', 'escalated_cost_x_1000']
for col in cost_columns_itemized:
    itemized_df[col] = pd.to_numeric(itemized_df[col], errors='coerce').fillna(0)

cost_columns_totals = [TOTALS_ESTIMATED_COLUMN, TOTALS_ESCALATED_COLUMN]
for col in cost_columns_totals:
    totals_df[col] = pd.to_numeric(totals_df[col], errors='coerce').fillna(0)

# ---------------------- Calculate Manual Totals ---------------------- #

# Group itemized data by q_id and type_of_upgrade and calculate sums
itemized_grouped = itemized_df.groupby(['q_id', 'type_of_upgrade']).agg({
    'estimated_cost_x_1000': 'sum',
    'escalated_cost_x_1000': 'sum'
}).reset_index()

# Apply preference: Use estimated_cost_x_1000 if sum > 0, else use escalated_cost_x_1000
itemized_grouped['manual_total'] = itemized_grouped.apply(
    lambda row: row['estimated_cost_x_1000'] if row['estimated_cost_x_1000'] > 0 else row['escalated_cost_x_1000'],
    axis=1
)

# ---------------------- Prepare Totals Data ---------------------- #

# Group totals data by q_id and type_of_upgrade and calculate sums
totals_grouped = totals_df.groupby(['q_id', 'type_of_upgrade']).agg({
    TOTALS_ESTIMATED_COLUMN: 'sum',
    TOTALS_ESCALATED_COLUMN: 'sum'
}).reset_index()

# Apply preference: Use estimated_cost_x_1000 if sum > 0, else use escalated_cost_x_1000
totals_grouped['reported_total'] = totals_grouped.apply(
    lambda row: row[TOTALS_ESTIMATED_COLUMN] if row[TOTALS_ESTIMATED_COLUMN] > 0 else row[TOTALS_ESCALATED_COLUMN],
    axis=1
)

# ---------------------- Merge Data ---------------------- #

# Merge the itemized and totals data on q_id and type_of_upgrade
comparison_df = pd.merge(
    itemized_grouped,
    totals_grouped[['q_id', 'type_of_upgrade', 'reported_total']],
    on=['q_id', 'type_of_upgrade'],
    how='left'
)

# ---------------------- Check for Missing Upgrades ---------------------- #

# Identify q_ids that are missing any of the required upgrades
missing_upgrades_report = []
for q_id in comparison_df['q_id'].unique():
    upgrades_present = comparison_df[comparison_df['q_id'] == q_id]['type_of_upgrade'].unique()
    missing_upgrades = [upgrade for upgrade in REQUIRED_UPGRADES if upgrade not in upgrades_present]
    if missing_upgrades:
        missing_upgrades_report.append((q_id, missing_upgrades))

# Report missing upgrades
if missing_upgrades_report:
    print("\nQ_ids with missing upgrades:")
    for q_id, missing in missing_upgrades_report:
        print(f"  Q_id {q_id} is missing upgrades: {', '.join(missing)}")
else:
    print("\nAll q_ids have all required upgrades.")


# ------------------------ Check for no duplicates in type of upgrade in total data ------------------------ #

 

# Identify duplicates by grouping by q_id and type_of_upgrade
duplicates = totals_df[totals_df.duplicated(subset=['q_id', 'type_of_upgrade'], keep=False)]

if not duplicates.empty:
    print("\nDuplicate upgrade types detected:")
    for q_id, group in duplicates.groupby('q_id'):
        upgrade_types = group['type_of_upgrade'].unique()
        print(f"  Q_id {q_id} has repeated upgrade types: {', '.join(upgrade_types)}")
else:
    print("\nNo type of upgrade is repeated for any Q_id.")    

# ---------------------- Compare Totals and Identify Mismatches ---------------------- #

# Initialize list to store mismatches
mismatches = []

# Iterate through each row to compare manual_total with reported_total
for index, row in comparison_df.iterrows():
    q_id = row['q_id']
    upgrade = row['type_of_upgrade']
    manual_total = row['manual_total']
    reported_total = row['reported_total']
    
    # Determine if both manual_total and reported_total are zero
    if manual_total == 0.0 and reported_total == 0.0:
        continue  # No mismatch
    # Determine if manual_total is zero and reported_total is missing or zero
    elif manual_total == 0.0 and (pd.isna(row['reported_total']) or reported_total == 0.0):
        continue  # No mismatch
    # If reported_total is missing (NaN) and manual_total is not zero
    elif pd.isna(row['reported_total']) and manual_total != 0.0:
        print(f"Mismatch: Q_id {q_id}, Upgrade '{upgrade}' - Manual Total: {manual_total}, Reported Total: Missing")
        mismatches.append({
            'q_id': q_id,
            'type_of_upgrade': upgrade,
            'manual_total': manual_total,
            'reported_total': 'Missing'
        })
    # If manual_total is not zero and reported_total is zero
    elif manual_total != 0.0 and reported_total == 0.0:
        print(f"Mismatch: Q_id {q_id}, Upgrade '{upgrade}' - Manual Total: {manual_total}, Reported Total: 0.0")
        mismatches.append({
            'q_id': q_id,
            'type_of_upgrade': upgrade,
            'manual_total': manual_total,
            'reported_total': reported_total
        })
    # If both totals are non-zero but differ beyond tolerance
    elif abs(manual_total - reported_total) > 1e+1:
        print(f"Mismatch: Q_id {q_id}, Upgrade '{upgrade}' - Manual Total: {manual_total}, Reported Total: {reported_total}")
        mismatches.append({
            'q_id': q_id,
            'type_of_upgrade': upgrade,
            'manual_total': manual_total,
            'reported_total': reported_total
        })
    # Else, totals match; do nothing

# Create a DataFrame for mismatches
mismatches_df = pd.DataFrame(mismatches, columns=['q_id', 'type_of_upgrade', 'manual_total', 'reported_total'])

# Save mismatches to a CSV file
try:
    mismatches_df.to_csv(MISMATCHES_CSV_PATH, index=False)
    print(f"\nMismatches saved to '{MISMATCHES_CSV_PATH}'.")
except Exception as e:
    print(f"Error saving mismatches CSV: {e}")

# ---------------------- Point of Interconnection Matching ---------------------- #

# Extract unique q_id and point_of_interconnection from itemized dataset
itemized_poi = itemized_df[['q_id', 'point_of_interconnection']].drop_duplicates()

# Extract unique q_id and point_of_interconnection from totals dataset
totals_poi = totals_df[['q_id', 'point_of_interconnection']].drop_duplicates()

# Merge both to have a complete list of q_id and point_of_interconnection
all_poi = pd.concat([itemized_poi, totals_poi]).drop_duplicates().reset_index(drop=True)

# ---------------------- Direct Match Identification ---------------------- #

# Group by point_of_interconnection to find q_ids sharing the same point_of_interconnection
direct_matches = all_poi.groupby('point_of_interconnection')['q_id'].apply(list).reset_index()

# Filter groups with more than one q_id (i.e., shared points_of_interconnection)
direct_matches = direct_matches[direct_matches['q_id'].apply(len) > 1]

print("\nDirect Matches (Exact Point of Interconnection Names):")
if not direct_matches.empty:
    print(direct_matches)
else:
    print("No direct matches found.")

# ---------------------- Fuzzy Match Identification ---------------------- #

# Prepare list of points_of_interconnection for fuzzy matching
poi_list = all_poi['point_of_interconnection'].unique().tolist()

# Initialize list to store fuzzy matches
fuzzy_matches = []

# Iterate through each point_of_interconnection to find similar ones
for i, poi in enumerate(poi_list):
    # Compare with the rest of the points to avoid redundant comparisons
    similar_pois = process.extract(poi, poi_list[i+1:], scorer=fuzz.token_set_ratio)
    
    # Filter matches with similarity >= 80%
    for match_poi, score in similar_pois:
        if score >= 80:
            # Retrieve q_ids for both points_of_interconnection
            qids_poi1 = all_poi[all_poi['point_of_interconnection'] == poi]['q_id'].tolist()
            qids_poi2 = all_poi[all_poi['point_of_interconnection'] == match_poi]['q_id'].tolist()
            
            # Append the matched pairs with their points_of_interconnection and similarity score
            fuzzy_matches.append({
                'point_of_interconnection_1': poi,
                'q_ids_1': qids_poi1,
                'point_of_interconnection_2': match_poi,
                'q_ids_2': qids_poi2,
                'similarity_score': score
            })

# Convert fuzzy matches to DataFrame
fuzzy_matches_df = pd.DataFrame(fuzzy_matches)

print("\nFuzzy Matches (>=80% Similarity in Point of Interconnection):")
if not fuzzy_matches_df.empty:
    print(fuzzy_matches_df)
else:
    print("No fuzzy matches found.")

# ---------------------- Save Matched Q_ids to CSV ---------------------- #

# For clarity, create a combined DataFrame for direct and fuzzy matches

# Direct matches: list each pair of q_ids sharing the same point_of_interconnection
direct_matches_expanded = []
for _, row in direct_matches.iterrows():
    qids = row['q_id']
    poi = row['point_of_interconnection']
    # Generate all possible unique pairs
    for i in range(len(qids)):
        for j in range(i+1, len(qids)):
            direct_matches_expanded.append({
                'match_type': 'Direct',
                'point_of_interconnection_1': poi,
                'q_id_1': qids[i],
                'point_of_interconnection_2': poi,
                'q_id_2': qids[j],
                'similarity_score': 100
            })

# Fuzzy matches: already have pairs
fuzzy_matches_expanded = []
for _, row in fuzzy_matches_df.iterrows():
    fuzzy_matches_expanded.append({
        'match_type': 'Fuzzy',
        'point_of_interconnection_1': row['point_of_interconnection_1'],
        'q_id_1': row['q_ids_1'],
        'point_of_interconnection_2': row['point_of_interconnection_2'],
        'q_id_2': row['q_ids_2'],
        'similarity_score': row['similarity_score']
    })

# Convert to DataFrame
matched_qids_df = pd.DataFrame(direct_matches_expanded + fuzzy_matches_expanded)

# Save matched q_ids to CSV
try:
    matched_qids_df.to_csv(MATCHED_QIDS_CSV_PATH, index=False)
    print(f"Matched Q_ids saved to '{MATCHED_QIDS_CSV_PATH}'.")
except Exception as e:
    print(f"Error saving matched Q_ids CSV: {e}")

# ---------------------- Summary ---------------------- #

# Print a summary
total_checked = len(comparison_df)
total_mismatches = len(mismatches_df)
print(f"\nTotal checks performed: {total_checked}")
print(f"Total mismatches found: {total_mismatches}")


Loaded itemized data from /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 3/02_intermediate/costs_phase_1_cluster_1_4_style_R_itemized.csv
Loaded totals data from /Users/vk365/Dropbox/Interconnections_data/data/pdf_scraper/output/Cluster 3/02_intermediate/costs_phase_1_cluster_1_4_style_R_total.csv

Q_ids with missing upgrades:
  Q_id 493 is missing upgrades: ADNU
  Q_id 510 is missing upgrades: ADNU
  Q_id 561 is missing upgrades: ADNU
  Q_id 565 is missing upgrades: ADNU
  Q_id 574 is missing upgrades: ADNU
  Q_id 583 is missing upgrades: ADNU
  Q_id 590 is missing upgrades: ADNU
  Q_id 608 is missing upgrades: ADNU
  Q_id 628 is missing upgrades: ADNU
  Q_id 643AM is missing upgrades: ADNU
  Q_id 643AP is missing upgrades: ADNU
  Q_id 643T is missing upgrades: ADNU
  Q_id 649B is missing upgrades: ADNU
  Q_id 649C is missing upgrades: ADNU
  Q_id 650AA is missing upgrades: ADNU
  Q_id 651A is missing upgrades: ADNU
  Q_id 653ED is missing upgrades: LDNU, A