In [9]:
import os
import pandas as pd
import re


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.

    Args:
        df (pd.DataFrame): The DataFrame to reorder.

    Returns:
        pd.DataFrame: The reordered DataFrame.
    """
    desired_order = [
        "q_id",
        "original",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_time_to_construct",
        "estimated_cost",
        "escalated_cost",
        "total_estimated_cost",
        "total_escalated_cost",
        
    ]

    # Start with desired columns that exist in the DataFrame
    existing_desired = [col for col in desired_order if col in df.columns]

    # Then add the remaining columns
    remaining = [col for col in df.columns if col not in existing_desired]

    # Combine the two lists
    new_order = existing_desired + remaining

    # Reorder the DataFrame
    df = df[new_order]

    return df  


def replace_text_with_zero(value):
    """
    Cleans a value by removing percentage symbols, converting numeric values, and replacing text with zero.
    Ignores NA values and retains original values if conversion fails.

    Args:
        value (str/int/float): The value to be processed.

    Returns:
        int/float/str: Cleaned numeric value, original value if conversion fails, or NA if value is missing.
    """
    if pd.isna(value):  # Ignore NA values and return as is
        return value  

    if isinstance(value, str):
        value = value.strip()

        # Remove percentage symbols and convert to a numeric value
        if value.endswith('%'):
            value = value.replace('%', '')  # Remove % symbol

        # If value contains alphabetic characters, consider it as non-numeric and return 0
        if re.search(r'[a-zA-Z]', value):
            return 0
        
 

    try:
        return pd.to_numeric(value, errors='coerce') if value != '' else value
    except (ValueError, TypeError):
        return value  # Keep the original value if conversion fails




columns_to_clean = ['cost_allocation_factor', 'estimated_cost']        


def clean_text(value):
    """
    Cleans a string by explicitly removing unwanted characters and patterns,
    such as '$', '*', and text like '6months' while keeping numeric ranges (e.g., '6-24') intact.
    
    Args:
        value (str): The value to be cleaned.
    
    Returns:
        float/int/str: Cleaned numeric value or original string if numeric patterns are detected.
    """
    if isinstance(value, str):
        # Remove unwanted characters like $, * and "(Note 2)" references
        
     
        
        # Replace "6months", "12months" etc., while preserving numeric ranges like "6-24"
        value = re.sub(r'(\d+)months', r'\1', value, flags=re.IGNORECASE)

    try:
        return pd.to_numeric(value)  # Convert to numeric type where possible
    except ValueError:
        return value  # Return the cleaned string if conversion fails

def clean_currency(value):
    """
    Cleans a string by explicitly removing $, *, (Note 2), and similar patterns,
    then converts it to a numeric value.
    """
    if isinstance(value, str):
        # Explicitly remove $, *, and any "(Note ...)"
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)  # Remove patterns like "(Note 2)"
        #value = re.sub(r'months','', value)
        value = value.replace(',', '').strip()  # Remove commas and extra spaces
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA  # Return NaN for invalid entries


# Clean the specific columns


def create_addendum_list_and_mark_original(root_folder, output_folder):
    """
    Collects all q_ids from itemized_addendums files in each cluster folder, creates an addendum_projects_list.csv,
    and adds an 'original' column to the combined itemized and total datasets based on the presence in the addendum list.

    Args:
        root_folder (str): Path to the root folder containing cluster folders.
        output_folder (str): Path to save the addendum_projects_list.csv and combined datasets.
    """
    addendum_qids = []

    # Step 1: Collect all q_ids from itemized_addendums files
    for cluster_folder in os.listdir(root_folder):
        cluster_path = os.path.join(root_folder, cluster_folder)
        if os.path.isdir(cluster_path):  # Ensure it's a directory
            intermediate_folder = os.path.join(cluster_path, "02_intermediate")
            if os.path.exists(intermediate_folder):  # Check if 02_intermediate exists
                for file_name in os.listdir(intermediate_folder):
                    if "itemized_addendums.csv" in file_name:
                        file_path = os.path.join(intermediate_folder, file_name)
                        print(f"Processing: {file_path}")
                        df = pd.read_csv(file_path)
                        if 'q_id' in df.columns:
                            addendum_qids.extend(df['q_id'].dropna().astype(int).unique())  # Convert to integers

    # Remove duplicates from addendum_qids and sort them
    addendum_qids = sorted(set(addendum_qids))

    # Save addendum_projects_list.csv
    addendum_list_df = pd.DataFrame({'q_id': addendum_qids})
    addendum_list_file = os.path.join(output_folder, "addendum_projects_list.csv")
    addendum_list_df.to_csv(addendum_list_file, index=False)
    print(f"Addendum projects list saved to: {addendum_list_file}")
    print(f"Total number of projects in addendum_projects_list: {len(addendum_qids)}")

    # Step 2: Combine all itemized_updated and total_updated files
    combined_itemized = []
    combined_total = []

    for cluster_folder in os.listdir(root_folder):
        cluster_path = os.path.join(root_folder, cluster_folder)
        if os.path.isdir(cluster_path):  # Ensure it's a directory
            clean_folder = os.path.join(cluster_path, "01_clean")
            if os.path.exists(clean_folder):  # Check if 01_clean exists
                for file_name in os.listdir(clean_folder):
                    file_path = os.path.join(clean_folder, file_name)
                    if "itemized_updated.csv" in file_name:
                        print(f"Loading: {file_path}")
                        itemized_df = pd.read_csv(file_path)
                        combined_itemized.append(itemized_df)
                    elif "total_updated.csv" in file_name:
                        print(f"Loading: {file_path}")
                        total_df = pd.read_csv(file_path)
                        combined_total.append(total_df)



                        

    # Combine all itemized datasets
    if combined_itemized:
        combined_itemized_df = pd.concat(combined_itemized, ignore_index=True)

        # Convert q_id to integers
        combined_itemized_df['q_id'] = combined_itemized_df['q_id'].fillna(0).astype(str)
        

        # Add 'original' column to itemized dataset
        combined_itemized_df['original'] = combined_itemized_df['q_id'].apply(
            lambda qid: 'no' if qid in addendum_qids else 'yes'
        )

        # Reorder columns to place 'original' next to 'q_id'
        if 'q_id' in combined_itemized_df.columns and 'original' in combined_itemized_df.columns:
            cols = list(combined_itemized_df.columns)
            cols.insert(cols.index('q_id') + 1, cols.pop(cols.index('original')))
            combined_itemized_df = combined_itemized_df[cols]

        # Sort by q_id
        combined_itemized_df.drop(['estimate_d_time_to_construc_t','estimated_cost_x_1000_escalated_with_itcca' ,'adnu_cost_rate_x_1000_escalated',  'potential_duration_months',
                                   'none_7', 'none_8', 'network_upgrade_type', 'adnu_cost_rate_escalated_x_1000','upgrade_classification', 'sum_of_reallocated_share',	'sum_of_reallocated_cost_x_1000_constant_dollar_2022',
                                       	'sum_of_reallocated_costs_x_1000_escalated_constant_dollars_od_year',
                                   'ttype_of_upgrade','adnu_cost_rate_x_1000'], axis=1, errors='ignore', inplace=True)
        combined_itemized_df.rename(columns={'estimated_cost_x_1000': 'estimated_cost', 'escalated_cost_x_1000': 'escalated_cost', 'total_estimated_cost_x_1000': 'total_estimated_cost',
                                             'total_estimated_cost_x_1000_escalated':'total_escalated_cost',}, inplace=True)
        
        for col in ['estimated_cost', 'escalated_cost', 'total_estimated_cost', 'total_escalated_cost', ]:
            if col in combined_itemized_df.columns:
                combined_itemized_df[col] = combined_itemized_df[col].apply(clean_currency)


        for col in ['estimated_time_to_construct']:
            if col in combined_itemized_df.columns:
                combined_itemized_df[col]=combined_itemized_df[col].apply(clean_text)   

        for col in columns_to_clean:
            if col in combined_itemized_df.columns:
                combined_itemized_df[col] = combined_itemized_df[col].apply(replace_text_with_zero)      
 
        
        combined_itemized_df=reorder_columns(combined_itemized_df)


        combined_itemized_df = combined_itemized_df.sort_values(by="q_id", kind="stable").reset_index(drop=True)

        # Save the combined itemized dataset
        combined_itemized_file = os.path.join(output_folder, "costs_phase_2_all_clusters_itemized.csv")
        combined_itemized_df.to_csv(combined_itemized_file, index=False)
        print(f"Combined itemized dataset with 'original' column saved to: {combined_itemized_file}")

    # Combine all total datasets
    if combined_total:
        combined_total_df = pd.concat(combined_total, ignore_index=True)

        # Convert q_id to integers
        combined_total_df['q_id'] = combined_total_df['q_id'].fillna(0).astype(str)

        # Add 'original' column to total dataset
        combined_total_df['original'] = combined_total_df['q_id'].apply(
            lambda qid: 'no' if qid in addendum_qids else 'yes'
        )

        # Reorder columns to place 'original' next to 'q_id'
        if 'q_id' in combined_total_df.columns and 'original' in combined_total_df.columns:
            cols = list(combined_total_df.columns)
            cols.insert(cols.index('q_id') + 1, cols.pop(cols.index('original')))
            combined_total_df = combined_total_df[cols]

        combined_total_df.drop(['estimate_d_time_to_construc_t','estimated_cost_x_1000_escalated_with_itcca' ,'adnu_cost_rate_x_1000_escalated',  'potential_duration_months',
                                   'none_7', 'none_8', 'network_upgrade_type', 'adnu_cost_rate_escalated_x_1000','upgrade_classification', 'sum_of_reallocated_share',	'sum_of_reallocated_cost_x_1000_constant_dollar_2022',
                                       	'sum_of_reallocated_costs_x_1000_escalated_constant_dollars_od_year',
                                   'ttype_of_upgrade','adnu_cost_rate_x_1000'], errors='ignore', axis=1, inplace=True)
        combined_total_df.rename(columns={'estimated_cost_x_1000': 'estimated_cost', 'escalated_cost_x_1000': 'escalated_cost', 'total_estimated_cost_x_1000': 'total_estimated_cost',
                                             'total_estimated_cost_x_1000_escalated':'total_escalated_cost',}, inplace=True)
        
        for col in ['estimated_cost', 'escalated_cost', 'total_estimated_cost', 'total_escalated_cost', ]:
            if col in combined_total_df.columns:
                combined_total_df[col] = combined_total_df[col].apply(clean_currency)

        for col in ['estimated_time_to_construct']:
            if col in combined_total_df.columns:
                combined_total_df[col]=combined_total_df[col].apply(clean_text)  
                
        for col in columns_to_clean:
            if col in combined_total_df.columns:
                combined_total_df[col] = combined_total_df[col].apply(replace_text_with_zero)      

         

        # Columns to clean by converting text to 0
        #columns_to_clean = ['cost_allocation_factor', 'estimated_cost']

        # Convert non-numeric values to 0
        #combined_total_df[columns_to_clean] = combined_total_df[columns_to_clean].apply(pd.to_numeric, errors='coerce').fillna(0)        
        
 
        
        combined_total_df=reorder_columns(combined_total_df)    

        # Sort by q_id
        combined_total_df = combined_total_df.sort_values(by="q_id", kind="stable").reset_index(drop=True)

        # Save the combined total dataset
        combined_total_file = os.path.join(output_folder, "costs_phase_2_all_clusters_total.csv")
        combined_total_df.to_csv(combined_total_file, index=False)
        print(f"Combined total dataset with 'original' column saved to: {combined_total_file}")

# Define the root folder and output folder
root_folder = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/"  # Update with your root folder path
output_folder = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/all_clusters"  # Update with your output folder path

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Run the function
create_addendum_list_and_mark_original(root_folder, output_folder)


Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 7/02_intermediate/costs_phase_2_cluster_7_style_Q_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 13/02_intermediate/costs_phase_2_cluster_13_style_Q_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 13/02_intermediate/costs_phase_2_cluster_13_style_others_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 13/02_intermediate/costs_phase_2_cluster_13_style_N_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_sty

ValueError: invalid literal for int() with base 10: '643W'

# Allows for non numeric q_ids

In [2]:
import os
import pandas as pd
import re


def reorder_columns(df):
    """
    Reorders the columns of the DataFrame based on the specified order.
    """
    desired_order = [
        "q_id",
        "original",
        "cluster",
        "req_deliverability",
        "latitude",
        "longitude",
        "capacity",
        "point_of_interconnection",
        "type_of_upgrade",
        "upgrade",
        "description",
        "cost_allocation_factor",
        "estimated_time_to_construct",
        "estimated_cost",
        "escalated_cost",
        "total_estimated_cost",
        "total_escalated_cost",
    ]

    existing_desired = [col for col in desired_order if col in df.columns]
    remaining = [col for col in df.columns if col not in existing_desired]
    df = df[existing_desired + remaining]
    return df  


def replace_text_with_zero(value):
    """
    Cleans a value by removing percentage symbols, converting numeric values,
    and replacing text with zero.
    """
    if pd.isna(value):
        return value  
    if isinstance(value, str):
        value = value.strip()
        if value.endswith('%'):
            value = value.replace('%', '')
        if re.search(r'[a-zA-Z]', value):
            return 0
    try:
        return pd.to_numeric(value, errors='coerce') if value != '' else value
    except (ValueError, TypeError):
        return value


columns_to_clean = ['cost_allocation_factor', 'estimated_cost']        


def clean_text(value):
    """
    Cleans a string by removing unwanted patterns while preserving numeric ranges.
    """
    if isinstance(value, str):
        value = re.sub(r'(\d+)months', r'\1', value, flags=re.IGNORECASE)
    try:
        return pd.to_numeric(value)
    except ValueError:
        return value


def clean_currency(value):
    """
    Removes $, *, (Note X), commas, then converts to numeric.
    """
    if isinstance(value, str):
        value = value.replace('$', '').replace('*', '')
        value = re.sub(r'\(Note \d+\)', '', value)
        value = value.replace(',', '').strip()
    try:
        return pd.to_numeric(value)
    except ValueError:
        return pd.NA


def create_addendum_list_and_mark_original(root_folder, output_folder):
    """
    Collects all q_ids from itemized_addendums, writes a master list,
    then combines and cleans both itemized and total files per cluster.
    """
    addendum_qids = []

    # Step 1: collect addendum q_ids as strings (don't cast to int)
    for cluster_folder in os.listdir(root_folder):
        cluster_path = os.path.join(root_folder, cluster_folder)
        if os.path.isdir(cluster_path):
            interm = os.path.join(cluster_path, "02_intermediate")
            if os.path.exists(interm):
                for fn in os.listdir(interm):
                    if "itemized_addendums.csv" in fn:
                        path = os.path.join(interm, fn)
                        print(f"Processing: {path}")
                        df = pd.read_csv(path)
                        if 'q_id' in df.columns:
                            addendum_qids.extend(df['q_id'].dropna().astype(str).unique())

    addendum_qids = sorted(set(addendum_qids))
    pd.DataFrame({'q_id': addendum_qids})\
      .to_csv(os.path.join(output_folder, "addendum_projects_list.csv"), index=False)
    print(f"Total in addendum list: {len(addendum_qids)}")

    # Step 2: load itemized and total files
    combined_itemized = []
    combined_total = []
    for cluster_folder in os.listdir(root_folder):
        cluster_path = os.path.join(root_folder, cluster_folder)
        if os.path.isdir(cluster_path):
            clean_folder = os.path.join(cluster_path, "01_clean")
            if os.path.exists(clean_folder):
                for fn in os.listdir(clean_folder):
                    path = os.path.join(clean_folder, fn)
                    if "itemized_updated.csv" in fn:
                        print(f"Loading: {path}")
                        combined_itemized.append(pd.read_csv(path))
                    elif "total_updated.csv" in fn:
                        print(f"Loading: {path}")
                        combined_total.append(pd.read_csv(path))

    # Combine itemized
    if combined_itemized:
        df = pd.concat(combined_itemized, ignore_index=True)
        df['q_id'] = df['q_id'].fillna(0).astype(str)
        df['original'] = df['q_id'].apply(lambda q: 'no' if q in addendum_qids else 'yes')

        cols = list(df.columns)
        cols.insert(cols.index('q_id')+1, cols.pop(cols.index('original')))
        df = df[cols]
        df.drop([
            'estimate_d_time_to_construc_t','estimated_cost_x_1000_escalated_with_itcca',
            'adnu_cost_rate_x_1000_escalated','potential_duration_months',
            'none_7','none_8','network_upgrade_type',
            'adnu_cost_rate_escalated_x_1000','upgrade_classification',
            'sum_of_reallocated_share','sum_of_reallocated_cost_x_1000_constant_dollar_2022',
            'sum_of_reallocated_costs_x_1000_escalated_constant_dollars_od_year',
            'ttype_of_upgrade','adnu_cost_rate_x_1000',
            'project_size_mw',
            'sum_of_reallocated_cost_x_1000_constant_dollar_2024',	'sum_ofreallocated_costs_x_1000_escalated_constant_dollars_od_year',	
            'unnamed_15',
            'other_potential_network_upgrades', 'upgrade_classification_grnu_irnu',
        ], axis=1, errors='ignore', inplace=True)
        df.rename(columns={
            'estimated_cost_x_1000': 'estimated_cost',
            'escalated_cost_x_1000': 'escalated_cost',
            'total_estimated_cost_x_1000': 'total_estimated_cost',
            'total_estimated_cost_x_1000_escalated':'total_escalated_cost',
        }, inplace=True)

        for c in ['estimated_cost','escalated_cost','total_estimated_cost','total_escalated_cost']:
            if c in df: df[c] = df[c].apply(clean_currency)
        if 'estimated_time_to_construct' in df:
            df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_text)
            df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(replace_text_with_zero)
        for c in columns_to_clean:
            if c in df: df[c] = df[c].apply(replace_text_with_zero)

        df = reorder_columns(df)

        # → sort by numeric part of q_id then full string
        df['q_id_num'] = (
            df['q_id']
              .str.extract(r'(\d+)', expand=False)
              .astype(float)
              .fillna(0)
              .astype(int)
        )
        df = (
            df
              .sort_values(['q_id_num', 'q_id'], kind='stable')
              .drop(columns='q_id_num')
              .reset_index(drop=True)
        )

        out1 = os.path.join(output_folder, "costs_phase_2_all_clusters_itemized.csv")
        df.to_csv(out1, index=False)
        print(f"Saved itemized to: {out1}")

    # Combine total
    if combined_total:
        df = pd.concat(combined_total, ignore_index=True)
        df['q_id'] = df['q_id'].fillna(0).astype(str)
        df['original'] = df['q_id'].apply(lambda q: 'no' if q in addendum_qids else 'yes')

        cols = list(df.columns)
        cols.insert(cols.index('q_id')+1, cols.pop(cols.index('original')))
        df = df[cols]
        df.drop([
            'estimate_d_time_to_construc_t','estimated_cost_x_1000_escalated_with_itcca',
            'adnu_cost_rate_x_1000_escalated','potential_duration_months',
            'none_7','none_8','network_upgrade_type',
            'adnu_cost_rate_escalated_x_1000','upgrade_classification',
            'sum_of_reallocated_share','sum_of_reallocated_cost_x_1000_constant_dollar_2022',
            'sum_of_reallocated_costs_x_1000_escalated_constant_dollars_od_year',
            'ttype_of_upgrade','adnu_cost_rate_x_1000',
                        'project_size_mw',
            'sum_of_reallocated_cost_x_1000_constant_dollar_2024',	'sum_ofreallocated_costs_x_1000_escalated_constant_dollars_od_year',	
            'unnamed_15',
            'other_potential_network_upgrades', 'upgrade_classification_grnu_irnu',
        ], axis=1, errors='ignore', inplace=True)
        df.rename(columns={
            'estimated_cost_x_1000': 'estimated_cost',
            'escalated_cost_x_1000': 'escalated_cost',
            'total_estimated_cost_x_1000': 'total_estimated_cost',
            'total_estimated_cost_x_1000_escalated':'total_escalated_cost',
        }, inplace=True)

        for c in ['estimated_cost','escalated_cost','total_estimated_cost','total_escalated_cost']:
            if c in df: df[c] = df[c].apply(clean_currency)
        if 'estimated_time_to_construct' in df:
            df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(clean_text)
            df['estimated_time_to_construct'] = df['estimated_time_to_construct'].apply(replace_text_with_zero)
        for c in columns_to_clean:
            if c in df: df[c] = df[c].apply(replace_text_with_zero)

        df = reorder_columns(df)

        # → sort by numeric part of q_id then full string
        df['q_id_num'] = (
            df['q_id']
              .str.extract(r'(\d+)', expand=False)
              .astype(float)
              .fillna(0)
              .astype(int)
        )
        df = (
            df
              .sort_values(['q_id_num', 'q_id'], kind='stable')
              .drop(columns='q_id_num')
              .reset_index(drop=True)
        )

        out2 = os.path.join(output_folder, "costs_phase_2_all_clusters_total.csv")
        df.to_csv(out2, index=False)
        print(f"Saved total to: {out2}")


# Define folders & run
root_folder = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/"
output_folder = "/Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/all_clusters"
os.makedirs(output_folder, exist_ok=True)
create_addendum_list_and_mark_original(root_folder, output_folder)


Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 7/02_intermediate/costs_phase_2_cluster_7_style_Q_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 13/02_intermediate/costs_phase_2_cluster_13_style_Q_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 13/02_intermediate/costs_phase_2_cluster_13_style_others_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 13/02_intermediate/costs_phase_2_cluster_13_style_N_itemized_addendums.csv
Processing: /Users/vk365/Dropbox/Interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data/Cluster 14/02_intermediate/costs_phase_2_cluster_14_sty