<a href="https://colab.research.google.com/github/victorialovefranklin/Toward-Climate-Resilient-Energy-Systems/blob/main/EAGLE_I_Data_Integration_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **EAGLE-I (2014-2023) Data Integration & Cleaning**

## **Task:** Load the 10 yearly Environment for Analysis of Geo-Located Energy Information (EAGLE-I) dataset files covering 2014 to 2023. Export cleaned files with the state FIPS and county FIPS codes separated into their own columns.

In [12]:
# ==============================
# EAGLE-I (2014–2023) — Data Integration & Cleaning
# ==============================

import os
import warnings
import pandas as pd
import numpy as np

# Suppress all Python warnings raised while this script runs.
warnings.filterwarnings("ignore")

# -------- Input files: one raw CSV per year, 2014 through 2023 --------
# Maps each year to the path of that year's raw outage CSV.
YEAR_TO_PATH = {
    year: f"/content/eaglei_outages_{year}.csv" for year in range(2014, 2024)
}

# NOTE: the merged output file was removed; no CLEANED_OUTPUT_FILE is defined.

# Show the full list of inputs up front.
print("Planned files for analysis (2014–2023):")
for year, csv_path in YEAR_TO_PATH.items():
    print(f" - {year}: {csv_path}")

# -------- Process, Clean, and Export Data Per Year --------
# For every (year, path) pair in YEAR_TO_PATH: load the CSV, detect the relevant
# columns, drop incomplete rows, split the FIPS code into state/county parts,
# parse timestamps, coerce the customer counts to numbers, and write one cleaned
# CSV per year. Every file gets exactly one entry in `file_report`.
file_report = []          # one summary dict per input file (year, path, row counts, status)
missing_files = []        # input paths that were not found on disk
total_original_rows = 0   # rows read across all files, before cleaning
total_cleaned_rows = 0    # rows written across all successfully exported files
key_columns_used = set()  # names of the columns used for the missing-value drop

# -------- Column-name candidates --------
# Checked in order; the first name present in a file wins. They are the same for
# every file, so they are defined once here rather than inside the loop.
customers_candidates = ["customers_out", "customers", "cust_out", "customers_outage", "outage_customers"]
fips_candidates      = ["county_fips", "fips", "fips_code", "county_fips_code", "GEOID"]
county_name_candidates = ["county", "county_name"]
lat_candidates       = ["lat", "latitude", "y", "Lat", "Latitude"]
lon_candidates       = ["lon", "longitude", "x", "Lon", "Longitude", "long"]
time_candidates      = ["run_start_time", "start_time", "time", "timestamp", "datetime"]

for yr, fp in YEAR_TO_PATH.items():
    # Guard clause: record missing inputs and move on.
    if not os.path.exists(fp):
        missing_files.append(fp)
        file_report.append({"year": yr, "path": fp, "original_rows": 0, "cleaned_rows": 0, "status": "MISSING"})
        continue

    # Reset per file so an error raised while reading never reports the row
    # count left over from the previous file.
    original_rows = 0
    try:
        df_i = pd.read_csv(fp, low_memory=False)
        original_rows = len(df_i)  # Store original row count
        total_original_rows += original_rows

        # Force the year column to exist and be int; unparseable values fall
        # back to the year this file is registered under.
        if "year" not in df_i.columns:
            df_i["year"] = yr
        else:
            df_i["year"] = pd.to_numeric(df_i["year"], errors="coerce").fillna(yr).astype(int)

        df_i["__source_file"] = os.path.basename(fp)

        # -------- Dynamic Column Detection for the current file --------
        CUSTOMERS_COL   = next((c for c in customers_candidates if c in df_i.columns), None)
        FIPS_COL        = next((c for c in fips_candidates if c in df_i.columns), None)
        COUNTY_NAME_COL = next((c for c in county_name_candidates if c in df_i.columns), None)
        # Latitude/longitude are detected but not used by the cleaning or export below.
        LAT_COL         = next((c for c in lat_candidates if c in df_i.columns), None)
        LON_COL         = next((c for c in lon_candidates if c in df_i.columns), None)
        TIME_COL        = next((c for c in time_candidates if c in df_i.columns), None)

        # Key sanity check: a customers column must exist ("year" is guaranteed above).
        if CUSTOMERS_COL is None:
            print(f"[WARN] Skipping file {fp}: Required columns missing. Detected columns: customers={CUSTOMERS_COL}, year={'year' in df_i.columns}")
            file_report.append({"year": yr, "path": fp, "original_rows": original_rows, "cleaned_rows": 0, "status": "SKIPPED: Missing required columns"})
            continue  # Skip to next file

        # -------- Data Cleaning for the current file --------
        key_cols = [c for c in [FIPS_COL, CUSTOMERS_COL, "year", TIME_COL] if c is not None and c in df_i.columns]
        key_columns_used.update(key_cols)

        before = len(df_i)
        df_cleaned = df_i.dropna(subset=key_cols).copy()
        print(f"[OK] Cleaned {fp}: Dropped rows with missing values in key columns ({before - len(df_cleaned):,} rows removed).")

        # Standardize FIPS if present and split into state and county
        if FIPS_COL is not None and FIPS_COL in df_cleaned.columns:
            # Strip a trailing ".0" (left when the codes were read as floats)
            # and left-pad to at least five characters.
            fips_5 = (
                df_cleaned[FIPS_COL]
                .astype(str)
                .str.replace(r"\.0$", "", regex=True)
                .str.zfill(5)
            )
            # Drop the source column BEFORE adding the derived ones. When the
            # source column is itself named 'county_fips', dropping it afterwards
            # would delete the freshly derived county code.
            df_cleaned = df_cleaned.drop(columns=[FIPS_COL])
            # After zfill(5) the string has at least five characters, so the
            # slices need no further padding.
            df_cleaned["state_fips"] = fips_5.str[:2]
            df_cleaned["county_fips"] = fips_5.str[2:]
            print(f"[OK] Cleaned {fp}: Formatted '{FIPS_COL}' as 5-digit strings and split into 'state_fips' and 'county_fips'.")
            print(f"[OK] Cleaned {fp}: Dropped original '{FIPS_COL}' column.")

        # Standardize time if present; rows whose timestamp cannot be parsed are removed.
        if TIME_COL is not None and TIME_COL in df_cleaned.columns:
            before_time = len(df_cleaned)
            df_cleaned[TIME_COL] = pd.to_datetime(df_cleaned[TIME_COL], errors="coerce", utc=False)
            df_cleaned = df_cleaned.dropna(subset=[TIME_COL])
            print(f"[OK] Cleaned {fp}: Converted '{TIME_COL}' to datetime. Removed {before_time - len(df_cleaned):,} rows with invalid timestamps.")

        # Ensure customers numeric. Missing values were already dropped above,
        # so the fillna(0) only affects entries that are present but non-numeric.
        df_cleaned[CUSTOMERS_COL] = pd.to_numeric(df_cleaned[CUSTOMERS_COL], errors="coerce").fillna(0)

        # Final output columns for cleaned data (include new state and county FIPS if created)
        output_cols = [
            col
            for col in [COUNTY_NAME_COL, CUSTOMERS_COL, "year", TIME_COL, "__source_file"]
            if col and col in df_cleaned.columns
        ]
        output_cols.extend(
            col for col in ("state_fips", "county_fips")
            if col in df_cleaned.columns and col not in output_cols
        )
        df_cleaned = df_cleaned[output_cols].copy()

        # -------- Export Cleaned Data for the current year --------
        cleaned_output_file_yearly = f"/content/eaglei_outages_cleaned_{yr}.csv"
        export_dir = os.path.dirname(cleaned_output_file_yearly)
        if export_dir and not os.path.exists(export_dir):
            # exist_ok avoids a crash if the directory appears between the check and the call.
            os.makedirs(export_dir, exist_ok=True)
            print(f"[INFO] Created directory: {export_dir}")

        # An export failure is reported for this year but does not stop the remaining years.
        try:
            df_cleaned.to_csv(cleaned_output_file_yearly, index=False)
            print(f"[OK] Cleaned data exported for {yr} to: {cleaned_output_file_yearly}")
            file_report.append({"year": yr, "path": fp, "original_rows": original_rows, "cleaned_rows": len(df_cleaned), "status": "EXPORTED"})
            total_cleaned_rows += len(df_cleaned)
        except Exception as e:
            print(f"[WARN] Could not export cleaned data for {yr} to CSV: {e}")
            file_report.append({"year": yr, "path": fp, "original_rows": original_rows, "cleaned_rows": len(df_cleaned), "status": f"EXPORT_ERROR: {e}"})

    except Exception as e:
        # Per-file boundary: log the failure and continue with the next year.
        print(f"[WARN] Error processing file {fp}: {e}")
        file_report.append({"year": yr, "path": fp, "original_rows": original_rows, "cleaned_rows": 0, "status": f"ERROR: {e}"})

# Print a quick report: one row per input file, ordered by year.
print("\n=== File Load and Processing Report ===")
if file_report:
    rep = pd.DataFrame(file_report).sort_values("year")
    print(rep.to_string(index=False))
else:
    print("No files were checked.")

# List the inputs that were not found on disk, if any.
if missing_files:
    print("\n[WARN] Missing files (not found on disk):")
    for m in missing_files:
        print(f" - {m}")

print("\n=== Summary ===")
print(f"Total original rows across all files: {total_original_rows:,}")
print(f"Total rows remaining after cleaning: {total_cleaned_rows:,}")
# key_columns_used is a set, whose iteration order is arbitrary; sort it so the
# summary line is identical from run to run.
print(f"Key columns used for analysis: {', '.join(sorted(key_columns_used))}")


print("\n✅ Complete: Data integration and cleaning for EAGLE-I outage data (2014–2023), saved as individual yearly files with state and county FIPS.")

Planned files for analysis (2014–2023):
 - 2014: /content/eaglei_outages_2014.csv
 - 2015: /content/eaglei_outages_2015.csv
 - 2016: /content/eaglei_outages_2016.csv
 - 2017: /content/eaglei_outages_2017.csv
 - 2018: /content/eaglei_outages_2018.csv
 - 2019: /content/eaglei_outages_2019.csv
 - 2020: /content/eaglei_outages_2020.csv
 - 2021: /content/eaglei_outages_2021.csv
 - 2022: /content/eaglei_outages_2022.csv
 - 2023: /content/eaglei_outages_2023.csv
[OK] Cleaned /content/eaglei_outages_2014.csv: Dropped rows with missing values in key columns (0 rows removed).
[OK] Cleaned /content/eaglei_outages_2014.csv: Formatted 'fips_code' as 5-digit strings and split into 'state_fips' and 'county_fips'.
[OK] Cleaned /content/eaglei_outages_2014.csv: Dropped original 'fips_code' column.
[OK] Cleaned /content/eaglei_outages_2014.csv: Converted 'run_start_time' to datetime. Removed 0 rows with invalid timestamps.
[OK] Cleaned data exported for 2014 to: /content/eaglei_outages_cleaned_2014.csv
