<a href="https://colab.research.google.com/github/victorialovefranklin/Toward-Climate-Resilient-Energy-Systems/blob/main/EAGLE_I_DATA__CLEANING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **EAGLE-I (2014-2023) Data Integration & Cleaning**

## **Task:** Load the 10 yearly Environment for Analysis of Geo-Located Energy Information (EAGLE-I) dataset files covering 2014 to 2023

In [3]:
# ==============================
# EAGLE-I (2014–2023) — Data Integration & Cleaning
# ==============================

import os
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore")

# -------- Yearly input files (one CSV per year, 2014–2023) --------
# Every yearly file follows the same naming pattern, so the mapping is
# generated instead of typed out; keys are inserted in ascending year order.
YEAR_TO_PATH = {
    year: f"/content/eaglei_outages_{year}.csv"
    for year in range(2014, 2024)
}

# Destination of the combined, cleaned dataset.
CLEANED_OUTPUT_FILE = "/content/eaglei_outages_cleaned.csv"

# Echo the plan so the run log shows which files were expected.
print("Planned files for analysis (2014–2023):")
print("\n".join(f" - {year}: {path}" for year, path in YEAR_TO_PATH.items()))

# -------- Load and Concatenate Data --------
frames = []          # successfully loaded per-year DataFrames
file_report = []     # one status record per planned file (OK / MISSING / ERROR)
missing_files = []   # planned paths that were not found on disk

for year, csv_path in YEAR_TO_PATH.items():
    # Files that are absent from disk are recorded and skipped.
    if not os.path.exists(csv_path):
        missing_files.append(csv_path)
        file_report.append({"year": year, "path": csv_path, "loaded_rows": 0, "status": "MISSING"})
        continue

    try:
        year_df = pd.read_csv(csv_path, low_memory=False)

        # Guarantee an integer "year" column: reuse the file's own column when
        # present (unparseable values fall back to the planned year), otherwise
        # stamp every row with the planned year.
        if "year" in year_df.columns:
            numeric_year = pd.to_numeric(year_df["year"], errors="coerce")
            year_df["year"] = numeric_year.fillna(year).astype(int)
        else:
            year_df["year"] = year

        # Keep provenance so each row can be traced back to its file.
        year_df["__source_file"] = os.path.basename(csv_path)
        frames.append(year_df)
        file_report.append({"year": year, "path": csv_path, "loaded_rows": len(year_df), "status": "OK"})
    except Exception as exc:
        # A failing file is reported but does not abort the remaining loads.
        file_report.append({"year": year, "path": csv_path, "loaded_rows": 0, "status": f"ERROR: {exc}"})

# Print a quick report
print("\n=== File Load Report ===")
if not file_report:
    print("No files were checked.")
else:
    rep = pd.DataFrame(file_report).sort_values("year")
    print(rep.to_string(index=False))

if missing_files:
    print("\n[WARN] Missing files (not found on disk):")
    for missing_path in missing_files:
        print(f" - {missing_path}")

# Nothing to analyse if every file was missing or failed to load.
if not frames:
    raise RuntimeError("No CSVs loaded for years 2014–2023. Check file paths or file availability.")

# Concatenate, keep only the 2014–2023 window, and drop exact duplicate rows.
df = pd.concat(frames, ignore_index=True)
in_window = df["year"].between(2014, 2023)
df = df[in_window].copy()
df = df.drop_duplicates()
print(f"\n[OK] Loaded {len(df):,} rows across {len(frames)} files (2014–2023).")

# -------- Dynamic Column Detection --------
# Candidate header names for each logical field. Order matters: the lookup
# helper returns the first candidate that exists, so earlier names win.

# Number of customers without power.
customers_candidates = [
    "customers_out",
    "customers",
    "cust_out",
    "customers_outage",
    "outage_customers",
]

# County FIPS identifier.
fips_candidates = [
    "county_fips",
    "fips",
    "fips_code",
    "county_fips_code",
    "GEOID",
]

# County name.
county_name_candidates = [
    "county",
    "county_name",
]

# Latitude.
lat_candidates = [
    "lat",
    "latitude",
    "y",
    "Lat",
    "Latitude",
]

# Longitude.
lon_candidates = [
    "lon",
    "longitude",
    "x",
    "Lon",
    "Longitude",
    "long",
]

# Observation timestamp.
time_candidates = [
    "run_start_time",
    "start_time",
    "time",
    "timestamp",
    "datetime",
]

def first_existing(frame, candidates):
    """Return the first name in *candidates* that is a column of *frame*.

    Candidates are checked in the order given, so earlier names take
    priority. Returns None when none of the candidates is present.
    """
    for name in candidates:
        if name in frame.columns:
            return name
    return None

# Resolve each logical field to the actual column name present in df
# (None when no candidate name matched).
CUSTOMERS_COL   = first_existing(df, customers_candidates)
FIPS_COL        = first_existing(df, fips_candidates)
COUNTY_NAME_COL = first_existing(df, county_name_candidates)
LAT_COL         = first_existing(df, lat_candidates)
LON_COL         = first_existing(df, lon_candidates)
TIME_COL        = first_existing(df, time_candidates)

# Key sanity check: customers + year must exist.
# The "year" entry is resolved against df.columns: a bare "year" literal is
# never None, so it could not signal a missing column on its own.
required_any = [CUSTOMERS_COL, "year" if "year" in df.columns else None]
if any(col is None for col in required_any):
    raise ValueError(f"Required columns missing. Detected columns: customers={CUSTOMERS_COL}, year={'year' in df.columns}")

# -------- Data Cleaning --------
def _warn_if_years_lost(years_before, frame, stage):
    """Print a warning naming every year that has no rows left in *frame*.

    Args:
        years_before: iterable of int years present before the cleaning step.
        frame: DataFrame with a "year" column, as it is after the step.
        stage: short description of the step, used in the warning text.
    """
    years_after = {int(y) for y in frame["year"].dropna().unique()}
    lost = sorted(set(years_before) - years_after)
    if lost:
        print(
            f"[WARN] {stage} removed every row for year(s): {lost}. "
            "Check that those files use the same column names and value formats as the others."
        )


# A row is only kept when every detected key field is populated.
key_cols = [c for c in [FIPS_COL, CUSTOMERS_COL, "year", TIME_COL] if c is not None]
before = len(df)
years_loaded = {int(y) for y in df["year"].dropna().unique()}
df_cleaned = df.dropna(subset=key_cols).copy()
print(f"[OK] Dropped rows with missing values in key columns ({before - len(df_cleaned):,} rows removed).")
# Make it visible when a whole year disappears instead of dropping it silently.
_warn_if_years_lost(years_loaded, df_cleaned, "Dropping rows with missing key values")

# Standardize FIPS if present
if FIPS_COL is not None:
    df_cleaned[FIPS_COL] = (
        df_cleaned[FIPS_COL]
        .astype(str)
        .str.replace(r"\.0$", "", regex=True)  # strip the ".0" left by float-typed codes
        .str.zfill(5)                          # restore leading zeros
    )
    print(f"[OK] Cleaned and formatted '{FIPS_COL}' as 5-digit strings.")

# Standardize time if present
if TIME_COL is not None:
    before_time = len(df_cleaned)
    years_before_time = {int(y) for y in df_cleaned["year"].dropna().unique()}
    if "__source_file" in df_cleaned.columns and not df_cleaned.empty:
        # Parse each source file separately. pandas (>= 2.0) infers ONE format
        # from the first value of the input, so parsing all files in a single
        # call would coerce to NaT every file whose timestamps are formatted
        # differently from the first file's.
        parsed_parts = [
            pd.to_datetime(part, errors="coerce", utc=False)
            for _, part in df_cleaned.groupby("__source_file", sort=False)[TIME_COL]
        ]
        # Assignment aligns on the row index, restoring the original row order.
        df_cleaned[TIME_COL] = pd.concat(parsed_parts)
    else:
        df_cleaned[TIME_COL] = pd.to_datetime(df_cleaned[TIME_COL], errors="coerce", utc=False)
    df_cleaned.dropna(subset=[TIME_COL], inplace=True)
    print(f"[OK] Converted '{TIME_COL}' to datetime. Removed {before_time - len(df_cleaned):,} rows with invalid timestamps.")
    _warn_if_years_lost(years_before_time, df_cleaned, "Timestamp parsing")

# Ensure customers numeric
# NOTE: values that are present but not numeric become 0 here (rows with a
# missing value were already removed by the key-column drop above).
df_cleaned[CUSTOMERS_COL] = pd.to_numeric(df_cleaned[CUSTOMERS_COL], errors="coerce").fillna(0)

# Final output columns for cleaned data (only those detected and present)
output_cols = [col for col in [FIPS_COL, COUNTY_NAME_COL, CUSTOMERS_COL, "year", TIME_COL, "__source_file"] if col and col in df_cleaned.columns]
df_cleaned = df_cleaned[output_cols].copy()

# -------- Per-year counts to confirm inclusion --------
# Number of cleaned rows per year, as a two-column frame (year, rows_loaded).
rows_per_year = df_cleaned.groupby("year")[CUSTOMERS_COL].size()
year_counts = rows_per_year.reset_index(name="rows_loaded")
print("\n=== Rows Loaded Per Year (post-cleaning) ===")
print(year_counts.to_string(index=False))

# -------- Export Cleaned Data --------
# Export is best-effort: a failure is reported but does not stop the run.
try:
    df_cleaned.to_csv(CLEANED_OUTPUT_FILE, index=False)
except Exception as export_error:
    print(f"[WARN] Could not export cleaned data to CSV: {export_error}")
else:
    print(f"\n[OK] Cleaned data exported to: {CLEANED_OUTPUT_FILE}")

print("\n✅ Complete: Data integration and cleaning for EAGLE-I outage data (2014–2023).")



Planned files for analysis (2014–2023):
 - 2014: /content/eaglei_outages_2014.csv
 - 2015: /content/eaglei_outages_2015.csv
 - 2016: /content/eaglei_outages_2016.csv
 - 2017: /content/eaglei_outages_2017.csv
 - 2018: /content/eaglei_outages_2018.csv
 - 2019: /content/eaglei_outages_2019.csv
 - 2020: /content/eaglei_outages_2020.csv
 - 2021: /content/eaglei_outages_2021.csv
 - 2022: /content/eaglei_outages_2022.csv
 - 2023: /content/eaglei_outages_2023.csv

=== File Load Report ===
 year                             path  loaded_rows status
 2014 /content/eaglei_outages_2014.csv        25856     OK
 2015 /content/eaglei_outages_2015.csv        30748     OK
 2016 /content/eaglei_outages_2016.csv        24333     OK
 2017 /content/eaglei_outages_2017.csv        24578     OK
 2018 /content/eaglei_outages_2018.csv        17897     OK
 2019 /content/eaglei_outages_2019.csv        57796     OK
 2020 /content/eaglei_outages_2020.csv        40257     OK
 2021 /content/eaglei_outages_2021.csv    

 # **Data Integration and Cleaning Results**



**EAGLE-I Outage Data 2014-2023**

Ten files from the Environment for Analysis of Geo-Located Energy Information (EAGLE-I) dataset, covering 2014 to 2023, were manually filtered for the State of California. The 10 files were concatenated and merged, cleaned, and exported as a single file named “eaglei_outages_cleaned.csv”. From those 10 files, 409,920 rows were loaded successfully into the Google Colab Python notebook. Candidate column names such as `customers_out`, `fips_code`, `county`, `lat`, `lon`, and `run_start_time` were used to detect, identify, extract, and normalize column fields across the datasets to ensure a consistent schema. As a result of cleaning, 27,589 rows were removed and 382,331 rows remained.

**Row Counts per Year:** A summary of the number of rows per year after cleaning was produced, confirming the inclusion of data from 2014 to 2017 and 2019 to 2023. Data for 2018 is notably absent from the post-cleaning count, although the 2018 file was reported as loaded (17,897 rows). ORNL added 2024 outage data on 4/10/2025; however, no customer outage coverage was reported for California.

**Table 1: Pre-Cleaning (EAGLE-I 2014–2023)**

| Year | File Path                | Rows Loaded |
|------|--------------------------|-------------|
| 2014 | eaglei_outages_2014.csv  | 25,856      |
| 2015 | eaglei_outages_2015.csv  | 30,748      |
| 2016 | eaglei_outages_2016.csv  | 24,333      |
| 2017 | eaglei_outages_2017.csv  | 24,578      |
| 2018 | eaglei_outages_2018.csv  | 17,897      |
| 2019 | eaglei_outages_2019.csv  | 57,796      |
| 2020 | eaglei_outages_2020.csv  | 40,257      |
| 2021 | eaglei_outages_2021.csv  | 57,876      |
| 2022 | eaglei_outages_2022.csv  | 56,041      |
| 2023 | eaglei_outages_2023.csv  | 74,538      |

**Table 2: Post-Cleaning (EAGLE-I 2014–2023)**

| Year | File Path                | Rows Loaded |
|------|--------------------------|-------------|
| 2014 | eaglei_outages_2014.csv  | 25,856      |
| 2015 | eaglei_outages_2015.csv  | 30,748      |
| 2016 | eaglei_outages_2016.csv  | 24,333      |
| 2017 | eaglei_outages_2017.csv  | 24,578      |
| 2019 | eaglei_outages_2019.csv  | 51,198      |
| 2020 | eaglei_outages_2020.csv  | 39,811      |
| 2021 | eaglei_outages_2021.csv  | 55,228      |
| 2022 | eaglei_outages_2022.csv  | 56,041      |
| 2023 | eaglei_outages_2023.csv  | 74,538      |
