Loading January, February, March & April meter data.

In [0]:
from pyspark.sql.functions import col

# 1) Load Jan/Feb/Mar/Apr CSVs
# All four monthly exports sit in the same volume folder, so the folder is
# spelled out once and each path is derived from it.
_METER_DATA_DIR = (
    "/Volumes/workspace/default/osu-energy-analysis/"
    "DATA I-O 2026 Advanced Datasets/advanced_core/advanced_core"
)
jan_path = _METER_DATA_DIR + "/meter-readings-jan-2025.csv"
feb_path = _METER_DATA_DIR + "/meter-readings-feb-2025.csv"
mar_path = _METER_DATA_DIR + "/meter-readings-march-2025.csv"
apr_path = _METER_DATA_DIR + "/meter-readings-april-2025.csv"

def load_csv(path):
    """Read one CSV file into a Spark DataFrame.

    The first row is treated as the header and column types are inferred
    from the data. Relies on a `spark` session object already being
    available in the notebook environment.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by the CSV reader.
    """
    reader = spark.read.option("header", "true")
    reader = reader.option("inferSchema", "true")
    return reader.csv(path)

# Read each month's CSV into its own DataFrame.
jan, feb, mar, apr = (
    load_csv(csv_path) for csv_path in (jan_path, feb_path, mar_path, apr_path)
)

# Report how many rows each month contributed.
for label, month_df in (("JAN:", jan), ("FEB:", feb), ("MAR:", mar), ("APR:", apr)):
    print(label, month_df.count(), "rows")

# 2) Combine ALL months
# Columns are matched by name; allowMissingColumns=True lets a month that
# lacks a column be unioned anyway, with nulls in that column.
meters = jan
for month_df in (feb, mar, apr):
    meters = meters.unionByName(month_df, allowMissingColumns=True)

print("TOTAL BEFORE CLEAN:", meters.count())

# 3) Remove nulls in key columns
key_cols = ["readingtime", "readingvalue", "sitename", "utility"]

# (Optional) verify columns exist, so a bad export fails here with a clear
# message instead of deeper inside the cleaning step.
available_cols = set(meters.columns)
missing = [name for name in key_cols if name not in available_cols]
if len(missing) > 0:
    raise ValueError(f"Missing columns in data: {missing}. Available columns: {meters.columns}")

# Keep only the rows where every key column is populated.
meters_clean = meters.dropna(subset=key_cols)

print("TOTAL AFTER CLEAN:", meters_clean.count())

# Sanity check: count what is still null in each key column after the drop.
for c in key_cols:
    nulls = meters_clean.where(col(c).isNull()).count()
    print(f"Nulls remaining in {c}: {nulls}")

# 4) Create schema + Save as Delta Table (THIS CREATES THE TABLE)
spark.sql("CREATE DATABASE IF NOT EXISTS osu_energy")

# Overwrite mode replaces whatever the table held from a previous run.
clean_writer = meters_clean.write.format("delta").mode("overwrite")
clean_writer.saveAsTable("osu_energy.meters_clean")

print(" Saved Delta table: osu_energy.meters_clean")

# 5) Preview saved table
saved_preview = spark.table("osu_energy.meters_clean").limit(10)
display(saved_preview)


Combine months (temporary for this notebook)

In [0]:
from pyspark.sql.functions import col

# NOTE(review): this cell repeats the clean-and-save steps of the first cell
# on the same `meters` frame; it needs that cell to have run first.

# A) Drop nulls in key columns
key_cols = ["readingtime", "readingvalue", "sitename", "utility"]

# Stop early if the combined frame lacks a column the cleaning relies on.
present_cols = set(meters.columns)
missing = [name for name in key_cols if name not in present_cols]
if len(missing) > 0:
    raise ValueError(f"Missing columns in data: {missing}. Available columns: {meters.columns}")

# Keep only the rows where every key column is populated.
meters_clean = meters.dropna(subset=key_cols)

print("TOTAL BEFORE CLEAN:", meters.count())
print("TOTAL AFTER  CLEAN:", meters_clean.count())

# Optional sanity check (should be 0)
for c in key_cols:
    print(f"Nulls remaining in {c}:", meters_clean.where(col(c).isNull()).count())

clean_preview = meters_clean.limit(10)
display(clean_preview)

# B) Save as Delta table (creates table)
spark.sql("CREATE DATABASE IF NOT EXISTS osu_energy")

clean_writer = meters_clean.write.format("delta").mode("overwrite")
clean_writer.saveAsTable("osu_energy.meters_clean")

print("Saved Delta table: osu_energy.meters_clean")

# confirm it saved
saved_preview = spark.table("osu_energy.meters_clean").limit(10)
display(saved_preview)




Parse the timestamp & usage columns and build daily/monthly trends

In [0]:
import pyspark.sql.functions as F


# 0) Make sure DB exists
spark.sql("CREATE DATABASE IF NOT EXISTS osu_energy")


# 1) Drop old tables (prevents schema mismatch)
for drop_stmt in (
    "DROP TABLE IF EXISTS osu_energy.meters_clean",
    "DROP TABLE IF EXISTS osu_energy.daily_trend",
    "DROP TABLE IF EXISTS osu_energy.monthly_trend",
):
    spark.sql(drop_stmt)


# 2) Save BASE cleaned meter data (raw-level table)
base_writer = meters_clean.write.format("delta").mode("overwrite")
base_writer.saveAsTable("osu_energy.meters_clean")


# 3) Build trends (make sure these match your column names)
#    meters_clean should already have: readingtime, readingvalue, sitename, utility

# Working frame: "ts" is the reading time converted to a timestamp and
# "usage" is the reading value cast to double.
df = meters_clean.withColumn("ts", F.to_timestamp(F.col("readingtime")))
df = df.withColumn("usage", F.col("readingvalue").cast("double"))

# Both trends report the same two measures for each group.
usage_measures = (
    F.sum("usage").alias("total_usage"),
    F.count("*").alias("n_readings"),
)

# One row per (day, site, utility).
readings_by_day = df.withColumn("day", F.date_trunc("day", F.col("ts")))
daily_trend = readings_by_day.groupBy("day", "sitename", "utility").agg(*usage_measures)

# One row per (month, site, utility).
readings_by_month = df.withColumn("month", F.date_trunc("month", F.col("ts")))
monthly_trend = readings_by_month.groupBy("month", "sitename", "utility").agg(*usage_measures)


# 4) Save trends as separate Delta tables
for trend_df, trend_table in (
    (daily_trend, "osu_energy.daily_trend"),
    (monthly_trend, "osu_energy.monthly_trend"),
):
    trend_df.write.format("delta").mode("overwrite").saveAsTable(trend_table)

print("Saved Delta tables:")
for saved_line in (
    " - osu_energy.meters_clean",
    " - osu_energy.daily_trend",
    " - osu_energy.monthly_trend",
):
    print(saved_line)

# 5) Quick verify
tables_in_db = spark.sql("SHOW TABLES IN osu_energy")
tables_in_db.show(truncate=False)

# Preview each saved table; the trend previews show the most recent periods first.
base_preview = spark.table("osu_energy.meters_clean").limit(10)
latest_days = spark.table("osu_energy.daily_trend").orderBy(F.desc("day")).limit(10)
latest_months = spark.table("osu_energy.monthly_trend").orderBy(F.desc("month")).limit(10)

display(base_preview)
display(latest_days)
display(latest_months)


Create month/day/hour fields

In [0]:
from pyspark.sql.functions import month, dayofmonth, hour

# Name of the raw reading column shown in the preview below. It is assigned
# here because, on a fresh kernel run top to bottom, no earlier cell has
# defined it yet (the spikes cell further down assigns the same value), so
# the preview used to fail with a NameError.
energy_col = "readingvalue"

# Add the calendar parts of the parsed timestamp "ts" as their own columns.
# withColumn replaces a same-named column, so re-running this cell is safe.
df = (df
      .withColumn("month", month(col("ts")))
      .withColumn("day", dayofmonth(col("ts")))
      .withColumn("hour", hour(col("ts"))))

display(df.select("ts", "month", "day", "hour", energy_col).limit(10))


Campus-wide monthly trend

In [0]:

# Save monthly trend as Delta table

spark.sql("CREATE DATABASE IF NOT EXISTS osu_energy")

# Overwrite mode replaces the table contents written by any earlier run.
monthly_writer = monthly_trend.write.format("delta").mode("overwrite")
monthly_writer.saveAsTable("osu_energy.monthly_trend")

print("Saved table: osu_energy.monthly_trend")

# Preview saved table
monthly_saved = spark.table("osu_energy.monthly_trend")
display(monthly_saved)


Campus-wide hourly trend (peak hour insight)

In [0]:

# Build the hourly trend. `hourly_trend` was not defined anywhere before this
# cell, so the save below failed with a NameError on a fresh kernel.
# The hour is derived from "ts" right here, so this cell only needs the
# working frame `df` (with its "ts" and "usage" columns) and does not depend
# on the month/day/hour cell having run.
# NOTE(review): readings are summed across all sites (campus-wide) but kept
# per utility, presumably because different utilities are not comparable on
# one scale. Confirm this grouping matches the intended "peak hour" view.
hourly_trend = (df
    .withColumn("hour", F.hour(F.col("ts")))
    .groupBy("hour", "utility")
    .agg(
        F.sum("usage").alias("total_usage"),
        F.count("*").alias("n_readings")
    )
)

# Save hourly trend as Delta table

spark.sql("CREATE DATABASE IF NOT EXISTS osu_energy")

# overwriteSchema lets the overwrite succeed even when a table with a
# different column layout is left over from an earlier run (this table is
# not among the ones dropped earlier in the notebook).
(hourly_trend.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .saveAsTable("osu_energy.hourly_trend"))

print("Saved table: osu_energy.hourly_trend")

# Preview saved table
display(spark.table("osu_energy.hourly_trend").orderBy("hour", "utility"))


Daily trend

In [0]:
# Save daily_trend as Delta table
spark.sql("CREATE DATABASE IF NOT EXISTS osu_energy")

# Overwrite mode replaces the table contents written by any earlier run.
daily_writer = daily_trend.write.format("delta").mode("overwrite")
daily_writer.saveAsTable("osu_energy.daily_trend")

print("Saved table: osu_energy.daily_trend")

# Preview saved table (use the REAL columns in your table)
daily_saved = spark.table("osu_energy.daily_trend")
daily_preview = daily_saved.sort("day", "utility", "sitename").limit(200)
display(daily_preview)


Identify spikes (top 20 highest readings)

In [0]:
from pyspark.sql.functions import col

# Force correct energy column
energy_col = "readingvalue"

# Pair each timestamp with its reading as a double, then rank the readings
# from largest to smallest.
readings = df.select("ts", col(energy_col).cast("double").alias("energy_value"))
spikes = readings.orderBy("energy_value", ascending=False)

# Show only the 20 highest readings.
display(spikes.limit(20))
