In [1]:
# Initialise findspark before importing anything from pyspark, then bring in
# the session entry point.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Single Spark session used by every cell below; getOrCreate() returns an
# already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("IoT Power Usage Analyzer")
    .getOrCreate()
)

In [2]:
import os

# Show where HADOOP_HOME points (prints None when the variable is not set).
print(os.getenv("HADOOP_HOME"))


C:\hadoop


In [3]:
# Load the raw IoT meter readings; the first row is the header and column
# types are inferred from the data.
# Forward slashes are used on purpose: a backslash here would start an
# (invalid) string escape such as "\i", and "/" matches the output paths used
# further down in this notebook.
iot_df = spark.read.csv("data/iot_power_usage.csv", header=True, inferSchema=True)

In [4]:
# Load the household reference data; the first row is the header and column
# types are inferred from the data.
# Forward slashes are used on purpose: a backslash here would start an
# (invalid) string escape such as "\h", and "/" matches the output paths used
# further down in this notebook.
household_df = spark.read.csv("data/household_info.csv", header=True, inferSchema=True)

In [5]:
# Quick look at the first five meter readings.
iot_df.show(n=5)

+-------------------+------------+-------+---------+-------------+------+
|          timestamp|household_id|voltage|usage_kwh|device_status|region|
+-------------------+------------+-------+---------+-------------+------+
|2025-05-01 00:00:00|       HH000| 229.58|    0.249|           ON|  East|
|2025-05-01 00:00:00|       HH001| 226.37|    0.208|           ON| South|
|2025-05-01 00:00:00|       HH002| 231.92|    0.005|           ON| South|
|2025-05-01 00:00:00|       HH003| 230.77|    0.106|           ON|  East|
|2025-05-01 00:00:00|       HH004| 232.31|    0.072|           ON|  East|
+-------------------+------------+-------+---------+-------------+------+
only showing top 5 rows



In [6]:
# Quick look at the first five household records.
household_df.show(n=5)

+------------+----------+------+------------+
|household_id|owner_name|region|install_date|
+------------+----------+------+------------+
|       HH000| Owner_005|  West|  2024-10-22|
|       HH001| Owner_008|  East|  2024-10-22|
|       HH002| Owner_005|  East|  2024-10-22|
|       HH003| Owner_003|  East|  2024-10-22|
|       HH004| Owner_001|  East|  2024-10-22|
+------------+----------+------+------------+
only showing top 5 rows



In [7]:
# Print the column names and inferred types of the IoT readings.
iot_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- household_id: string (nullable = true)
 |-- voltage: double (nullable = true)
 |-- usage_kwh: double (nullable = true)
 |-- device_status: string (nullable = true)
 |-- region: string (nullable = true)



In [8]:
# Number of rows (readings) loaded into iot_df.
iot_df.count()

8650

In [9]:
# Print the column names and inferred types of the household reference data.
household_df.printSchema()

root
 |-- household_id: string (nullable = true)
 |-- owner_name: string (nullable = true)
 |-- region: string (nullable = true)
 |-- install_date: date (nullable = true)



In [10]:
# Number of rows (households) loaded into household_df.
household_df.count()

10

In [11]:
# Expose the readings to Spark SQL under the view name "iot_usage"
# (the queries below refer to it as both IOT_USAGE and iot_usage).
iot_df.createOrReplaceTempView("iot_usage")


In [12]:
# Expose the household reference data to Spark SQL under the view name
# "household_info", which the join queries below read from.
household_df.createOrReplaceTempView("household_info")

In [13]:
# Peek at the first ten readings through the SQL view; truncate=False keeps
# every cell value at full width.
query = """ SELECT * FROM IOT_USAGE

"""
spark.sql(query).show(n=10, truncate=False)

+-------------------+------------+-------+---------+-------------+------+
|timestamp          |household_id|voltage|usage_kwh|device_status|region|
+-------------------+------------+-------+---------+-------------+------+
|2025-05-01 00:00:00|HH000       |229.58 |0.249    |ON           |East  |
|2025-05-01 00:00:00|HH001       |226.37 |0.208    |ON           |South |
|2025-05-01 00:00:00|HH002       |231.92 |0.005    |ON           |South |
|2025-05-01 00:00:00|HH003       |230.77 |0.106    |ON           |East  |
|2025-05-01 00:00:00|HH004       |232.31 |0.072    |ON           |East  |
|2025-05-01 00:00:00|HH005       |230.27 |0.165    |ON           |West  |
|2025-05-01 00:00:00|HH006       |224.65 |0.303    |ON           |South |
|2025-05-01 00:00:00|HH007       |234.05 |0.135    |OFF          |East  |
|2025-05-01 00:00:00|HH008       |231.21 |0.014    |OFF          |East  |
|2025-05-01 00:00:00|HH009       |234.99 |0.007    |ON           |West  |
+-------------------+------------+-------+---------+-------------+------+

In [14]:
# Ten-row, untruncated preview of the household reference view.
household_preview = spark.sql("""
          SELECT * FROM household_info
""")
household_preview.show(n=10, truncate=False)

+------------+----------+------+------------+
|household_id|owner_name|region|install_date|
+------------+----------+------+------------+
|HH000       |Owner_005 |West  |2024-10-22  |
|HH001       |Owner_008 |East  |2024-10-22  |
|HH002       |Owner_005 |East  |2024-10-22  |
|HH003       |Owner_003 |East  |2024-10-22  |
|HH004       |Owner_001 |East  |2024-10-22  |
|HH005       |Owner_003 |East  |2024-10-22  |
|HH006       |Owner_007 |North |2024-10-22  |
|HH007       |Owner_004 |West  |2024-10-22  |
|HH008       |Owner_003 |South |2024-10-22  |
|HH009       |Owner_007 |East  |2024-10-22  |
+------------+----------+------+------------+



In [18]:
# Total consumption per household: each reading is joined to its household
# record, usage_kwh is summed and rounded to two decimals. The region column
# shown is the one from household_info.
usage_by_household = spark.sql("""
            SELECT hse.household_id
                 , hse.owner_name
                 , hse.region
                 , round(sum(iot.usage_kwh),2) as total_usage
              FROM IOT_USAGE iot
            INNER JOIN household_info hse
              on iot.household_id = hse.household_id
            group by hse.household_id
                   , hse.owner_name
                   , hse.region
""")
usage_by_household.show(truncate=False)

+------------+----------+------+-----------+
|household_id|owner_name|region|total_usage|
+------------+----------+------+-----------+
|HH003       |Owner_003 |East  |170.42     |
|HH006       |Owner_007 |North |177.48     |
|HH004       |Owner_001 |East  |177.56     |
|HH008       |Owner_003 |South |177.99     |
|HH002       |Owner_005 |East  |173.56     |
|HH009       |Owner_007 |East  |172.72     |
|HH007       |Owner_004 |West  |161.08     |
|HH005       |Owner_003 |East  |178.22     |
|HH000       |Owner_005 |West  |177.7      |
|HH001       |Owner_008 |East  |170.75     |
+------------+----------+------+-----------+



In [20]:
# Per-household totals (sum of usage_kwh rounded to two decimals), written with
# Spark's own CSV writer in overwrite mode.
# NOTE(review): in the recorded run this write fails with
# java.lang.UnsatisfiedLinkError in NativeIO$Windows.access0 (see the traceback
# in this cell's output) - presumably the Hadoop native library for Windows is
# missing or mismatched under HADOOP_HOME; confirm before relying on this cell.
household_totals_sql = """
            SELECT hse.household_id
                 , hse.owner_name
                 , hse.region
                 , round(sum(iot.usage_kwh),2) as total_usage
              FROM IOT_USAGE iot
            INNER JOIN household_info hse
              on iot.household_id = hse.household_id
            group by hse.household_id
                   , hse.owner_name
                   , hse.region
"""
household_totals_writer = spark.sql(household_totals_sql).write.mode("overwrite")
household_totals_writer.csv("data/processed/total_usage_by_household", header=True)

Py4JJavaError: An error occurred while calling o71.csv.
: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$writeAndCommit$3(FileFormatWriter.scala:275)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:275)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:392)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:420)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:392)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:860)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:842)


In [21]:
# Per-household totals kept as a Spark DataFrame: readings joined to their
# household record, usage_kwh summed and rounded to two decimals.
household_usage_query = """
            SELECT hse.household_id
                 , hse.owner_name
                 , hse.region
                 , round(sum(iot.usage_kwh),2) as total_usage
              FROM IOT_USAGE iot
            INNER JOIN household_info hse
              on iot.household_id = hse.household_id
            group by hse.household_id
                   , hse.owner_name
                   , hse.region
"""
query_df = spark.sql(household_usage_query)

In [22]:
# Collect the per-household totals from Spark into a pandas DataFrame.
df = query_df.toPandas()

In [23]:
# Preview the first five rows of the collected per-household totals.
df.head(n=5)

Unnamed: 0,household_id,owner_name,region,total_usage
0,HH003,Owner_003,East,170.42
1,HH006,Owner_007,North,177.48
2,HH004,Owner_001,East,177.56
3,HH008,Owner_003,South,177.99
4,HH002,Owner_005,East,173.56


In [24]:
# Persist the per-household totals as a single CSV file via pandas.
# to_csv does not create missing folders, so make sure the target directory
# exists first (a no-op when it is already there). `os` is imported in an
# earlier cell of this notebook.
os.makedirs("data/processed", exist_ok=True)
df.to_csv("data/processed/total_usage_by_household.csv", index=False, header=True)

In [25]:
# Total consumption per region: readings are joined to household_info and
# grouped by the household's region (hse.region), with usage_kwh summed and
# rounded to two decimals.
# NOTE(review): the readings carry their own region column, and in the
# previews above it differs from household_info.region for some households
# (e.g. HH000) - confirm that the household_info value is the intended one.
region_usage_query = """
             SELECT hse.region
                 , round(sum(iot.usage_kwh),2) as total_usage
              FROM IOT_USAGE iot
            INNER JOIN household_info hse
              on iot.household_id = hse.household_id
            group by  hse.region
"""
query_df2 = spark.sql(region_usage_query)

In [26]:
# Collect the regional totals into pandas and preview the first five rows.
df2 = query_df2.toPandas()
df2.head(n=5)

Unnamed: 0,region,total_usage
0,South,177.99
1,East,1043.24
2,West,338.78
3,North,177.48


In [28]:
# Save the regional totals as a CSV file (header row, no index column).
# The folder is spelled lowercase "processed", like the per-household output
# path in this notebook, so the file lands in the same directory on
# case-sensitive filesystems too.
df2.to_csv("data/processed/total_usage_by_region.csv", header=True, index=False)

In [29]:
# Daily consumption per region: readings are joined to household_info, grouped
# by the household's region and the calendar date of the reading, summed and
# rounded to two decimals, then ordered by date and region.
# The result is collected to pandas and saved as one CSV file. The folder is
# spelled lowercase "processed", like the per-household output path in this
# notebook, so the file lands in the same directory on case-sensitive
# filesystems too.
spark.sql("""
        SELECT DATE(iot.timestamp) AS usage_date
             , round(sum(iot.usage_kwh),2) as total_usage
             , hse.region
          FROM IOT_USAGE iot
         INNER JOIN household_info hse
            ON iot.household_id = hse.household_id
         GROUP BY hse.region
             , DATE(iot.timestamp)
         ORDER BY usage_date,hse.region
""").toPandas().to_csv(
    "data/processed/daily_usage_by_region.csv", index=False, header=True
)

In [30]:
# Rank households by total consumption inside their (household_info) region:
# the inner query builds per-household totals, the outer query numbers them
# within each region from highest total (rank 1) downwards. No rows are
# filtered out, so every household appears in the output with its rank.
ranked_usage = spark.sql("""
         SELECT 
        *,
        ROW_NUMBER() OVER (PARTITION BY region ORDER BY total_usage DESC) AS rank_in_region
    FROM (
        SELECT 
            hse.household_id,
            hse.owner_name,
            hse.region,
            ROUND(SUM(iot.usage_kwh), 2) AS total_usage
        FROM 
            iot_usage iot
        JOIN 
            household_info hse
        ON 
            iot.household_id = hse.household_id
        GROUP BY 
            hse.household_id, hse.owner_name, hse.region
    )
""")
# Collect to pandas and save as a single CSV file (header row, no index column).
ranked_usage.toPandas().to_csv(
    "data/processed/top_users_per_region.csv",
    index=False,
    header=True,
)