In [1]:
import pyspark
from pyspark.sql import SparkSession

# Step 1: Create a SparkSession

In [14]:
# Step 1: Create a SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Fire Incident Analysis") \
    .getOrCreate()

# Step 2: Load Data
df = spark.read.csv("clean_firedata.csv", header=True, inferSchema=True)

In [15]:
# Step 2: Data Exploration

In [16]:
# Step 2: Data Exploration
print("Schema:")
df.printSchema()

print("First few rows:")
df.show(5)

Schema:
root
 |-- Year: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- IncType: string (nullable = true)
 |-- ParentPropertyType: string (nullable = true)
 |-- NumFireDeaths: integer (nullable = true)
 |-- NumAllFireInjuries: integer (nullable = true)
 |-- IncGeo_BoroughName: string (nullable = true)
 |-- IncGeo_WardName: string (nullable = true)
 |-- IgnitionSource: string (nullable = true)
 |-- ItemFirstIgnited: string (nullable = true)
 |-- LocationFireStarted: string (nullable = true)
 |-- ApplianceManufacturer: string (nullable = true)
 |-- MainCauseModel: string (nullable = true)

First few rows:
+----+-----+------------+--------------------+-------------+------------------+------------------+-------------------+--------------------+--------------------+-------------------+---------------------+--------------------+
|Year|Month|     IncType|  ParentPropertyType|NumFireDeaths|NumAllFireInjuries|IncGeo_BoroughName|    IncGeo_WardName|      IgnitionSource|    It

# Step 3: Data Processing (Example: Filtering out irrelevant columns)

In [18]:
# Filtering out columns
relevant_columns = ['Year', 'NumFireDeaths', 'NumAllFireInjuries', 'IncGeo_BoroughName', 'IncGeo_WardName', 'IgnitionSource', 'LocationFireStarted','ApplianceManufacturer']
df_processed = df.select(*relevant_columns)

# Step 4: Data Querying

In [19]:
# Return all records where a death occured

df_processed.createOrReplaceTempView("fire_incidents")
result = spark.sql("SELECT * FROM fire_incidents WHERE NumFireDeaths > 0 ORDER BY NumFireDeaths DESC")
print("Fire incidents with deaths:")
result.show()

Fire incidents with deaths:
+----+-------------+------------------+--------------------+----------------+--------------------+--------------------+---------------------+
|Year|NumFireDeaths|NumAllFireInjuries|  IncGeo_BoroughName| IncGeo_WardName|      IgnitionSource| LocationFireStarted|ApplianceManufacturer|
+----+-------------+------------------+--------------------+----------------+--------------------+--------------------+---------------------+
|2017|           71|               109|Kensington and ch...|    Notting dale|fridge/freezer - ...|             kitchen|             Hotpoint|
|2011|            6|                 2|               Brent|     Dollis hill|      fridge/freezer|       corridor/hall|            Whirlpool|
|2023|            2|                 0|              Harrow|         Edgware|tumble dryer - co...|             kitchen|             Hotpoint|
|2010|            1|                 0|              Merton|    Lower morden|                NULL|             kitchen| 

In [6]:
# Return all records where a death occured and sort by the number of injuries

result = spark.sql("SELECT * FROM fire_incidents WHERE NumAllFireInjuries > 0 ORDER BY NumAllFireInjuries DESC")
print("Fire incidents with Injuries:")
result.show()



Fire incidents with Injuries:
+----+-------------+------------------+--------------------+--------------------+--------------------+--------------------+---------------------+
|Year|NumFireDeaths|NumAllFireInjuries|  IncGeo_BoroughName|     IncGeo_WardName|      IgnitionSource| LocationFireStarted|ApplianceManufacturer|
+----+-------------+------------------+--------------------+--------------------+--------------------+--------------------+---------------------+
|2017|           71|               109|Kensington and ch...|        Notting dale|fridge/freezer - ...|             kitchen|             Hotpoint|
|2010|            0|                19|            Lewisham|            Deptford|      fridge/freezer|             kitchen|                 Beko|
|2014|            0|                 7|          Wandsworth|         Falconbrook|fridge/freezer - ...|             kitchen|             Hotpoint|
|2018|            0|                 7|Barking and dagenham|           Becontree|fridge - free

In [20]:
# Return all records where a death occured and injuries occured sorting by number of deaths

result = spark.sql("SELECT * FROM fire_incidents WHERE NumFireDeaths > 0 AND NumAllFireInjuries > 0 ORDER BY NumFireDeaths DESC")
print("Fire incidents with deaths and injuries:")
result.show()



Fire incidents with deaths and injuries:
+----+-------------+------------------+--------------------+----------------+--------------------+--------------------+---------------------+
|Year|NumFireDeaths|NumAllFireInjuries|  IncGeo_BoroughName| IncGeo_WardName|      IgnitionSource| LocationFireStarted|ApplianceManufacturer|
+----+-------------+------------------+--------------------+----------------+--------------------+--------------------+---------------------+
|2017|           71|               109|Kensington and ch...|    Notting dale|fridge/freezer - ...|             kitchen|             Hotpoint|
|2011|            6|                 2|               Brent|     Dollis hill|      fridge/freezer|       corridor/hall|            Whirlpool|
|2010|            1|                 3|              Harrow|Wealdstone south|      fridge/freezer|             kitchen|                 Beko|
|2014|            1|                 1|      Waltham forest|       Markhouse|fridge/freezer - ...|under sta

In [8]:
# Return bourough and a count of the number of recorded fires sorting largest first

result = spark.sql("SELECT IncGeo_BoroughName, COUNT(*) AS incident_count FROM fire_incidents GROUP BY IncGeo_BoroughName ORDER BY incident_count DESC LIMIT 10")
print("Fire incidents with BoroughName:")
result.show()



Fire incidents with BoroughName:
+------------------+--------------+
|IncGeo_BoroughName|incident_count|
+------------------+--------------+
|           Croydon|           247|
|            Barnet|           243|
|         Southwark|           194|
|          Lewisham|           189|
|           Lambeth|           185|
|            Ealing|           184|
|       Westminster|           178|
|        Wandsworth|           177|
|           Bromley|           171|
|        Hillingdon|           160|
+------------------+--------------+



In [9]:
# Return year and a count of the number of recorded fires sorting largest first

result = spark.sql("SELECT Year, COUNT(*) AS incident_count FROM fire_incidents GROUP BY Year ORDER BY incident_count DESC LIMIT 10")
print("Fire incidents with Year:")
result.show()



Fire incidents with Year:
+----+--------------+
|Year|incident_count|
+----+--------------+
|2018|           360|
|2012|           341|
|2017|           334|
|2019|           331|
|2010|           329|
|2011|           329|
|2015|           323|
|2013|           322|
|2014|           319|
|2009|           305|
+----+--------------+



In [10]:
result = spark.sql("SELECT * FROM fire_incidents WHERE IncGeo_BoroughName = 'BARNET' AND Year ='2018' ORDER BY LocationFireStarted ASC, ApplianceManufacturer ASC")
print("Fire incidents for Barnet in the year 2018:")

Fire incidents for Barnet in the year 2018:


In [11]:
# Step 6: Stop SparkSession
spark.stop()