## PySpark Example Fire_Department_Calls_for_Service

## Find Spark

In [1]:
import findspark
findspark.init()
print("Done")

Done


## import

In [57]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, asc, desc
from pyspark.sql.types import *
import pyspark.sql.functions as F

from functools import reduce
import functools

## Create SparkContext, SparkSession

In [3]:
#conf=SparkConf().setMaster("local").setAppName("myApp")
#sc=SparkContext(conf=conf)
sc = SparkContext('local', 'myApp')
spark = SparkSession.builder.getOrCreate()

## Read .csv data to DataFrame

### Define a schema 

In [4]:
fireschema = StructType([StructField('Call Number', IntegerType(), True),
                     StructField('Unit ID', StringType(), True),
                     StructField('Incident Number', IntegerType(), True),
                     StructField('Call Type', StringType(), True),                  
                     StructField('Call Date', StringType(), True),       
                     StructField('Watch Date', StringType(), True),       
                     StructField('Received DtTm', StringType(), True),       
                     StructField('Entry DtTm', StringType(), True),       
                     StructField('Dispatch DtTm', StringType(), True),       
                     StructField('Response DtTm', StringType(), True),       
                     StructField('On Scene DtTm', StringType(), True),       
                     StructField('Transport DtTm', StringType(), True),                  
                     StructField('Hospital DtTm', StringType(), True),       
                     StructField('Call Final Disposition', StringType(), True),       
                     StructField('Available DtTm', StringType(), True),       
                     StructField('Address', StringType(), True),       
                     StructField('City', StringType(), True),       
                     StructField('Zipcode of Incident', IntegerType(), True),       
                     StructField('Battalion', StringType(), True),                 
                     StructField('Station Area', StringType(), True),       
                     StructField('Box', StringType(), True),       
                     StructField('Original Priority', StringType(), True),       
                     StructField('Priority', StringType(), True),       
                     StructField('Final Priority', IntegerType(), True),       
                     StructField('ALS Unit', BooleanType(), True),       
                     StructField('Call Type Group', StringType(), True),
                     StructField('Number of Alarms', IntegerType(), True),
                     StructField('Unit Type', StringType(), True),
                     StructField('Unit sequence in call dispatch', IntegerType(), True),
                     StructField('Fire Prevention District', StringType(), True),
                     StructField('Supervisor District', StringType(), True),
                     StructField('Neighborhood District', StringType(), True),
                     StructField('Location', StringType(), True),
                     StructField('RowID', StringType(), True)])

### read the file using format CSV

In [7]:
sf_fire_file = "Fire_Department_Calls_for_Service.csv"
fire_df =spark.read.csv(sf_fire_file, header=True, mode="DROPMALFORMED", schema=fireschema)

In [8]:
fire_df.count()

5225735

In [10]:
fire_df.columns

['Call Number',
 'Unit ID',
 'Incident Number',
 'Call Type',
 'Call Date',
 'Watch Date',
 'Received DtTm',
 'Entry DtTm',
 'Dispatch DtTm',
 'Response DtTm',
 'On Scene DtTm',
 'Transport DtTm',
 'Hospital DtTm',
 'Call Final Disposition',
 'Available DtTm',
 'Address',
 'City',
 'Zipcode of Incident',
 'Battalion',
 'Station Area',
 'Box',
 'Original Priority',
 'Priority',
 'Final Priority',
 'ALS Unit',
 'Call Type Group',
 'Number of Alarms',
 'Unit Type',
 'Unit sequence in call dispatch',
 'Fire Prevention District',
 'Supervisor District',
 'Neighborhood District',
 'Location',
 'RowID']

## Data Precesssing & Analysis

### How many different types of calls were made to the Fire Department?

In [38]:
callType_distinct = fire_df.select('Call Type').distinct()
num = callType_distinct.count()
print("Total number of different types of calls: ", num)

Total number of different types of calls:  32


In [39]:
callType_distinct.show(num, truncate = False)

+--------------------------------------------+
|Call Type                                   |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Aircraft Emergency                          |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Lightning Strike (Investigation)            |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Oil Spill                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Train / Rail Fire                           |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire

In [35]:
rdd1 = callType_distinct.rdd

In [36]:
rdd1.count()

32

In [37]:
rdd1.collect()

[Row(Call Type='Elevator / Escalator Rescue'),
 Row(Call Type='Marine Fire'),
 Row(Call Type='Aircraft Emergency'),
 Row(Call Type='Confined Space / Structure Collapse'),
 Row(Call Type='Administrative'),
 Row(Call Type='Alarms'),
 Row(Call Type='Odor (Strange / Unknown)'),
 Row(Call Type='Lightning Strike (Investigation)'),
 Row(Call Type='Citizen Assist / Service Call'),
 Row(Call Type='HazMat'),
 Row(Call Type='Watercraft in Distress'),
 Row(Call Type='Explosion'),
 Row(Call Type='Oil Spill'),
 Row(Call Type='Vehicle Fire'),
 Row(Call Type='Suspicious Package'),
 Row(Call Type='Train / Rail Fire'),
 Row(Call Type='Extrication / Entrapped (Machinery, Vehicle)'),
 Row(Call Type='Other'),
 Row(Call Type='Outside Fire'),
 Row(Call Type='Traffic Collision'),
 Row(Call Type='Assist Police'),
 Row(Call Type='Gas Leak (Natural and LP Gases)'),
 Row(Call Type='Water Rescue'),
 Row(Call Type='Electrical Hazard'),
 Row(Call Type='High Angle Rescue'),
 Row(Call Type='Structure Fire'),
 Row(Call

### How many incidents of each call type were there?

In [40]:
callType_df2 = fire_df.select("Call Type").groupBy('Call Type').count().sort(desc('count'))
# CallType_df1=fire_df.select("Call Type").groupBy("Call Type").count().sort('count',ascending=False)

In [41]:
callType_df2.show(num, truncate = False)

+--------------------------------------------+-------+
|Call Type                                   |count  |
+--------------------------------------------+-------+
|Medical Incident                            |3411696|
|Structure Fire                              |659644 |
|Alarms                                      |565194 |
|Traffic Collision                           |215006 |
|Other                                       |82489  |
|Citizen Assist / Service Call               |78089  |
|Outside Fire                                |62767  |
|Water Rescue                                |25482  |
|Vehicle Fire                                |24510  |
|Gas Leak (Natural and LP Gases)             |20990  |
|Electrical Hazard                           |15536  |
|Elevator / Escalator Rescue                 |14126  |
|Odor (Strange / Unknown)                    |12811  |
|Smoke Investigation (Outside)               |11646  |
|Fuel Spill                                  |5995   |
|HazMat   

### Rename Column's Name With functools.reduce



In [49]:
oldColumns = fire_df.columns
oldColumns

['Call Number',
 'Unit ID',
 'Incident Number',
 'Call Type',
 'Call Date',
 'Watch Date',
 'Received DtTm',
 'Entry DtTm',
 'Dispatch DtTm',
 'Response DtTm',
 'On Scene DtTm',
 'Transport DtTm',
 'Hospital DtTm',
 'Call Final Disposition',
 'Available DtTm',
 'Address',
 'City',
 'Zipcode of Incident',
 'Battalion',
 'Station Area',
 'Box',
 'Original Priority',
 'Priority',
 'Final Priority',
 'ALS Unit',
 'Call Type Group',
 'Number of Alarms',
 'Unit Type',
 'Unit sequence in call dispatch',
 'Fire Prevention District',
 'Supervisor District',
 'Neighborhood District',
 'Location',
 'RowID']

In [82]:
newColumns = ['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'ReceivedDtTm',
 'EntryDtTm',
 'DispatchDtTm',
 'ResponseDtTm',
 'OnSceneDtTm',
 'TransportDtTm',
 'HospitalDtTm',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'ZipcodeofIncident',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumberofAlarms',
 'UnitType',
 'Unitsequenceincalldispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'NeighborhoodDistrict',
 'Location',
 'RowID'
]

In [83]:
df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), fire_df)

In [84]:
df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'ReceivedDtTm',
 'EntryDtTm',
 'DispatchDtTm',
 'ResponseDtTm',
 'OnSceneDtTm',
 'TransportDtTm',
 'HospitalDtTm',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'ZipcodeofIncident',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumberofAlarms',
 'UnitType',
 'Unitsequenceincalldispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'NeighborhoodDistrict',
 'Location',
 'RowID']

In [94]:
df.select('CallType').distinct().show(5, False)

+-----------------------------------+
|CallType                           |
+-----------------------------------+
|Elevator / Escalator Rescue        |
|Marine Fire                        |
|Aircraft Emergency                 |
|Confined Space / Structure Collapse|
|Administrative                     |
+-----------------------------------+
only showing top 5 rows



### Extract date info from timestamp

In [95]:
df.select('CallDate', 'ReceivedDtTm').show(2, False)

+----------+----------------------+
|CallDate  |ReceivedDtTm          |
+----------+----------------------+
|07/25/2019|07/25/2019 07:16:45 PM|
|07/25/2019|07/25/2019 07:16:45 PM|
+----------+----------------------+
only showing top 2 rows



In [115]:
from_pattern1 = 'MM/dd/yyyy'
from_pattern2 = 'MM/dd/yyyy hh:mm:ss aa'


df_ts = df.withColumn('CallDateTS', F.unix_timestamp('CallDate', from_pattern1).cast("timestamp"))\
.withColumn('ReceivedDtTmTS', F.unix_timestamp('ReceivedDtTm', from_pattern2).cast("timestamp") )

In [116]:
df_ts.select('CallDate','CallDateTS', 'ReceivedDtTm', 'ReceivedDtTmTS').show(5, False)

+----------+-------------------+----------------------+--------------+
|CallDate  |CallDateTS         |ReceivedDtTm          |ReceivedDtTmTS|
+----------+-------------------+----------------------+--------------+
|07/25/2019|2019-07-25 00:00:00|07/25/2019 07:16:45 PM|null          |
|07/25/2019|2019-07-25 00:00:00|07/25/2019 07:16:45 PM|null          |
|07/25/2019|2019-07-25 00:00:00|07/25/2019 07:16:45 PM|null          |
|07/25/2019|2019-07-25 00:00:00|07/25/2019 07:16:45 PM|null          |
|07/25/2019|2019-07-25 00:00:00|07/25/2019 07:16:45 PM|null          |
+----------+-------------------+----------------------+--------------+
only showing top 5 rows



In [124]:
from pyspark.sql.functions import to_timestamp
 
fire_ts_df = (df.withColumn("IncidentDate", to_timestamp(df.CallDate, "MM/dd/yyyy"))
.withColumn("OnWatchDate", to_timestamp(df.WatchDate, "MM/dd/yyyy"))
.withColumn("AvailableDtTS", to_timestamp(df.AvailableDtTm, "MM/dd/yyyy hh:mm:ss aa")))

fire_ts_df.select("IncidentDate", "OnWatchDate", "AvailableDtTS", "AvailableDtTm").distinct().show(3, False)

+-------------------+-------------------+-------------+----------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS|AvailableDtTm         |
+-------------------+-------------------+-------------+----------------------+
|2019-04-25 00:00:00|2019-04-25 00:00:00|null         |04/25/2019 03:28:56 PM|
|2019-04-25 00:00:00|2019-04-25 00:00:00|null         |04/25/2019 06:08:58 PM|
|2018-02-01 00:00:00|2018-02-01 00:00:00|null         |02/01/2018 12:38:05 PM|
+-------------------+-------------------+-------------+----------------------+
only showing top 3 rows



In [126]:
from pyspark.sql.functions import *
fire_ts_df.select(year('IncidentDate')).distinct().orderBy(year('IncidentDate')).show()

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
|              2019|
+------------------+
only showing top 20 rows



In [127]:
fire_ts_df.select(year('IncidentDate').alias('year')).distinct().sort(asc('year')).show()

+----+
|year|
+----+
|2000|
|2001|
|2002|
|2003|
|2004|
|2005|
|2006|
|2007|
|2008|
|2009|
|2010|
|2011|
|2012|
|2013|
|2014|
|2015|
|2016|
|2017|
|2018|
|2019|
+----+
only showing top 20 rows



### Filter Data

In [117]:
few_fire_df = (df.select("IncidentNumber", "AvailableDtTm", "CallType") 
           .where(df.CallType != "Medical Incident"))

In [118]:
call_df=(few_fire_df.select("CallType")).groupBy("CallType").count().sort('count',ascending=False)
call_df.show(5, False)

+-----------------------------+------+
|CallType                     |count |
+-----------------------------+------+
|Structure Fire               |659644|
|Alarms                       |565194|
|Traffic Collision            |215006|
|Other                        |82489 |
|Citizen Assist / Service Call|78089 |
+-----------------------------+------+
only showing top 5 rows



In [120]:
df.select("CallType").where(df.CallType != "null").distinct().count()

32

In [121]:
few_fire_df.select("CallType").where(few_fire_df.CallType != "null").distinct().count()

31

### Change Column name

In [123]:
df2 = df.withColumnRenamed("CallNumber", "NumberOfCalls")
df2.columns[0]

'NumberOfCalls'

### What were the common types of fire calls?

In [128]:
fire_ts_df.select("CallType").where(col("CallType").isNotNull()).groupBy("CallType").count().orderBy(
    "count", ascending=False).show(10, truncate=False)

+-------------------------------+-------+
|CallType                       |count  |
+-------------------------------+-------+
|Medical Incident               |3411696|
|Structure Fire                 |659644 |
|Alarms                         |565194 |
|Traffic Collision              |215006 |
|Other                          |82489  |
|Citizen Assist / Service Call  |78089  |
|Outside Fire                   |62767  |
|Water Rescue                   |25482  |
|Vehicle Fire                   |24510  |
|Gas Leak (Natural and LP Gases)|20990  |
+-------------------------------+-------+
only showing top 10 rows



### What were all the different types of fire calls in 2018?

In [132]:
callType_df_2018 = fire_ts_df.select('CallType').where(year('IncidentDate')==2018).distinct()
num = callType_df_2018.count()
callType_df_2018.show(num, False)

+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Train / Rail Fire                           |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Police                               |
|Gas Leak (Na

### What were all the different types of fire calls in 2019?

In [134]:
callType_df_2019 = fire_ts_df.select('CallType').where(year('IncidentDate')==2018).distinct()
num = callType_df_2019.count()
print(num)
callType_df_2019.show(num, False)

29
+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Train / Rail Fire                           |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Police                               |
|Gas Leak 

### Q1: What months within the year 2019 saw for the highest number of fire calls?

In [156]:
fire_ts_df_month = fire_ts_df.withColumn('month', F.month('IncidentDate'))\
                            .withColumn('year', F.year('IncidentDate'))

In [157]:
firecall_2019 = fire_ts_df_month.select('month').where(year('IncidentDate') == 2019)\
.groupby('month').count().sort(desc('count'))

In [159]:
print("Following month saw highest num of fire calls  in year 2019: ", firecall_2019.show(1))

+-----+-----+
|month|count|
+-----+-----+
|   10|29209|
+-----+-----+
only showing top 1 row

Following month saw highest num of fire calls  in year 2019:  None


In [160]:
firecall_2019.show()

+-----+-----+
|month|count|
+-----+-----+
|   10|29209|
|   12|28630|
|    9|28521|
|    8|27463|
|    6|27161|
|    3|26752|
|   11|26493|
|    5|25760|
|    7|25465|
|    4|25321|
|    2|24640|
|    1|24440|
+-----+-----+



### Show the month in the year 2018 saw the highest nuber of fire calls

In [161]:
firecall_2018 = fire_ts_df_month.select('month').where(year('IncidentDate') == 2018)\
.groupby('month').count().sort(desc('count'))

In [162]:
print("Following month saw highest num of fire calls  in year 2018: ", firecall_2018.show(1))

+-----+-----+
|month|count|
+-----+-----+
|    1|27027|
+-----+-----+
only showing top 1 row

Following month saw highest num of fire calls  in year 2018:  None


### Q2 Which neighborhood in SF generated the most fire calls in 2019?

In [163]:
(fire_ts_df.select("NeighborhoodDistrict").where(year(fire_ts_df.IncidentDate) == 2019)).groupby(
    "NeighborhoodDistrict").count().orderBy("count",ascending=False ).show(1,False)

+--------------------+-----+
|NeighborhoodDistrict|count|
+--------------------+-----+
|Tenderloin          |45852|
+--------------------+-----+
only showing top 1 row



In [167]:
fire_ts_df_month.select("NeighborhoodDistrict").where(fire_ts_df_month.year == 2019)\
.groupby("NeighborhoodDistrict").count().sort(desc('count')).show(truncate = False)

+------------------------------+-----+
|NeighborhoodDistrict          |count|
+------------------------------+-----+
|Tenderloin                    |45852|
|South of Market               |35007|
|Mission                       |28763|
|Financial District/South Beach|22652|
|Bayview Hunters Point         |17008|
|Sunset/Parkside               |11294|
|Western Addition              |10627|
|Nob Hill                      |10542|
|Hayes Valley                  |7940 |
|Castro/Upper Market           |7902 |
|Outer Richmond                |7661 |
|West of Twin Peaks            |7052 |
|Pacific Heights               |6709 |
|Marina                        |6417 |
|North Beach                   |6402 |
|Excelsior                     |6174 |
|Potrero Hill                  |5653 |
|Chinatown                     |5641 |
|Bernal Heights                |5392 |
|Mission Bay                   |5363 |
+------------------------------+-----+
only showing top 20 rows



### Q3 Which neighborhoods in SF had the worst response time to fire calls in 2018?
need to fix the null issue of timestamp to answer this question

### Q4 Which week in the year in 2019 had the most fire calls?

In [168]:
fire_ts_y_m_w_df = fire_ts_df_month.withColumn('week', F.weekofyear('IncidentDate'))

In [169]:
fire_ts_y_m_w_df.select('week').where(fire_ts_y_m_w_df.year == 2019).groupby('week').count()\
.sort(desc('count')).show()

+----+-----+
|week|count|
+----+-----+
|  43| 7311|
|   1| 7262|
|  49| 7055|
|  24| 7048|
|  39| 6981|
|  37| 6819|
|  41| 6754|
|  50| 6692|
|  23| 6635|
|  38| 6578|
|  44| 6432|
|  35| 6416|
|   6| 6361|
|  36| 6356|
|  46| 6346|
|  40| 6341|
|  11| 6293|
|  33| 6288|
|  51| 6245|
|   3| 6240|
+----+-----+
only showing top 20 rows



### Q5 Is there a correlation between neighborhood, zip code, and fire calls?

In [170]:
(fire_ts_df.select("NeighborhoodDistrict","ZipcodeofIncident").where(year(fire_ts_df.IncidentDate) == 2019)).groupby(
    "NeighborhoodDistrict", "ZipcodeofIncident").count().orderBy("count",ascending=False ).show(5,False)

+---------------------+-----------------+-----+
|NeighborhoodDistrict |ZipcodeofIncident|count|
+---------------------+-----------------+-----+
|Tenderloin           |94102            |34027|
|South of Market      |94103            |29156|
|Mission              |94110            |17538|
|Bayview Hunters Point|94124            |16013|
|Mission              |94103            |11168|
+---------------------+-----------------+-----+
only showing top 5 rows



In [171]:
(fire_ts_df.select("ZipcodeofIncident").where(year(fire_ts_df.IncidentDate) == 2019)).groupby(
    "ZipcodeofIncident").count().orderBy("count",ascending=False ).show(5,False)

+-----------------+-----+
|ZipcodeofIncident|count|
+-----------------+-----+
|94103            |43506|
|94102            |42591|
|94109            |28257|
|94110            |24592|
|94124            |16444|
+-----------------+-----+
only showing top 5 rows



In [172]:
(fire_ts_df.select("ZipcodeofIncident","NeighborhoodDistrict").where(
    fire_ts_df.NeighborhoodDistrict =="Tenderloin")).distinct().show()

+-----------------+--------------------+
|ZipcodeofIncident|NeighborhoodDistrict|
+-----------------+--------------------+
|            94102|          Tenderloin|
|            94109|          Tenderloin|
|            94103|          Tenderloin|
+-----------------+--------------------+



In [173]:

(fire_ts_df.select("NeighborhoodDistrict","ZipcodeofIncident").where(
    fire_ts_df.ZipcodeofIncident == 94102)).distinct().show()

+--------------------+-----------------+
|NeighborhoodDistrict|ZipcodeofIncident|
+--------------------+-----------------+
|    Western Addition|            94102|
|          Tenderloin|            94102|
|            Nob Hill|            94102|
|             Mission|            94102|
|     South of Market|            94102|
|Financial Distric...|            94102|
|        Hayes Valley|            94102|
+--------------------+-----------------+

