In [1]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.sql.types import *

conf=SparkConf().setMaster("local").setAppName("myApp")
sc=SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [2]:
# Define a schema 
fireschema = StructType([StructField('Call Number', IntegerType(), True),
                     StructField('Unit ID', StringType(), True),
                     StructField('Incident Number', IntegerType(), True),
                     StructField('Call Type', StringType(), True),                  
                     StructField('Call Date', StringType(), True),       
                     StructField('Watch Date', StringType(), True),       
                     StructField('Received DtTm', StringType(), True),       
                     StructField('Entry DtTm', StringType(), True),       
                     StructField('Dispatch DtTm', StringType(), True),       
                     StructField('Response DtTm', StringType(), True),       
                     StructField('On Scene DtTm', StringType(), True),       
                     StructField('Transport DtTm', StringType(), True),                  
                     StructField('Hospital DtTm', StringType(), True),       
                     StructField('Call Final Disposition', StringType(), True),       
                     StructField('Available DtTm', StringType(), True),       
                     StructField('Address', StringType(), True),       
                     StructField('City', StringType(), True),       
                     StructField('Zipcode of Incident', IntegerType(), True),       
                     StructField('Battalion', StringType(), True),                 
                     StructField('Station Area', StringType(), True),       
                     StructField('Box', StringType(), True),       
                     StructField('Original Priority', StringType(), True),       
                     StructField('Priority', StringType(), True),       
                     StructField('Final Priority', IntegerType(), True),       
                     StructField('ALS Unit', BooleanType(), True),       
                     StructField('Call Type Group', StringType(), True),
                     StructField('Number of Alarms', IntegerType(), True),
                     StructField('Unit Type', StringType(), True),
                     StructField('Unit sequence in call dispatch', IntegerType(), True),
                     StructField('Fire Prevention District', StringType(), True),
                     StructField('Supervisor District', StringType(), True),
                     StructField('Neighborhood District', StringType(), True),
                     StructField('Location', StringType(), True),
                     StructField('RowID', StringType(), True)])

# read the file using format CSV
sf_fire_file = "Fire_Department_Calls_for_Service.csv"
fire_df =spark.read.csv(sf_fire_file, header=True, mode="DROPMALFORMED", schema=fireschema)
CallType_df=(fire_df.select("Call Type")).groupBy("Call Type").count().sort('count',ascending=False)
CallType_df.show(5)

+-----------------+-------+
|        Call Type|  count|
+-----------------+-------+
| Medical Incident|3411696|
|   Structure Fire| 659644|
|           Alarms| 565194|
|Traffic Collision| 215006|
|            Other|  82489|
+-----------------+-------+
only showing top 5 rows



In [3]:
fire_df.count()

5225735

In [163]:
# How many different types of calls were made to the Fire Department?
fire_df.select('Call Type').distinct().show(20,False)

+--------------------------------------------+
|Call Type                                   |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Aircraft Emergency                          |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Lightning Strike (Investigation)            |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Oil Spill                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Train / Rail Fire                           |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire

In [169]:
# How many incidents of each call type were there?
CallType_df=(fire_df.select("Call Type")).groupBy("Call Type").count().sort('count',ascending=False)
CallType_df.show(5)


+-----------------+-------+
|        Call Type|  count|
+-----------------+-------+
| Medical Incident|3411696|
|   Structure Fire| 659644|
|           Alarms| 565194|
|Traffic Collision| 215006|
|            Other|  82489|
+-----------------+-------+
only showing top 5 rows



In [173]:
(fire_df.select('Call Type').groupBy('Call Type').count().sort("count", ascending=False)).show()

+--------------------+-------+
|           Call Type|  count|
+--------------------+-------+
|    Medical Incident|3411696|
|      Structure Fire| 659644|
|              Alarms| 565194|
|   Traffic Collision| 215006|
|               Other|  82489|
|Citizen Assist / ...|  78089|
|        Outside Fire|  62767|
|        Water Rescue|  25482|
|        Vehicle Fire|  24510|
|Gas Leak (Natural...|  20990|
|   Electrical Hazard|  15536|
|Elevator / Escala...|  14126|
|Odor (Strange / U...|  12811|
|Smoke Investigati...|  11646|
|          Fuel Spill|   5995|
|              HazMat|   4113|
|Industrial Accidents|   2967|
|           Explosion|   2747|
|  Aircraft Emergency|   1511|
|Train / Rail Inci...|   1389|
+--------------------+-------+
only showing top 20 rows



In [180]:
from functools import reduce

oldColumns = fire_df.schema.names
newColumns = ['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'ReceivedDtTm',
 'EntryDtTm',
 'DispatchDtTm',
 'ResponseDtTm',
 'OnSceneDtTm',
 'TransportDtTm',
 'HospitalDtTm',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'ZipcodeofIncident',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumberofAlarms',
 'UnitType',
 'Unitsequenceincalldispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'NeighborhoodDistrict',
 'Location',
 'RowID'            
]

df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], newColumns[idx]), range(len(oldColumns)), fire_df)


In [181]:
df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'ReceivedDtTm',
 'EntryDtTm',
 'DispatchDtTm',
 'ResponseDtTm',
 'OnSceneDtTm',
 'TransportDtTm',
 'HospitalDtTm',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'ZipcodeofIncident',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumberofAlarms',
 'UnitType',
 'Unitsequenceincalldispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'NeighborhoodDistrict',
 'Location',
 'RowID']

In [187]:
from_pattern1 = 'MM/dd/yyyy'
to_pattern1 = 'yyyy-MM-dd'

from_pattern2 = 'MM/dd/yyyy hh:mm:ss aa'
to_pattern2 = 'MM/dd/yyyy hh:mm:ss aa'

df_ts = df.withColumn('CallDateTS', unix_timestamp(df.CallDate, from_pattern1).cast("timestamp"))\
.withColumn("ReceivedDtTmTS", unix_timestamp(df.ReceivedDtTm, from_pattern2).cast("timestamp"))

In [196]:
df_ts = df.withColumn('Timestamp',from_unixtime(
    unix_timestamp(df.ReceivedDtTm, "yyyy-MM-dd'T'hh:mm:ss aa"), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))

In [197]:
df_ts.select('Timestamp').show(2)

+---------+
|Timestamp|
+---------+
|     null|
|     null|
+---------+
only showing top 2 rows



In [198]:
from pyspark.sql import Row
df = sc.parallelize([Row(visit_dts='5/1/2018 3:48:14 PM')]).toDF()
import pyspark.sql.functions as f
web = df.withColumn("web_datetime", f.from_unixtime(f.unix_timestamp("visit_dts",'MM/dd/yyyy hh:mm:ss aa'),'MM/dd/yyyy HH:mm:ss'))

In [199]:
web.show()

+-------------------+------------+
|          visit_dts|web_datetime|
+-------------------+------------+
|5/1/2018 3:48:14 PM|        null|
+-------------------+------------+



In [188]:
df_ts.select("CallDateTs", "ReceivedDtTmTS").show(2)

+-------------------+--------------+
|         CallDateTs|ReceivedDtTmTS|
+-------------------+--------------+
|2019-07-25 00:00:00|          null|
|2019-07-25 00:00:00|          null|
+-------------------+--------------+
only showing top 2 rows



In [7]:
CallType_df2=(df.select("CallType")).groupBy("CallType").count().sort('count',ascending=False)
CallType_df2.show(5)

+-----------------+-------+
|         CallType|  count|
+-----------------+-------+
| Medical Incident|3411696|
|   Structure Fire| 659644|
|           Alarms| 565194|
|Traffic Collision| 215006|
|            Other|  82489|
+-----------------+-------+
only showing top 5 rows



In [8]:
few_fire_df = (df.select("IncidentNumber", "AvailableDtTm", "CallType") 
           .where(df.CallType != "Medical Incident"))

In [9]:
call_df=(few_fire_df.select("CallType")).groupBy("CallType").count().sort('count',ascending=False)

In [10]:
call_df.show(5)

+--------------------+------+
|            CallType| count|
+--------------------+------+
|      Structure Fire|659644|
|              Alarms|565194|
|   Traffic Collision|215006|
|               Other| 82489|
|Citizen Assist / ...| 78089|
+--------------------+------+
only showing top 5 rows



In [125]:
df.select("CallType").where(df.CallType != "null").distinct().show(10)

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|         Marine Fire|
|  Aircraft Emergency|
|Confined Space / ...|
|      Administrative|
|              Alarms|
|Odor (Strange / U...|
|Lightning Strike ...|
|Citizen Assist / ...|
|              HazMat|
+--------------------+
only showing top 10 rows



In [11]:
df.select("CallType").where(df.CallType != "null").show(10)

+----------------+
|        CallType|
+----------------+
|  Structure Fire|
|  Structure Fire|
|  Structure Fire|
|  Structure Fire|
|  Structure Fire|
|  Structure Fire|
|Medical Incident|
|Medical Incident|
|          Alarms|
|          Alarms|
+----------------+
only showing top 10 rows



In [12]:
# Change Column name
df2 = df.withColumnRenamed("CallNumber", "NumberOfCalls")

In [14]:
df2.columns[0]

'NumberOfCalls'

In [16]:
df.select("CallDate").where(df.CallDate != "null").distinct().show(5)

+----------+
|  CallDate|
+----------+
|12/26/2004|
|06/26/2005|
|06/14/2010|
|03/02/2011|
|01/03/2012|
+----------+
only showing top 5 rows



In [18]:
df.select("ReceivedDtTm","ResponseDtTm" ).where(df.ResponseDtTm != "null").show(2,False)

+----------------------+----------------------+
|ReceivedDtTm          |ResponseDtTm          |
+----------------------+----------------------+
|07/25/2019 07:16:45 PM|07/25/2019 07:21:12 PM|
|07/25/2019 07:16:45 PM|07/25/2019 07:19:14 PM|
+----------------------+----------------------+
only showing top 2 rows



In [19]:
from pyspark.sql.functions import to_timestamp
 
fire_ts_df = (df.withColumn("IncidentDate", to_timestamp(df.CallDate, "MM/dd/yyyy"))
.withColumn("OnWatchDate", to_timestamp(df.WatchDate, "MM/dd/yyyy"))
.withColumn("AvailableDtTS", to_timestamp(df.AvailableDtTm, "MM/dd/yyyy hh:mm:ss aa")))

fire_ts_df.select("IncidentDate", "OnWatchDate", "AvailableDtTS", "AvailableDtTm").distinct().show(3, False)

+-------------------+-------------------+-------------+----------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS|AvailableDtTm         |
+-------------------+-------------------+-------------+----------------------+
|2019-04-25 00:00:00|2019-04-25 00:00:00|null         |04/25/2019 03:28:56 PM|
|2019-04-25 00:00:00|2019-04-25 00:00:00|null         |04/25/2019 06:08:58 PM|
|2018-02-01 00:00:00|2018-02-01 00:00:00|null         |02/01/2018 12:38:05 PM|
+-------------------+-------------------+-------------+----------------------+
only showing top 3 rows



In [20]:
from pyspark.sql.functions import *
fire_ts_df.select(year('IncidentDate')).distinct().orderBy(year('IncidentDate')).show()

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
|              2019|
+------------------+
only showing top 20 rows



In [21]:
# What were the common types of fire calls?
fire_ts_df.select("CallType").where(col("CallType").isNotNull()).groupBy("CallType").count().orderBy(
    "count", ascending=False).show(10, truncate=False)       

+-------------------------------+-------+
|CallType                       |count  |
+-------------------------------+-------+
|Medical Incident               |3411696|
|Structure Fire                 |659644 |
|Alarms                         |565194 |
|Traffic Collision              |215006 |
|Other                          |82489  |
|Citizen Assist / Service Call  |78089  |
|Outside Fire                   |62767  |
|Water Rescue                   |25482  |
|Vehicle Fire                   |24510  |
|Gas Leak (Natural and LP Gases)|20990  |
+-------------------------------+-------+
only showing top 10 rows



In [153]:
fire_ts_df.select('ReceivedDtTm', 'ResponseDtTm').show(5,False)

+----------------------+----------------------+
|ReceivedDtTm          |ResponseDtTm          |
+----------------------+----------------------+
|07/25/2019 07:16:45 PM|07/25/2019 07:21:12 PM|
|07/25/2019 07:16:45 PM|07/25/2019 07:19:14 PM|
|07/25/2019 07:16:45 PM|07/25/2019 07:19:29 PM|
|07/25/2019 07:16:45 PM|07/25/2019 07:20:30 PM|
|07/25/2019 07:16:45 PM|07/25/2019 07:19:32 PM|
+----------------------+----------------------+
only showing top 5 rows



In [24]:
# Need to fix the issue with "null" timestamp

from pyspark.sql import Row
df = sc.parallelize([Row(visit_dts='5/1/2018 3:48:14 PM')]).toDF()
import pyspark.sql.functions as f
web = df.withColumn("web_datetime", 
                    f.from_unixtime(f.unix_timestamp("visit_dts",'MM/dd/yyyy hh:mm:ss aa'),'MM/dd/yyyy HH:mm:ss'))


In [25]:
web.show()

+-------------------+------------+
|          visit_dts|web_datetime|
+-------------------+------------+
|5/1/2018 3:48:14 PM|        null|
+-------------------+------------+



In [26]:
fire_ts_df.select(sum("NumberOfAlarms")).show()         

+-------------------+
|sum(NumberOfAlarms)|
+-------------------+
|            5253779|
+-------------------+



In [39]:
# What were all the different types of fire calls in 2018?
calltype_df_2018=(fire_ts_df.select("CallType").where(year(fire_ts_df.IncidentDate) == 2018)).distinct()
calltype_df_2018.show(calltype_df_2018.count(),False)  


+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Train / Rail Fire                           |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Police                               |
|Gas Leak (Na

In [51]:
# What were all the different types of fire calls in 2019?
calltype_df_2019=(fire_ts_df.select("CallType").where(year(fire_ts_df.IncidentDate) == 2019)).distinct()
calltype_df_2019.show(calltype_df_2019.count(),False)  

+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Train / Rail Fire                           |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Police                               |
|Gas Leak (Na

In [89]:
# Q1: What months within the year 2019 saw for the highest number of fire calls?

# Adding a new column "Month" to the new datafram fire_ts_m_df
fire_ts_m_df = fire_ts_df.withColumn("Month", month(fire_ts_df.IncidentDate))

# Show the month in the year 2019 saw the highest nuber of fire calls
(fire_ts_m_df.select("Month","CallNumber").where(year(fire_ts_df.IncidentDate) == 2019)
   .groupby("Month").count().orderBy("count",ascending=False )).show(1)

+-----+-----+
|Month|count|
+-----+-----+
|   10|29209|
+-----+-----+
only showing top 1 row



In [90]:
# Show the month in the year 2018 saw the highest nuber of fire calls
(fire_ts_m_df.select("Month","CallNumber").where(year(fire_ts_df.IncidentDate) == 2018)
   .groupby("Month").count().orderBy("count",ascending=False )).show(1)

+-----+-----+
|Month|count|
+-----+-----+
|    1|27027|
+-----+-----+
only showing top 1 row



In [91]:
# Show the month in the year 2017 saw the highest nuber of fire calls
(fire_ts_m_df.select("Month","CallNumber").where(year(fire_ts_df.IncidentDate) == 2017)
   .groupby("Month").count().orderBy("count",ascending=False )).show(1)

+-----+-----+
|Month|count|
+-----+-----+
|    1|28026|
+-----+-----+
only showing top 1 row



In [113]:
# Q2 Which neighborhood in SF generated the most fire calls in 2019?

(fire_ts_df.select("NeighborhoodDistrict").where(year(fire_ts_df.IncidentDate) == 2019)).groupby(
    "NeighborhoodDistrict").count().orderBy("count",ascending=False ).show(1,False)


+--------------------+-----+
|NeighborhoodDistrict|count|
+--------------------+-----+
|Tenderloin          |45852|
+--------------------+-----+
only showing top 1 row



In [111]:
(fire_ts_df.select("NeighborhoodDistrict").where(year(fire_ts_df.IncidentDate) == 2011)).groupby(
    "NeighborhoodDistrict").count().orderBy("count",ascending=False ).show(5,False)

+------------------------------+-----+
|NeighborhoodDistrict          |count|
+------------------------------+-----+
|Tenderloin                    |35897|
|South of Market               |28645|
|Mission                       |25205|
|Financial District/South Beach|16343|
|Bayview Hunters Point         |13284|
+------------------------------+-----+
only showing top 5 rows



In [110]:
(fire_ts_df.select("NeighborhoodDistrict").where(year(fire_ts_df.IncidentDate) == 2000)).groupby(
    "NeighborhoodDistrict").count().orderBy("count",ascending=False ).show(5,False)

+------------------------------+-----+
|NeighborhoodDistrict          |count|
+------------------------------+-----+
|Tenderloin                    |20063|
|Mission                       |14563|
|South of Market               |13830|
|Financial District/South Beach|12880|
|Bayview Hunters Point         |10175|
+------------------------------+-----+
only showing top 5 rows



In [None]:
# Q3 Which neighborhoods in SF had the worst response time to fire calls in 2018?
# need to fix the null issue of timestamp to answer this question

In [124]:
# Q4 Which week in the year in 2019 had the most fire calls?

# Adding a new column "Week" to the new datafram fire_ts_m_w_df
fire_ts_m_w_df = fire_ts_m_df.withColumn("Week", weekofyear(fire_ts_m_df.IncidentDate))

(fire_ts_m_w_df.select("Week").where(year(fire_ts_m_w_df.IncidentDate) == 2019)
   .groupby("Week").count().orderBy("count",ascending=False )).show(1)


+----+-----+
|Week|count|
+----+-----+
|  43| 7311|
+----+-----+
only showing top 1 row



In [125]:
(fire_ts_m_w_df.select("Week").where(year(fire_ts_m_w_df.IncidentDate) == 2000)
   .groupby("Week").count().orderBy("count",ascending=False )).show(1)


+----+-----+
|Week|count|
+----+-----+
|  24| 4757|
+----+-----+
only showing top 1 row



In [126]:
(fire_ts_m_w_df.select("Week").where(year(fire_ts_m_w_df.IncidentDate) == 2018)
   .groupby("Week").count().orderBy("count",ascending=False )).show(1)

+----+-----+
|Week|count|
+----+-----+
|   1| 7545|
+----+-----+
only showing top 1 row



In [143]:
# Q5 Is there a correlation between neighborhood, zip code, and fire calls?

(fire_ts_df.select("NeighborhoodDistrict","ZipcodeofIncident").where(year(fire_ts_df.IncidentDate) == 2019)).groupby(
    "NeighborhoodDistrict", "ZipcodeofIncident").count().orderBy("count",ascending=False ).show(5,False)


+---------------------+-----------------+-----+
|NeighborhoodDistrict |ZipcodeofIncident|count|
+---------------------+-----------------+-----+
|Tenderloin           |94102            |34027|
|South of Market      |94103            |29156|
|Mission              |94110            |17538|
|Bayview Hunters Point|94124            |16013|
|Mission              |94103            |11168|
+---------------------+-----------------+-----+
only showing top 5 rows



In [144]:
(fire_ts_df.select("NeighborhoodDistrict").where(year(fire_ts_df.IncidentDate) == 2019)).groupby(
    "NeighborhoodDistrict").count().orderBy("count",ascending=False ).show(5,False)

+------------------------------+-----+
|NeighborhoodDistrict          |count|
+------------------------------+-----+
|Tenderloin                    |45852|
|South of Market               |35007|
|Mission                       |28763|
|Financial District/South Beach|22652|
|Bayview Hunters Point         |17008|
+------------------------------+-----+
only showing top 5 rows



In [145]:
(fire_ts_df.select("ZipcodeofIncident").where(year(fire_ts_df.IncidentDate) == 2019)).groupby(
    "ZipcodeofIncident").count().orderBy("count",ascending=False ).show(5,False)

+-----------------+-----+
|ZipcodeofIncident|count|
+-----------------+-----+
|94103            |43506|
|94102            |42591|
|94109            |28257|
|94110            |24592|
|94124            |16444|
+-----------------+-----+
only showing top 5 rows



In [142]:
(fire_ts_df.select("ZipcodeofIncident","NeighborhoodDistrict").where(
    fire_ts_df.NeighborhoodDistrict =="Tenderloin")).distinct().show()

+-----------------+--------------------+
|ZipcodeofIncident|NeighborhoodDistrict|
+-----------------+--------------------+
|            94102|          Tenderloin|
|            94109|          Tenderloin|
|            94103|          Tenderloin|
+-----------------+--------------------+



In [141]:
(fire_ts_df.select("NeighborhoodDistrict","ZipcodeofIncident").where(
    fire_ts_df.ZipcodeofIncident == 94102)).distinct().show()

+--------------------+-----------------+
|NeighborhoodDistrict|ZipcodeofIncident|
+--------------------+-----------------+
|    Western Addition|            94102|
|          Tenderloin|            94102|
|            Nob Hill|            94102|
|             Mission|            94102|
|     South of Market|            94102|
|Financial Distric...|            94102|
|        Hayes Valley|            94102|
+--------------------+-----------------+

