# Final Project: San Francisco Fire Department Service Calls

### By: Jared Yu, Tiffany Chen, Emily Watkins

In [1]:
import datetime as dt
import holidays
import pandas as pd
from sodapy import Socrata

In [2]:
pd.set_option("display.max_columns", 100)
pd.set_option("display.expand_frame_repr", True)

In [3]:
def api_query(SoQL_query):
    """
    https://dev.socrata.com/foundry/data.sfgov.org/enhu-st7v
    The function is created from code given to query data off the
    API of the government website. Personal login information
    is used to access the API. Additionally, a SoQL query is
    used as an input to determine how the data should be accessed.
    
    Args: SoQL_query, a SQL-like query using SocrataQL
    Return: queried dataset in dataframe format
    """
    client = Socrata(domain="data.sfgov.org",
                     app_token="LatHs7KifDEpxpxnlKMb9SFfy",
                     username="qzyu999@gmail.com",
                     password="SFData999")

    result_list = client.get("enhu-st7v", limit=5000000, where=SoQL_query)

    return(pd.DataFrame.from_records(result_list))

In [4]:
def get_time_of_day(hour):
    """
    Takes in an int that indicates hour (0-24), and uses that to determine time of day (e.g. 7 is "morning").
    
    Args: 'hour' (hour of day, int)
    Return: 'tod' (time of day, string)
    """
    # By Hour Logic: 0-4 night; 5-11 morning; 12-16 afternoon; 17-19 evening; 20-24 night
    if hour in range(5, 11+1):
        tod = "morning"
    elif hour in range(12, 16+1):
        tod = "afternoon"
    elif hour in range(17, 19+1):
        tod = "evening"
    else:
        tod = "night"
    
    return tod

In [5]:
def get_season(month):
    """
    Takes in an int that indicates the month (1-12), and uses that to determine season (e.g. "winter").
    
    Args: 'month' (month of year, int)
    Return: 'season' (season of year, string)
    """
    # Source: https://www.timeanddate.com/calendar/aboutseasons.html
    # Meteorological Season Definition (e.g. Spring is March 1 - May 31)
    if month in range(3, 5+1):
        season = "Spring"
    elif month in range(6, 8+1):
        season = "Summer"
    elif month in range(9, 11+1):
        season = "Fall"
    else:
        season = "Winter"
    
    return season

In [1]:
def clean_my_data(df, col_list):
    """
    Takes in the dataframe that needs to be cleaned. Also takes in the list of column names of
    complicated and long dates, for proper date conversion.
    
    Args: 'df' (the dataframe to be cleaned)
          'col_list' (the list of complicated datetime column names)
    Return: 'df' (the dataframe already cleaned)
    """
    # Simple data type conversions to int
    df["number_of_alarms"] = df["number_of_alarms"].astype(int)
    df["unit_sequence_in_call_dispatch"] = df["unit_sequence_in_call_dispatch"].astype(int)
    
    # Simple datetime conversions
    df["call_date"] = pd.to_datetime(df["call_date"], format="%Y-%m-%d")
    df["watch_date"] = pd.to_datetime(df["watch_date"], format="%Y-%m-%d")
    
    # For every value in the provided COMPLICATED datetime column list, change data types to datetimes
    for val in col_list:
        df[val] = pd.to_datetime(df[val], format="%Y-%m-%dT%H:%M:%S.%f")
        
    # Day of The Week, Time of Day, and Season columns
    # Source: https://stackoverflow.com/questions/25146121/extracting-just-month-and-year-from-pandas-datetime-column-python
    df["year"] = pd.DatetimeIndex(df["call_date"]).year
    df["month"] = pd.DatetimeIndex(df["call_date"]).month
    df["dotw"] = df["call_date"].dt.day_name()
    df["hour"] = pd.DatetimeIndex(df["received_dttm"]).hour
    df["time_of_day"] = df["hour"].apply(lambda row: get_time_of_day(row))
    df["season"] = df["month"].apply(lambda row: get_season(row))
    
    # Holidays
    # Source: https://pypi.org/project/holidays/
    us_holidays = holidays.US()
    
    
    # Response Time (On-Scene timestamp minus Full Response Acknowledge timestamp)
    df["total_resp_time"] = df["on_scene_dttm"] - df["response_dttm"]
    
    # Lat/Long
    df["lat"] = [i["coordinates"][0] for i in df["location"]]
    df["long"] = [i["coordinates"][1] for i in df["location"]]
    
    return df

In [7]:
def get_clean_query(SoQL_query_timeframe,
                    date_and_time_col_list,
                    lodate=0,
                    hidate=0):
    """
    Queries data from API of the San Francisco government website, using a specified timeframe.
    Cleans the data, and returns the clean dataframe. Also returns a date-subsetted dataframe
    depending on user choice.
    
    Args: 'SoQL_query_timeframe' (a SQL-like query using SocrataQL),
          'date_and_time_col_list' (the list of complicated datetime column names),
          'lodate' (optional field, subsets by lowest date)
          'hidate' (optional field, subsets by highest date)
    Returns:
    """
    # Query according the timeframe that user specifies
    df_query = api_query(SoQL_query_timeframe)
    # Clean all datatypes, add columns, etc.
    clean_df = clean_my_data(df=df_query, col_list=date_and_time_col_list)
    
    # Subset df by date if desired
    if (lodate == 0) & (hidate == 0):
        date_subset_df = "Not Specified"
    else:
        if lodate != 0:
            date_subset_df = clean_df[clean_df["watch_date"] >= lodate]
        if hidate !=0:
            date_subset_df = clean_df[clean_df["watch_date"] <= hidate]
    
    
    # Return the original df (all dates) and the date subsetted df
    return clean_df, date_subset_df

### Variable Dictionary
__Source:__ https://stackoverflow.com/questions/35077507/how-to-right-align-and-justify-align-in-markdown

__Note:__ Markdown does not support text left-alignment, but HTML does!

|Variable | Description
|:---: | :---
|Call Number | A unique 9-digit number assigned by the 911 Dispatch Center (DEM) to this call. These number are used for both Police and Fire calls.
|Unit ID | N/A |
|Incident Number | A unique 8-digit number assigned by DEM to this Fire incident.
|Call Type | Call Type
|Call Date | Date the call is received at the 911 Dispatch Center. Used for reporting purposes.
|Watch Date | Watch date when the call is received. Watch date starts at 0800 each morning and ends at 0800 the next day.
|Received DtTm | Date and time of call is received at the 911 Dispatch Center.
|Entry DtTm | Date and time the 911 operator submits the entry of the initical call information into the CAD system.
|Dispatch DtTm | Date and time the 911 operator dispatches this unit to the call.
|Response DtTm | Date and time this unit acknowledges the dispatch and records that the unit is en route to the location of the call.
|On Scene DtTm | Date and time the unit records arriving to the location of the incident.
|Transport DtTm | If this unit is an ambulance, date and time the unit begins the transport unit arrives to hospital.
|Hospital DtTm | If this unit is an ambulance, date and time the unit arrives to the hospital.
|Call Final Disposition | Disposition of the call (Code). For example TH2: Transport to Hospital  Code 2, FIR: Resolved by Fire Department.
|Available DtTm | Date and time this unit is not longer assigned to this call and it is available for another dispatch.
|Address | Address of incident (note: address and location generalized to mid block of street, intersection or nearest call box location, to protect caller privacy). 
|City | City of incident.
|Zipcode of Incident | Zipcode of incident.
|Battalion | Emergency Response District (There are 9 Fire Emergency Response Districts).
|Station Area | Fire Station First Response Area associated with the address of the incident .
|Box | Fire box associated with the address of the incident. A box is the smallest area used to divide the City. Each box is associated with a unique unit dispatch order. The City is divided into more than 2,400 boxes.
|Original Priority | Initial call priority (Code 2: Non-Emergency or Code 3:Emergency).
|Priority | Call priority (Code 2: Non-Emergency or Code 3:Emergency).
|Final Priority | Final call priority (Code 2: Non-Emergency or Code 3:Emergency).
|ALS Unit | Does this unit includes ALS (Advance Life Support) resources? Is there a paramedic in this unit?
|Call Type Group | Call types are divided into four main groups: Fire, Alarm, Potential Life Threatening and Non Life Threatening.
|Number of Alarms | Number of alarms associated with the incident. This is a number between 1 and 5.
|Unit Type | Unit type.
|Unit sequence in call dispatch | A number that indicates the order this unit was assigned to this call.
|Fire Prevention District | Bureau of Fire Prevention District associated with this address.
|Supervisor District | Supervisor District associated with this address (note: these are the districts created in 2012).
|Neighborhooods | Analysis Boundaries - Neighborhood District associated with this address.
|Location | Location of incident (note: address and location generalized to mid block of street, intersection or nearest call box location, to protect caller privacy).
|RowID | Unique Call Number and Unit ID combination.

### Querying the Data

In [8]:
post_2015 = 'watch_date>="2015-01-01T00:00:00"'
date_column_list = ["received_dttm", "entry_dttm", "dispatch_dttm", "response_dttm",
                    "on_scene_dttm", "transport_dttm", "hospital_dttm", "available_dttm"]
low_date = "2015-01-01"
high_date = "2018-11-20" # I chose any old random date

In [9]:
df_2015, df1 = get_clean_query(SoQL_query_timeframe=post_2015,
                               date_and_time_col_list=date_column_list,
                               lodate=low_date,
                               hidate=high_date)

print("Done querying!")  # just for me

MemoryError: 

In [25]:
df1.dtypes

address                                       object
als_unit                                        bool
available_dttm                        datetime64[ns]
battalion                                     object
box                                           object
call_date                             datetime64[ns]
call_final_disposition                        object
call_number                                   object
call_type                                     object
call_type_group                               object
city                                          object
dispatch_dttm                         datetime64[ns]
entry_dttm                            datetime64[ns]
final_priority                                object
fire_prevention_district                      object
hospital_dttm                         datetime64[ns]
incident_number                               object
location                                      object
neighborhoods_analysis_boundaries             

In [11]:
# This list makes a lot of sense, not every incident requires a hospital visit, transportation, etc.
# Unfortunately we are missing a lot of on-scene date times though???
df1.isna().sum().sort_values(ascending=False).head(11)

hospital_dttm          859794
transport_dttm         852352
on_scene_dttm          237318
response_dttm           32153
city                     2921
station_area             2213
zipcode_of_incident      1525
call_type_group           848
available_dttm            250
original_priority          99
box                        53
dtype: int64

In [27]:
df1.sample()

Unnamed: 0,address,als_unit,available_dttm,battalion,box,call_date,call_final_disposition,call_number,call_type,call_type_group,city,dispatch_dttm,entry_dttm,final_priority,fire_prevention_district,hospital_dttm,incident_number,location,neighborhoods_analysis_boundaries,number_of_alarms,on_scene_dttm,original_priority,priority,received_dttm,response_dttm,rowid,station_area,supervisor_district,transport_dttm,unit_id,unit_sequence_in_call_dispatch,unit_type,watch_date,zipcode_of_incident,year,month,dotw,hour,time_of_day,season,total_resp_time,lat,long
406533,900 Block of STOCKTON ST,True,2016-05-14 17:53:34,B01,1331,2016-05-14,Code 3 Transport,161352458,Medical Incident,Potentially Life-Threatening,San Francisco,2016-05-14 16:46:56,2016-05-14 16:46:34,3,1,NaT,16053409,"{'type': 'Point', 'coordinates': [-122.4079228...",Chinatown,1,2016-05-14 16:50:22,3,E,2016-05-14 16:45:24,2016-05-14 16:48:29,161352458-E02,2,3,NaT,E02,1,ENGINE,2016-05-14,94108,2016,5,Saturday,16,afternoon,Spring,00:01:53,-122.407923,37.794509


In [21]:
# Response Time Calculation -- GRAPH IT
# Source?: https://stackoverflow.com/questions/4090383/plotting-unix-timestamps-in-matplotlib

In [2]:
# Holiday Testing
us_holidays = holidays.US()

In [5]:
dt.date(2014, 1, 1) in us_holidays

True

# ASK SHARPNACK ABOUT EXCEPTIONS. I HAVEN'T REALLY USED THEM MUCH!!! IS THIS EVEN HOW EXCEPTIONS WORK?

In [9]:
def get_US_holidays(date, desired_state, year_list):
    US_holiday_dict = holidays.US(state=desired_state, years=year_list)
    
    try:
        holiday = US_holiday_dict[date]
    except:
        holiday = "Not a Holiday"
        
    return holiday

2015-01-01 New Year's Day
2015-01-19 Martin Luther King, Jr. Day
2015-02-15 Susan B. Anthony Day
2015-02-16 Washington's Birthday
2015-03-31 César Chávez Day
2015-05-25 Memorial Day
2015-07-03 Independence Day (Observed)
2015-07-04 Independence Day
2015-09-07 Labor Day
2015-10-12 Columbus Day
2015-11-11 Veterans Day
2015-11-26 Thanksgiving
2015-12-25 Christmas Day
2016-01-01 New Year's Day
2016-01-18 Martin Luther King, Jr. Day
2016-02-15 Washington's Birthday, Susan B. Anthony Day
2016-03-31 César Chávez Day
2016-05-30 Memorial Day
2016-07-04 Independence Day
2016-09-05 Labor Day
2016-10-10 Columbus Day
2016-11-11 Veterans Day
2016-11-24 Thanksgiving
2016-12-25 Christmas Day
2016-12-26 Christmas Day (Observed)
2017-01-01 New Year's Day
2017-01-02 New Year's Day (Observed)
2017-01-16 Martin Luther King, Jr. Day
2017-02-15 Susan B. Anthony Day
2017-02-20 Washington's Birthday
2017-03-31 César Chávez Day
2017-05-29 Memorial Day
2017-07-04 Independence Day
2017-09-04 Labor Day
2017-10-09 

In [11]:
year_list = [2015, 2016, 2017, 2018]
desired_state = "CA"

In [19]:
US_holiday_dict = holidays.US(state=desired_state, years=year_list)
US_holiday_dict

{datetime.date(2016, 1, 1): "New Year's Day",
 datetime.date(2016, 1, 18): 'Martin Luther King, Jr. Day',
 datetime.date(2016, 2, 15): "Washington's Birthday, Susan B. Anthony Day",
 datetime.date(2016, 3, 31): 'César Chávez Day',
 datetime.date(2016, 5, 30): 'Memorial Day',
 datetime.date(2016, 7, 4): 'Independence Day',
 datetime.date(2016, 9, 5): 'Labor Day',
 datetime.date(2016, 10, 10): 'Columbus Day',
 datetime.date(2016, 11, 11): 'Veterans Day',
 datetime.date(2016, 11, 24): 'Thanksgiving',
 datetime.date(2016, 12, 25): 'Christmas Day',
 datetime.date(2016, 12, 26): 'Christmas Day (Observed)',
 datetime.date(2017, 1, 1): "New Year's Day",
 datetime.date(2017, 1, 2): "New Year's Day (Observed)",
 datetime.date(2017, 1, 16): 'Martin Luther King, Jr. Day',
 datetime.date(2017, 2, 15): 'Susan B. Anthony Day',
 datetime.date(2017, 2, 20): "Washington's Birthday",
 datetime.date(2017, 3, 31): 'César Chávez Day',
 datetime.date(2017, 5, 29): 'Memorial Day',
 datetime.date(2017, 7, 4): 

In [22]:
US_holiday_dict[dt.date(2016, 1, 1)]

"New Year's Day"

In [23]:
US_holiday_dict[dt.date(2016, 1, 2)]

KeyError: datetime.date(2016, 1, 2)