# Final Project: San Francisco Fire Department Service Calls

### By: Jared Yu, Tiffany Chen, Emily Watkins

In [1]:
import datetime as dt
import pandas as pd
from sodapy import Socrata

In [2]:
pd.set_option("display.max_columns", 100)
pd.set_option("display.expand_frame_repr", True)

In [3]:
def api_query(SoQL_query):
    """
    https://dev.socrata.com/foundry/data.sfgov.org/enhu-st7v
    The function is created from code given to query data off the
    API of the government website. Personal login information
    is used to access the API. Additionally, a SoQL query is
    used as an input to determine how the data should be accessed.
    
    Args: SoQL_query, a SQL-like query using SocrataQL
    Return: queried dataset in dataframe format
    """
    client = Socrata(domain='data.sfgov.org', 
                     app_token='LatHs7KifDEpxpxnlKMb9SFfy', 
                     username="qzyu999@gmail.com", 
                     password="SFData999")

    result_list = client.get("enhu-st7v", limit=5000000, where=SoQL_query)

    return(pd.DataFrame.from_records(result_list))

In [7]:
def get_time_of_day(hour):
    """
    Takes in an int that indicates hour (0-24), and uses that to determine time of day (e.g. 7 is "morning").
    
    Args: 'hour' (hour of day, int)
    Return: 'tod' (time of day, string)
    """
    # By Hour Logic: 0-4 night; 5-11 morning; 12-16 afternoon; 17-19 evening; 20-24 night
    if hour in range(5, 11+1):
        tod = "morning"
    elif hour in range(12, 16+1):
        tod = "afternoon"
    elif hour in range(17, 19+1):
        tod = "evening"
    else:
        tod = "night"
    
    return tod

In [8]:
def get_season(month):
    """
    Takes in an int that indicates the month (1-12), and uses that to determine season (e.g. "winter").
    
    Args: 'month' (month of year, int)
    Return: 'season' (season of year, string)
    """
    # Source: https://www.timeanddate.com/calendar/aboutseasons.html
    # Meteorological Season Definition (e.g. Spring is March 1 - May 31)
    if month in range(3, 5+1):
        season = "Spring"
    elif month in range(6, 8+1):
        season = "Summer"
    elif month in range(9, 11+1):
        season = "Fall"
    else:
        season = "Winter"
    
    return season

In [9]:
def clean_my_data(df, col_list):
    """
    Takes in the dataframe that needs to be cleaned. Also takes in the list of column names of
    complicated and long dates, for proper date conversion.
    
    Args: 'df' (the dataframe to be cleaned)
          'col_list' (the list of complicated datetime column names)
    Return: 'df' (the dataframe already cleaned)
    """
    # Simple data type conversions to int
    df["number_of_alarms"] = df["number_of_alarms"].astype(int)
    df["unit_sequence_in_call_dispatch"] = df["unit_sequence_in_call_dispatch"].astype(int)
    
    # Simple datetime conversions
    df["call_date"] = pd.to_datetime(df["call_date"], format="%Y-%m-%d")
    df["watch_date"] = pd.to_datetime(df["watch_date"], format="%Y-%m-%d")
    
    # For every value in the provided complicated datetime column list, change data types to datetimes
    for val in col_list:
        df[val] = pd.to_datetime(df[val], format="%Y-%m-%dT%H:%M:%S.%f")
        
    # Day of The Week, Time of Day, and Season columns
    # Source: https://stackoverflow.com/questions/25146121/extracting-just-month-and-year-from-pandas-datetime-column-python
    df["year"] = pd.DatetimeIndex(df["call_date"]).year
    df["month"] = pd.DatetimeIndex(df["call_date"]).month
    df["dotw"] = df["call_date"].dt.day_name()
    df["hour"] = pd.DatetimeIndex(df["received_dttm"]).hour
    df["time_of_day"] = df["hour"].apply(lambda row: get_time_of_day(row))
    df["season"] = df["month"].apply(lambda row: get_season(row))
    
    # Lat/Long
    df["lat"] = [i["coordinates"][0] for i in df["location"]]
    df["long"] = [i["coordinates"][1] for i in df["location"]]
    
    return df

In [7]:
def get_clean_query(SoQL_query_timeframe,
                    date_and_time_col_list,
                    lodate=0,
                    hidate=0):
    """
    Queries data from API of the San Francisco government website, using a specified timeframe.
    Cleans the data, and returns the clean dataframe. Also returns a date-subsetted dataframe
    depending on user choice.
    
    Args: 'SoQL_query_timeframe' (a SQL-like query using SocrataQL),
          'date_and_time_col_list' (the list of complicated datetime column names),
          'lodate' (optional field, subsets by lowest date)
          'hidate' (optional field, subsets by highest date)
    Returns:
    """
    # Query according the timeframe that user specifies
    df_query = api_query(SoQL_query_timeframe)
    # Clean all datatypes, add columns, etc.
    clean_df = clean_my_data(df=df_query, col_list=date_and_time_col_list)
    
    # Subset df by date if desired
    if (lodate == 0) & (hidate == 0):
        date_subset_df = "Not Specified"
    else:
        if lodate != 0:
            date_subset_df = clean_df[clean_df["watch_date"] >= lodate]
        if hidate !=0:
            date_subset_df = clean_df[clean_df["watch_date"] <= hidate]
    
    
    # Return the original df (all dates) and the date subsetted df
    return clean_df, date_subset_df

In [15]:
post_2015 = 'watch_date>="2015-01-01T00:00:00"'
date_column_list = ["received_dttm", "entry_dttm", "dispatch_dttm", "response_dttm",
                    "on_scene_dttm", "transport_dttm", "hospital_dttm", "available_dttm"]
low_date = "2015-01-01"
high_date = "2018-11-20"

In [None]:
df_2015, df1 = get_clean_query(SoQL_query_timeframe=post_2015,
                               date_and_time_col_list=date_column_list,
                               lodate=low_date,
                               hidate=high_date)

In [69]:
df1.dtypes

address                                      object
als_unit                                       bool
available_dttm                       datetime64[ns]
battalion                                    object
box                                          object
call_date                            datetime64[ns]
call_final_disposition                       object
call_number                                  object
call_type                                    object
call_type_group                              object
city                                         object
dispatch_dttm                        datetime64[ns]
entry_dttm                           datetime64[ns]
final_priority                               object
fire_prevention_district                     object
hospital_dttm                        datetime64[ns]
incident_number                              object
location                                     object
neighborhoods_analysis_boundaries            object
number_of_al

In [70]:
# This list makes a lot of sense, not every incident requires a hospital visit, transportation, etc.
# Unfortunately we are missing a lot of on-scene date times though???
df1.isna().sum().sort_values(ascending=False).head(11)

hospital_dttm          859794
transport_dttm         852352
on_scene_dttm          237318
response_dttm           32153
city                     2921
station_area             2213
zipcode_of_incident      1525
call_type_group           848
available_dttm            250
original_priority          99
box                        53
dtype: int64

In [71]:
df1["watch_date"].max()

Timestamp('2018-11-20 00:00:00')

In [72]:
df1["time_of_day"].unique()

array(['morning', 'evening', 'night', 'afternoon'], dtype=object)

In [73]:
df1.sample()

Unnamed: 0,address,als_unit,available_dttm,battalion,box,call_date,call_final_disposition,call_number,call_type,call_type_group,city,dispatch_dttm,entry_dttm,final_priority,fire_prevention_district,hospital_dttm,incident_number,location,neighborhoods_analysis_boundaries,number_of_alarms,on_scene_dttm,original_priority,priority,received_dttm,response_dttm,rowid,station_area,supervisor_district,transport_dttm,unit_id,unit_sequence_in_call_dispatch,unit_type,watch_date,zipcode_of_incident,year,month,dotw,hour,time_of_day,lat,long
395941,POTRERO AV/CESAR CHAVEZ ST,True,2016-05-01 16:47:42,B10,2624,2016-05-01,Code 2 Transport,161222673,Medical Incident,Non Life-threatening,San Francisco,2016-05-01 16:37:58,2016-05-01 16:37:38,2,10,NaT,16048482,"{'type': 'Point', 'coordinates': [-122.4050073...",Bernal Heights,1,2016-05-01 16:41:21,2,2,2016-05-01 16:36:53,2016-05-01 16:39:33,161222673-E09,9,9,NaT,E09,2,ENGINE,2016-05-01,94110,2016,5,Sunday,16,afternoon,-122.405007,37.749006
