In [37]:
import os
from urllib import request 
import joblib
import pandas as pd
from collections import defaultdict

## Create a folder to save data files.

In [18]:
# Create the data folder. exist_ok=True makes this a no-op when the folder is
# already there and avoids the race between checking and creating.
os.makedirs('data', exist_ok=True)

In [20]:
def download_data(name_dir="data", filename="data.csv",
                  data_url = ('https://www2.census.gov/'
                              'programs-surveys/popest/datasets/'
                              '2010-2017/cities/totals/sub-est2017_42.csv')):
    """
    Download a single file into a local directory, unless it is already there.

    The directory is created when it does not exist. The download is written
    to a temporary ``<filename>.part`` file first and renamed only after it
    has completed, so an interrupted download never leaves a truncated file
    that later calls would mistake for a finished one.

    Parameters
    ----------
    name_dir : str, default 'data'
        Name of the directory that receives the file.
    filename : str, default 'data.csv'
        Name of the file inside ``name_dir``.
    data_url : str, default 'https://www2.census.gov/.../sub-est2017_42.csv'
        The URL of the location of the file.

    Returns
    -------
    None
        Creates the directory if needed and downloads the file into it.
        Nothing is downloaded when the target file already exists.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises when the URL cannot be
    fetched (e.g. ``urllib.error.URLError``); no file is left at the target
    path in that case.
    """

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(name_dir, exist_ok=True)

    data_path = os.path.join(name_dir, filename)

    # An existing file is treated as a completed download and kept as is.
    if os.path.isfile(data_path):
        return

    partial_path = data_path + '.part'
    try:
        request.urlretrieve(data_url, partial_path)
        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, data_path)
    finally:
        # After a failed download, remove whatever was written so far.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    

def save_obj(obj, name, name_dir='data'):
    """
    Serialize an object to ``<name_dir>/<name>.pkl`` with joblib.

    Parameters
    ----------
    obj : any object
        The object to persist, e.g. a dictionary or an ndarray.
    name : str
        Base name of the output file; the '.pkl' extension is appended.
    name_dir : str, default 'data'
        Directory that receives the file. It is created when missing.

    Returns
    -------
    None
        The object is written to the local file system.
    """
    # Make sure the target directory is there before writing into it.
    directory_exists = os.path.isdir(name_dir)
    if not directory_exists:
        os.makedirs(name_dir)

    pickle_name = name + '.pkl'
    with open(os.path.join(name_dir, pickle_name), 'wb') as handle:
        joblib.dump(obj, handle)

def load_obj(name, name_dir='data'):
    """
    Read back an object stored as ``<name_dir>/<name>.pkl``.

    Parameters
    ----------
    name : str
        Base name of the file to load; the '.pkl' extension is appended.
    name_dir : str, default 'data'
        Directory that contains the file.

    Returns
    -------
    object
        The deserialized object, such as a dictionary.

    Notes
    -----
    Loading goes through ``joblib.load``, which unpickles the file; only
    load files that come from a trusted source.
    """
    pickle_path = os.path.join(name_dir, name + '.pkl')

    with open(pickle_path, 'rb') as handle:
        loaded = joblib.load(handle)
    return loaded

## Download EMS dispatch data

In [21]:
# Location of the 911 EMS dispatches CSV on data.wprdc.org.
url_ems = ('https://data.wprdc.org/dataset/'
           'abba9671-a026-4270-9c83-003a1414d628/resource/'
           'ff33ca18-2e0c-4cb5-bdcd-60a5dc3c0418/download/'
           '911-ems-dispatches.csv')

In [22]:
# Fetch the EMS dispatch CSV into data/EMS.csv. download_data skips the
# request when that file already exists.
download_data(name_dir="data", 
              filename="EMS.csv",
              data_url=url_ems)

In [23]:
# Load the downloaded CSV into a DataFrame and preview the first rows.
df_ems = pd.read_csv('data/EMS.csv')
df_ems.head()


Unnamed: 0,Call_ID_Hash,SERVICE,PRIORITY,PRIORITY_DESC,AGENCY,CALL_QUARTER,CALL_YEAR,DESCRIPTION_SHORT,CITY_CODE,CITY_NAME,GEOID
0,336ba345f5a0,EMS,E0,EMS ALS life threatening response w/ backup,E620,Q1,2015,HEMORRHAGE,PVU,PORT VUE,420035003002
1,f59d50d394fa,EMS,E0,EMS ALS life threatening response w/ backup,E380,Q1,2015,UNCONSCIOUS,PEN,PENN HILLS,420035235011
2,8d56b4f6f9fe,EMS,E0,EMS ALS life threatening response w/ backup,E620,Q1,2015,UNCONSCIOUS,MCK,MCKEESPORT,420035523003
3,e20045d2f6b4,EMS,E0,EMS ALS life threatening response w/ backup,E050,Q1,2015,"GUNSHOT, STABBING, OR OTHER WOUND",PGH,PITTSBURGH,420031306003
4,dbe1c9db46f6,EMS,E0,EMS ALS life threatening response w/ backup,E050,Q1,2015,ASSAULT,PGH,PITTSBURGH,420031609002


## Check Missing Values

In [24]:
# Column dtypes and non-null counts; a column whose non-null count is below
# the number of entries has missing values.
df_ems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780698 entries, 0 to 780697
Data columns (total 11 columns):
Call_ID_Hash         780698 non-null object
SERVICE              780698 non-null object
PRIORITY             780698 non-null object
PRIORITY_DESC        780698 non-null object
AGENCY               780698 non-null object
CALL_QUARTER         780698 non-null object
CALL_YEAR            780698 non-null int64
DESCRIPTION_SHORT    778103 non-null object
CITY_CODE            780634 non-null object
CITY_NAME            780611 non-null object
GEOID                780698 non-null int64
dtypes: int64(2), object(9)
memory usage: 65.5+ MB


In [25]:
def getCodeName(geoid,df):
    """
    Return the most frequent city name and its city code for a given GEOID.

    Only rows of ``df`` with a non-null CITY_NAME are considered. The city
    name is the most frequent CITY_NAME among the rows with this GEOID; the
    city code is the most frequent CITY_CODE among the rows carrying that
    name.

    Parameters
    ----------
    geoid : int
        GEOID value to look up.
    df : DataFrame
        Frame with the columns GEOID, CITY_NAME and CITY_CODE.

    Returns
    -------
    tuple
        ``(city_name, city_code)`` -- note the name comes first.

    Raises
    ------
    IndexError
        If ``df`` has no row with this GEOID and a non-null CITY_NAME, or if
        none of the rows with the selected name has a CITY_CODE.
    """
    # Build the mask from the frame that was passed in, not from a global.
    has_name_for_geoid = (df.GEOID == geoid) & df.CITY_NAME.notnull()
    candidates = df.loc[has_name_for_geoid, ['CITY_CODE', 'CITY_NAME']]

    if candidates.empty:
        raise IndexError(
            'no row with a city name found for GEOID {!r}'.format(geoid))

    # value_counts() sorts by descending frequency, so index[0] is the mode.
    city_name = candidates['CITY_NAME'].value_counts().index[0]
    codes_for_name = candidates.loc[candidates['CITY_NAME'] == city_name,
                                    'CITY_CODE']
    city_code = codes_for_name.value_counts().index[0]
    return city_name,city_code

def nested_dict():
    """
    Build an empty dictionary that nests itself on demand.

    Returns
    -------
    defaultdict
        A ``defaultdict`` whose default factory is ``nested_dict``, so
        looking up a missing key at any depth yields another such dictionary.
    """
    tree = defaultdict(nested_dict)
    return tree




In [26]:
# Spot-check the lookup on a single GEOID.
getCodeName(420035629002,df_ems)

('PITTSBURGH', 'PGH')

In [27]:
# Row labels where CITY_NAME and/or CITY_CODE is missing; preview the first ten.
index_for_missing_values = df_ems.index[
    df_ems['CITY_NAME'].isnull() | df_ems['CITY_CODE'].isnull()
].tolist()
index_for_missing_values[:10]

[7200, 7237, 29818, 45426, 51944, 69156, 104522, 104614, 107710, 114511]

In [28]:
# Fill the missing values
# For every listed row, look up the most frequent city name/code recorded for
# the row's GEOID and write both back into the frame.
# NOTE(review): index_for_missing_values also lists rows where only one of the
# two fields is missing, and both fields are overwritten unconditionally --
# confirm that replacing an existing CITY_NAME/CITY_CODE is intended.
# NOTE(review): each getCodeName call filters the whole frame, once per row.
for index in index_for_missing_values:
    geoid = df_ems.loc[index,'GEOID']
    ct_name, ct_code = getCodeName(geoid,df_ems)
    
    df_ems.loc[index,'CITY_NAME'] = ct_name
    df_ems.loc[index,'CITY_CODE'] = ct_code

# Recheck missing at CITY_CODE and CITY_NAME
# An empty list means every row now has both fields.
df_ems[(df_ems.CITY_NAME.isnull()) | (df_ems.CITY_CODE.isnull())].index.tolist()


[]

In [29]:
# Confirm that CITY_CODE and CITY_NAME no longer have missing entries.
df_ems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780698 entries, 0 to 780697
Data columns (total 11 columns):
Call_ID_Hash         780698 non-null object
SERVICE              780698 non-null object
PRIORITY             780698 non-null object
PRIORITY_DESC        780698 non-null object
AGENCY               780698 non-null object
CALL_QUARTER         780698 non-null object
CALL_YEAR            780698 non-null int64
DESCRIPTION_SHORT    778103 non-null object
CITY_CODE            780698 non-null object
CITY_NAME            780698 non-null object
GEOID                780698 non-null int64
dtypes: int64(2), object(9)
memory usage: 65.5+ MB


In [30]:
# Priority descriptions of the calls that have no DESCRIPTION_SHORT.
df_ems.loc[df_ems['DESCRIPTION_SHORT'].isnull(), 'PRIORITY_DESC'].value_counts()

EMS Admin Mark out                             2491
EMS ALS life threatening response                31
EMS Standard BLS response                        31
EMS Standard ALS response                        21
EMS ALS life threatening response w/ backup      12
EMS BLS Assistance response                       8
Fire Notifications                                1
Name: PRIORITY_DESC, dtype: int64

In [31]:
# Label calls without a short description explicitly.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which recent pandas
# versions warn about and which does not update df_ems under Copy-on-Write.
df_ems['DESCRIPTION_SHORT'] = df_ems['DESCRIPTION_SHORT'].fillna('MISSING')

In [32]:
# Confirm that DESCRIPTION_SHORT no longer has missing entries.
df_ems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 780698 entries, 0 to 780697
Data columns (total 11 columns):
Call_ID_Hash         780698 non-null object
SERVICE              780698 non-null object
PRIORITY             780698 non-null object
PRIORITY_DESC        780698 non-null object
AGENCY               780698 non-null object
CALL_QUARTER         780698 non-null object
CALL_YEAR            780698 non-null int64
DESCRIPTION_SHORT    780698 non-null object
CITY_CODE            780698 non-null object
CITY_NAME            780698 non-null object
GEOID                780698 non-null int64
dtypes: int64(2), object(9)
memory usage: 65.5+ MB


In [33]:
# Distinct city names found in the data; preview the first ten.
ct_names = df_ems['CITY_NAME'].unique().tolist()
ct_names[:10]

['PORT VUE',
 'PENN HILLS',
 'MCKEESPORT',
 'PITTSBURGH',
 'ROSS',
 'SOUTH FAYETTE',
 'FRANKLIN PARK',
 'DUQUESNE',
 'PITCAIRN',
 'WEST MIFFLIN']

In [34]:
# Inspect the rows whose CITY_NAME is 'WESTMORELAND COUNTY'.
df_ems.loc[df_ems['CITY_NAME'].isin(['WESTMORELAND COUNTY'])]

Unnamed: 0,Call_ID_Hash,SERVICE,PRIORITY,PRIORITY_DESC,AGENCY,CALL_QUARTER,CALL_YEAR,DESCRIPTION_SHORT,CITY_CODE,CITY_NAME,GEOID
211893,eb764b289f44,EMS,E1,EMS ALS life threatening response,E590,Q4,2015,COMMERCIAL OR APARTMENT BLDG FIRE,WST,WESTMORELAND COUNTY,420034961023


In [35]:
# Remove the 'WESTMORELAND COUNTY' rows in place and report the row count
# before and after. The remaining rows keep their original index labels.
print("No. of rows before dropping = ", len(df_ems))
df_ems.drop(
    index=df_ems.index[df_ems['CITY_NAME'].isin(['WESTMORELAND COUNTY'])],
    inplace=True,
)
print("No. of rows after dropping = ", len(df_ems))

No. of rows before dropping =  780698
No. of rows after dropping =  780697


In [38]:
# Save data after filling missing values.
# The frame is written to data/df_ems.pkl and read straight back, so the
# cells below work on the reloaded copy.
save_obj(df_ems,'df_ems')
df_ems = load_obj('df_ems')

In [39]:
# Preview the reloaded frame.
df_ems.head()

Unnamed: 0,Call_ID_Hash,SERVICE,PRIORITY,PRIORITY_DESC,AGENCY,CALL_QUARTER,CALL_YEAR,DESCRIPTION_SHORT,CITY_CODE,CITY_NAME,GEOID
0,336ba345f5a0,EMS,E0,EMS ALS life threatening response w/ backup,E620,Q1,2015,HEMORRHAGE,PVU,PORT VUE,420035003002
1,f59d50d394fa,EMS,E0,EMS ALS life threatening response w/ backup,E380,Q1,2015,UNCONSCIOUS,PEN,PENN HILLS,420035235011
2,8d56b4f6f9fe,EMS,E0,EMS ALS life threatening response w/ backup,E620,Q1,2015,UNCONSCIOUS,MCK,MCKEESPORT,420035523003
3,e20045d2f6b4,EMS,E0,EMS ALS life threatening response w/ backup,E050,Q1,2015,"GUNSHOT, STABBING, OR OTHER WOUND",PGH,PITTSBURGH,420031306003
4,dbe1c9db46f6,EMS,E0,EMS ALS life threatening response w/ backup,E050,Q1,2015,ASSAULT,PGH,PITTSBURGH,420031609002
