# ETL Pipeline- Working Copy

## Imports

In [1]:
# Std lib:
import os
import warnings

# Querying data:
from google.cloud import bigquery

# Data manipulation:
import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon

# Visualization:
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline
style.use('seaborn')

# Display all columns in Jupyter:
from IPython.display import display
pd.options.display.max_columns = None

# Filter Warnings
warnings.filterwarnings('ignore')

## Configure credentials

In [2]:
key_location = # Your key location here (remove before commit). Maybe we can use os to configure this automatically?
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_location

## Set up client

In [3]:
client = bigquery.Client()
dataset_ref = client.dataset("chicago_crime", project="bigquery-public-data")
dataset = client.get_dataset(dataset_ref)
table_ref = dataset_ref.table("crime")
table = client.get_table(table_ref)

## Query the Crime Table

In [18]:
# ENTER YOUR QUERY HERE:

MAX_GB = 4 # Change this if desired

QUERY = """
SELECT *
FROM `bigquery-public-data.chicago_crime.crime`
WHERE year in (2017, 2018, 2019, 2020)
"""

In [19]:
def safe_query_to_dataframe(client, table, sql_query, max_gb=0):
    """
    Wrapper function for bigquery.client.query.  Will throw an error if the query exceeds the desired limit.
        
    params
        > client: a bigquery client object
        > table: a bigquery table object
        > sql_query (string): an SQL query on the table
        > max_gb (int): GB limit of query
        
    returns
        > Error: if query size exceeds limit
        > Dataframe: Dataframe representation of the query
    """
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    query_job = client.query(sql_query, job_config=job_config)
    gbs_used = query_job.total_bytes_processed / 1e9
    assert gbs_used < max_gb, f"This query will process {gbs_used} GB, which exceeds your desired limit of {max_gb} GB."
    query_job = client.query(sql_query)
    return query_job.to_dataframe()

In [20]:
df = safe_query_to_dataframe(client=client, table=table, sql_query=QUERY, max_gb=MAX_GB)

## Pipeline

In [21]:
# We can remove objects (ie. rows) with any null values since we have huge dataset with millions of rows
def remove_null_objects(df):
    """
    Removes objects with any null values -> less than 3000 rows dropped
    """
    df.dropna(inplace = True)
    
    
def add_dt_attributes(df):
    """
    Adds datetime attributes for easier querying
    """
    df["month"] = pd.to_numeric(df.date.dt.month, downcast="unsigned")
    df["hour"] = pd.to_numeric(df.date.dt.hour, downcast="unsigned")
    df["dayofweek"] = pd.to_numeric(df.date.dt.dayofweek, downcast="unsigned")
    df["weekend"] = df.date.dt.dayofweek >= 5
    

def remove_irrelevant_attributes(df): 
    """
    Remove attributes not needed for EDA or data mining 
    """
    df.drop(columns = ["unique_key", "case_number", "updated_on"], inplace = True)
    

def add_fbi_code_description(df):
    """
    Add fbi_code_description attribute for details and querying
    """
    df["fbi_code_description"] = np.where(df.fbi_code.str.startswith("01"), "Murder", 
                                 np.where(df.fbi_code.str.startswith("02"), "Rape", 
                                 np.where(df.fbi_code.str.startswith("03"), "Robbery",
                                 np.where(df.fbi_code.str.startswith("04"), "Assault",
                                 np.where(df.fbi_code.str.startswith("05"), "Burglary",
                                 np.where(df.fbi_code.str.startswith("06"), "Theft",
                                 np.where(df.fbi_code.str.startswith("07"), "Auto Theft",
                                 np.where(df.fbi_code.str.startswith("09"), "Forgery",
                                 np.where(df.fbi_code.str.startswith("10"), "Fraud/Embezzlement",
                                 np.where(df.fbi_code.str.startswith("11"), "Shots Fired",
                                 np.where(df.fbi_code.str.startswith("13"), "Prostitution",
                                 np.where(df.fbi_code.str.startswith("14"), "Indecent Exposure",
                                 np.where(df.fbi_code.str.startswith("15"), "Domestic Violence",
                                 np.where(df.fbi_code.str.startswith("16"), "Narcotics",
                                 np.where(df.fbi_code.str.startswith("17"), "Violation of liquor laws",
                                 np.where(df.fbi_code.str.startswith("18"), "Intoxicated subject",
                                 np.where(df.fbi_code.str.startswith("19"), "Disorderly conduct",
                                 np.where(df.fbi_code.str.startswith("20"), "Vagrants",
                                 np.where(df.fbi_code.str.startswith("21"), "Gambling",
                                 np.where(df.fbi_code.str.startswith("22"), "DWI",
                                 np.where(df.fbi_code.str.startswith("23"), "Reckless driving",
                                 np.where(df.fbi_code.str.startswith("24"), "Suspicious vehicle/person",
                                 np.where(df.fbi_code.str.startswith("26"), "All others: arson/vandalism",
                                 np.where(df.fbi_code.str.startswith("27"), "Are you in trouble",
                                 np.where(df.fbi_code.str.startswith("28"), "Frequent patrol",
                                 np.where(df.fbi_code.str.startswith("29"), "Dog bite",
                                 np.where(df.fbi_code.str.startswith("30"), "Suicide",
                                 np.where(df.fbi_code.str.startswith("31"), "Miscellaneous deaths",
                                 np.where(df.fbi_code.str.startswith("32"), "Nature call",
                                 np.where(df.fbi_code.str.startswith("33"), "Welfare check",
                                 np.where(df.fbi_code.str.startswith("34"), "Affray in progress",
                                 np.where(df.fbi_code.str.startswith("35"), "Alarm call",
                                 np.where(df.fbi_code.str.startswith("40"), "En-route to district",
                                 np.where(df.fbi_code.str.startswith("41"), "En-route to home", "Unidentified"))))))))))))))))))))))))))))))))))
    df["fbi_code_description"] = df["fbi_code_description"].astype("category")
                                                                     

def convert_coords_to_geometry(df):
    """
    Converts lat/lon attributes to more usable geometry objects
    """
    df['geometry'] = list(zip(df.longitude, df.latitude))
    df['geometry'] = df['geometry'].apply(Point)
    df = geopandas.GeoDataFrame(df, geometry='geometry')
    
    

def join_with_econ(df):
    """
    Joins crime data with revised_econ data
    """
    econ_df = pd.read_excel("../data/cfnai/cfnai-realtime-revised.xlsx")
    econ_df['month'] = econ_df.Month.dt.month
    econ_df['month'] = pd.to_numeric(econ_df.month, downcast="unsigned")
    econ_df['year'] = econ_df.Month.dt.year
    econ_df.drop("Month", inplace=True, axis=1)
    return pd.merge(df, econ_df) # Can't do this inplace :(


def etl_pipeline(df):
    """
    This is the whole pipeline.  
    Add function calls to mutate the inputted dataframe into something that we can work with.
    """
    remove_null_objects(df)
    add_dt_attributes(df)
    remove_irrelevant_attributes(df)
    add_fbi_code_description(df)
    convert_coords_to_geometry(df)
    df = join_with_econ(df)
    return df

## Check out the Results

### Original

In [22]:
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,11193972,JA536982,2017-12-04 21:44:16+00:00,053XX W ARGYLE ST,2022,NARCOTICS,POSS: COCAINE,RESIDENCE,True,False,1623,16.0,45.0,11,18,,,2017,2018-04-07 15:54:56+00:00,,,
1,11733722,JC319764,2019-06-19 16:00:00+00:00,045XX W THORNDALE AVE,5001,OTHER OFFENSE,OTHER CRIME INVOLVING PROPERTY,RESIDENTIAL YARD (FRONT/BACK),False,False,1711,17.0,39.0,12,26,1145270.0,1938820.0,2019,2019-06-30 15:56:27+00:00,41.988126,-87.741051,"(41.988126496, -87.741051435)"
2,11741621,JC329399,2019-06-30 22:45:00+00:00,093XX S BENNETT AVE,650,BURGLARY,HOME INVASION,APARTMENT,False,True,413,4.0,8.0,48,05,1190473.0,1843564.0,2019,2019-07-07 16:14:54+00:00,41.725762,-87.577876,"(41.725762292, -87.57787575)"
3,11744966,JC333384,2019-07-03 15:57:00+00:00,032XX W 109TH ST,440,BATTERY,AGG: HANDS/FIST/FEET NO/MINOR INJURY,STREET,True,False,2211,22.0,19.0,74,08B,1156688.0,1832156.0,2019,2019-11-07 15:51:53+00:00,41.695204,-87.701939,"(41.695203685, -87.70193895)"
4,11754466,JC344832,2019-07-02 08:00:00+00:00,012XX E 78TH ST,495,BATTERY,AGGRAVATED OF A SENIOR CITIZEN,APARTMENT,False,False,411,4.0,8.0,45,04B,1185660.0,1853526.0,2019,2019-07-19 16:09:50+00:00,41.753214,-87.595193,"(41.753213625, -87.595192816)"


#### Check for missing data to resolve them

In [23]:
# There are null values in location_description, x_coordinate, y_coordinate, latitude, longitude and location
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 970489 entries, 0 to 970488
Data columns (total 22 columns):
unique_key              970489 non-null int64
case_number             970489 non-null object
date                    970489 non-null datetime64[ns, UTC]
block                   970489 non-null object
iucr                    970489 non-null object
primary_type            970489 non-null object
description             970489 non-null object
location_description    966237 non-null object
arrest                  970489 non-null bool
domestic                970489 non-null bool
beat                    970489 non-null int64
district                970488 non-null float64
ward                    970461 non-null float64
community_area          970489 non-null int64
fbi_code                970489 non-null object
x_coordinate            951634 non-null float64
y_coordinate            951634 non-null float64
year                    970489 non-null int64
updated_on              970489 non

In [24]:
# A lot of unknown location_descriptions
df[df.location_description.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
87,11703065,JC282341,2019-05-28 13:30:00+00:00,021XX S CHINA PL,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,914,9.0,25.0,34,11,1174606.0,1890151.0,2019,2019-06-30 15:56:27+00:00,41.853969,-87.63461,"(41.853969331, -87.634610162)"
97,11826168,JC430502,2019-09-04 18:50:00+00:00,127XX S MANISTEE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,433,4.0,10.0,55,11,1196750.0,1821088.0,2019,2019-09-15 15:59:13+00:00,41.663932,-87.555627,"(41.663932409, -87.555626566)"
100,11843016,JC450453,2018-07-23 11:30:00+00:00,009XX E 111TH ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,531,5.0,9.0,50,11,,,2018,2019-09-28 16:01:29+00:00,,,
106,11898754,JC519229,2019-11-06 14:35:00+00:00,099XX S WINCHESTER AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,2213,22.0,19.0,72,11,1165216.0,1838748.0,2019,2019-11-24 15:53:02+00:00,41.713117,-87.670529,"(41.713117439, -87.67052926)"
200,10881432,JA183411,2017-02-20 01:00:00+00:00,057XX N ODELL AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1612,16.0,41.0,10,11,1126356.0,1937208.0,2017,2018-02-10 15:50:01+00:00,41.984041,-87.810656,"(41.984040859, -87.810656239)"


In [25]:
# x_coordinate, y_coordinate, latitude, longitude and location have null values all on same objects
df[df.x_coordinate.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,11193972,JA536982,2017-12-04 21:44:16+00:00,053XX W ARGYLE ST,2022,NARCOTICS,POSS: COCAINE,RESIDENCE,True,False,1623,16.0,45.0,11,18,,,2017,2018-04-07 15:54:56+00:00,,,
30,12194140,JD400023,2020-08-09 06:00:00+00:00,058XX N LACEY AVE,1120,DECEPTIVE PRACTICE,FORGERY,RESIDENCE,False,False,1621,16.0,39.0,12,10,,,2020,2020-10-16 15:49:38+00:00,,,
31,12193921,JD399935,2020-10-15 02:43:00+00:00,056XX N HARLEM AVE,501A,OTHER OFFENSE,ANIMAL ABUSE / NEGLECT,VEHICLE NON-COMMERCIAL,False,False,1612,16.0,41.0,10,26,,,2020,2020-10-22 15:48:01+00:00,,,
32,12206440,JD414620,2020-09-26 01:40:00+00:00,078XX S AVALON AVE,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,APARTMENT,False,False,411,4.0,8.0,45,11,,,2020,2020-10-30 15:51:25+00:00,,,
100,11843016,JC450453,2018-07-23 11:30:00+00:00,009XX E 111TH ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,531,5.0,9.0,50,11,,,2018,2019-09-28 16:01:29+00:00,,,


#### Check for redundant data to resolve them

In [26]:
# Great! No duplicate objects
df.duplicated().sum()

0

### Pipelined

In [27]:
cp = df.copy()

In [28]:
cp = etl_pipeline(cp);

In [29]:
cp.head()

Unnamed: 0,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,latitude,longitude,location,month,hour,dayofweek,weekend,fbi_code_description,geometry,CF,CF3,PI,EUH,CH,SOI
0,2019-06-19 16:00:00+00:00,045XX W THORNDALE AVE,5001,OTHER OFFENSE,OTHER CRIME INVOLVING PROPERTY,RESIDENTIAL YARD (FRONT/BACK),False,False,1711,17.0,39.0,12,26,1145270.0,1938820.0,2019,41.988126,-87.741051,"(41.988126496, -87.741051435)",6,16,2,False,All others: arson/vandalism,POINT (-87.74105 41.98813),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
1,2019-06-30 22:45:00+00:00,093XX S BENNETT AVE,650,BURGLARY,HOME INVASION,APARTMENT,False,True,413,4.0,8.0,48,05,1190473.0,1843564.0,2019,41.725762,-87.577876,"(41.725762292, -87.57787575)",6,22,6,True,Burglary,POINT (-87.57788 41.72576),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
2,2019-06-12 14:30:00+00:00,021XX E 87TH ST,545,ASSAULT,PRO EMP HANDS NO/MIN INJURY,"SCHOOL, PUBLIC, GROUNDS",False,False,412,4.0,8.0,45,08A,1191768.0,1847705.0,2019,41.737094,-87.572998,"(41.737094305, -87.572998178)",6,14,2,False,Unidentified,POINT (-87.57300 41.73709),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
3,2019-06-17 14:00:00+00:00,096XX S AVENUE L,498,BATTERY,AGGRAVATED DOMESTIC BATTERY: HANDS/FIST/FEET S...,APARTMENT,False,True,432,4.0,10.0,52,04B,1201786.0,1841740.0,2019,41.720477,-87.536498,"(41.720477181, -87.536498395)",6,14,0,False,Assault,POINT (-87.53650 41.72048),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
4,2019-06-03 00:01:00+00:00,089XX S DORCHESTER AVE,1751,OFFENSE INVOLVING CHILDREN,CRIM SEX ABUSE BY FAM MEMBER,RESIDENCE,False,True,413,4.0,8.0,48,17,1187019.0,1845781.0,2019,41.731928,-87.590458,"(41.731928463, -87.590457683)",6,0,0,False,Violation of liquor laws,POINT (-87.59046 41.73193),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077


In [30]:
cp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 923997 entries, 0 to 923996
Data columns (total 31 columns):
date                    923997 non-null datetime64[ns, UTC]
block                   923997 non-null object
iucr                    923997 non-null object
primary_type            923997 non-null object
description             923997 non-null object
location_description    923997 non-null object
arrest                  923997 non-null bool
domestic                923997 non-null bool
beat                    923997 non-null int64
district                923997 non-null float64
ward                    923997 non-null float64
community_area          923997 non-null int64
fbi_code                923997 non-null object
x_coordinate            923997 non-null float64
y_coordinate            923997 non-null float64
year                    923997 non-null int64
latitude                923997 non-null float64
longitude               923997 non-null float64
location                923997 

In [32]:
# Export cleaned data
# cp.to_csv('../data/cleaned_data.csv', index=False) # Uncommenting and running it will overwrite the DW