# ETL Pipeline - Working Copy

## Imports

In [214]:
import os
from google.cloud import bigquery
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import warnings
from IPython.display import display

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
%matplotlib inline

# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8') - confirm 'seaborn' still resolves on the installed version.
style.use('seaborn')

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

## Configure credentials

In [215]:
# Location of the service-account JSON key. Export GOOGLE_APPLICATION_CREDENTIALS in the
# shell that launches Jupyter rather than typing a path here, so no machine-specific
# path is ever committed with the notebook.
key_location = os.path.expanduser(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', ''))
if key_location:
    # Write the "~"-expanded path back so the client library sees a usable path.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_location
else:
    # Not fatal on its own: the client may still find other default credentials.
    print("GOOGLE_APPLICATION_CREDENTIALS is not set; "
          "the BigQuery client will only work if other default credentials are available.")

## Set up client

In [216]:
# Client bound to the credentials/project configured above.
client = bigquery.Client()
# Build the reference directly: Client.dataset() is deprecated in google-cloud-bigquery
# in favour of the DatasetReference constructor (project first, then dataset id).
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "chicago_crime")
# Fetch the dataset and table metadata from the API.
dataset = client.get_dataset(dataset_ref)
table_ref = dataset_ref.table("crime")
table = client.get_table(table_ref)

## Query the Crime Table

In [217]:
# ENTER YOUR QUERY HERE:

# Limit, in GB, on how much data the query may process. It is passed as max_gb to
# safe_query_to_dataframe, which checks a dry-run estimate against it before running the query.
MAX_GB = 2 # Change this if desired

# SQL sent to BigQuery: every column of the public Chicago crime table, restricted to 2016.
QUERY = """
SELECT *
FROM `bigquery-public-data.chicago_crime.crime`
WHERE year in (2016)
"""

In [218]:
def safe_query_to_dataframe(client, table, sql_query, max_gb=0):
    """
    Wrapper function for bigquery.client.query that refuses to run oversized queries.

    The query is first submitted as a dry run, which only reports how many bytes
    it would process.  The real query is sent only if that estimate does not
    exceed max_gb.

    params
        > client: a bigquery client object
        > table: a bigquery table object (not used by this function; kept so existing calls keep working)
        > sql_query (string): an SQL query on the table
        > max_gb (int or float): GB limit of query, where 1 GB = 1e9 bytes.
          With the default of 0 only a query estimated at 0 bytes is allowed,
          so callers normally pass a limit explicitly.

    returns
        > Dataframe: Dataframe representation of the query

    raises
        > AssertionError: if the estimated query size exceeds max_gb
    """
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    dry_run_job = client.query(sql_query, job_config=dry_run_config)
    gbs_used = dry_run_job.total_bytes_processed / 1e9
    # Raised explicitly (not via `assert`) so the guard still runs under `python -O`;
    # the exception type is unchanged for existing callers.
    if gbs_used > max_gb:
        raise AssertionError(
            f"This query will process {gbs_used} GB, which exceeds your desired limit of {max_gb} GB."
        )
    # Estimate is within the limit: run the query for real and materialise the result.
    return client.query(sql_query).to_dataframe()

In [219]:
# Run the size-checked query. df holds the raw result; the pipeline demo further down works on a copy of it.
df = safe_query_to_dataframe(client=client, table=table, sql_query=QUERY, max_gb=MAX_GB)

## Pipeline

In [220]:
# Dropping every object (i.e. row) that contains a null value is acceptable here
# because the dataset is huge (millions of rows).
def remove_null_objects(df):
    """
    Discards, in place, each row of df that has a null in at least one column.
    Fewer than 3000 rows are dropped.
    """
    df.dropna(axis="index", how="any", inplace=True)
    
    
def add_dt_attributes(df):
    """
    Derives calendar columns from df.date, in place, for easier querying:
    month, hour and dayofweek (each downcast to the smallest unsigned
    integer type) plus a boolean weekend flag (dayofweek of 5 or more).
    """
    stamp = df["date"].dt
    weekday = stamp.dayofweek

    def as_small_uint(values):
        # Same downcast for each of the integer-valued columns.
        return pd.to_numeric(values, downcast="unsigned")

    df["month"] = as_small_uint(stamp.month)
    df["hour"] = as_small_uint(stamp.hour)
    df["dayofweek"] = as_small_uint(weekday)
    df["weekend"] = weekday >= 5
    

def remove_irrelevant_attributes(df):
    """
    Drops, in place, the columns that are not needed for EDA or data mining:
    unique_key, case_number and updated_on.
    """
    unneeded = ["unique_key", "case_number", "updated_on"]
    df.drop(labels=unneeded, axis="columns", inplace=True)
    

# Description for each two-character fbi_code prefix; any other prefix is "Unidentified".
# NOTE(review): these labels may not match this dataset's coding scheme - the sample output
# in this notebook pairs code 09 with primary_type ARSON and code 15 with WEAPONS VIOLATION.
# Confirm the table against the data source's code definitions before relying on the labels.
_FBI_CODE_DESCRIPTIONS = {
    "01": "Murder",
    "02": "Rape",
    "03": "Robbery",
    "04": "Assault",
    "05": "Burglary",
    "06": "Theft",
    "07": "Auto Theft",
    "09": "Forgery",
    "10": "Fraud/Embezzlement",
    "11": "Shots Fired",
    "13": "Prostitution",
    "14": "Indecent Exposure",
    "15": "Domestic Violence",
    "16": "Narcotics",
    "17": "Violation of liquor laws",
    "18": "Intoxicated subject",
    "19": "Disorderly conduct",
    "20": "Vagrants",
    "21": "Gambling",
    "22": "DWI",
    "23": "Reckless driving",
    "24": "Suspicious vehicle/person",
    "26": "All others: arson/vandalism",
    "27": "Are you in trouble",
    "28": "Frequent patrol",
    "29": "Dog bite",
    "30": "Suicide",
    "31": "Miscellaneous deaths",
    "32": "Nature call",
    "33": "Welfare check",
    "34": "Affray in progress",
    "35": "Alarm call",
    "40": "En-route to district",
    "41": "En-route to home",
}


def add_fbi_code_description(df):
    """
    Add fbi_code_description attribute for details and querying

    The first two characters of each fbi_code are looked up in
    _FBI_CODE_DESCRIPTIONS (so "04A" and "04B" both give "Assault").
    Codes whose prefix is not listed, and missing codes, are labelled
    "Unidentified".  The new column is added to df in place.
    """
    # Every key is exactly two characters, so one prefix lookup is equivalent to
    # testing str.startswith() against each key in turn.
    prefixes = df.fbi_code.str[:2]
    df["fbi_code_description"] = prefixes.map(_FBI_CODE_DESCRIPTIONS).fillna("Unidentified")
                                                                     
        

def etl_pipeline(df):
    """
    The complete pipeline: applies every cleaning/enrichment step to df, in order.
    Each step mutates the inputted dataframe; nothing is returned.
    To extend the pipeline, add another step function to the sequence below.
    """
    steps = (
        remove_null_objects,
        add_dt_attributes,
        remove_irrelevant_attributes,
        add_fbi_code_description,
    )
    for step in steps:
        step(df)

## Check out the Results

### Original

In [221]:
# First rows of the raw query result, before any pipeline step has been applied.
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,10389477,HZ126301,2016-01-23 19:25:00+00:00,021XX S ARCHER AVE,420,BATTERY,AGGRAVATED:KNIFE/CUTTING INSTR,RESTAURANT,False,False,914,9,25,34,04B,1174914.0,1889985.0,2016,2018-02-10 15:50:01+00:00,41.853507,-87.633485,"(41.853506932, -87.63348466)"
1,10390759,HZ127659,2016-01-25 05:40:00+00:00,062XX S SAYRE AVE,1020,ARSON,BY FIRE,VEHICLE NON-COMMERCIAL,False,False,812,8,23,64,09,1130811.0,1862162.0,2016,2018-02-10 15:50:01+00:00,41.778028,-87.795999,"(41.778028051, -87.795999196)"
2,10392601,HZ129127,2016-01-26 09:15:00+00:00,100XX S WOODLAWN AVE,545,ASSAULT,PRO EMP HANDS NO/MIN INJURY,"SCHOOL, PUBLIC, BUILDING",False,False,511,5,8,50,08A,1186531.0,1838828.0,2016,2018-02-10 15:50:01+00:00,41.71286,-87.592465,"(41.7128602, -87.592464611)"
3,10394255,HZ130555,2016-01-27 04:09:00+00:00,108XX S BUFFALO AVE,1477,WEAPONS VIOLATION,RECKLESS FIREARM DISCHARGE,PARK PROPERTY,False,False,432,4,10,52,15,1199874.0,1833672.0,2016,2018-02-10 15:50:01+00:00,41.698386,-87.543772,"(41.698386241, -87.543772373)"
4,10397507,HZ133742,2016-01-29 23:40:00+00:00,134XX S BRANDON AVE,496,BATTERY,AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST,APARTMENT,False,True,433,4,10,55,04B,1199444.0,1816395.0,2016,2018-02-10 15:50:01+00:00,41.650987,-87.545925,"(41.650987156, -87.54592485)"


#### Check for missing data that needs to be resolved

In [222]:
# There are null values in location_description, x_coordinate, y_coordinate, latitude, longitude and location
# (their non-null counts are lower than the total number of entries).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269429 entries, 0 to 269428
Data columns (total 22 columns):
unique_key              269429 non-null int64
case_number             269429 non-null object
date                    269429 non-null datetime64[ns, UTC]
block                   269429 non-null object
iucr                    269429 non-null object
primary_type            269429 non-null object
description             269429 non-null object
location_description    268211 non-null object
arrest                  269429 non-null bool
domestic                269429 non-null bool
beat                    269429 non-null int64
district                269429 non-null int64
ward                    269429 non-null int64
community_area          269429 non-null int64
fbi_code                269429 non-null object
x_coordinate            267230 non-null float64
y_coordinate            267230 non-null float64
year                    269429 non-null int64
updated_on              269429 non-nul

In [223]:
# Many rows have no location_description; preview a few of them.
df[df.location_description.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
58,10904569,JA216297,2016-03-01 09:25:00+00:00,069XX W 64TH PL,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,812,8,23,64,11,1131280.0,1860968.0,2016,2018-02-10 15:50:01+00:00,41.774743,-87.794307,"(41.774743429, -87.794307219)"
106,10608012,HZ359264,2016-07-06 23:00:00+00:00,067XX N LEOTI AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1621,16,41,12,11,1135928.0,1944356.0,2016,2018-02-10 15:50:01+00:00,42.00349,-87.77528,"(42.003489863, -87.775279909)"
116,10777217,HZ543475,2016-10-26 19:40:00+00:00,057XX N AVONDALE AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,1612,16,41,10,11,1132268.0,1937834.0,2016,2018-02-10 15:50:01+00:00,41.985658,-87.788898,"(41.985657601, -87.788897731)"
187,10453593,HZ191631,2016-03-09 19:50:00+00:00,060XX N NAVARRE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1611,16,41,10,11,1130431.0,1940062.0,2016,2018-02-10 15:50:01+00:00,41.991803,-87.795603,"(41.991803284, -87.795602791)"
196,10556507,HZ302537,2016-06-10 15:15:00+00:00,051XX N NORDICA AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,1613,16,41,10,11,1128618.0,1933802.0,2016,2018-02-10 15:50:01+00:00,41.974656,-87.802414,"(41.974656274, -87.80241441)"


In [224]:
# x_coordinate, y_coordinate, latitude, longitude and location are all null on the same rows; preview a few of them.
df[df.x_coordinate.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
18,12082538,JD269020,2016-10-01 00:01:00+00:00,064XX N NORTHWEST HWY,281,CRIMINAL SEXUAL ASSAULT,NON-AGGRAVATED,APARTMENT,False,True,1612,16,41,9,2,,,2016,2020-06-20 15:51:17+00:00,,,
181,11280270,JB217247,2016-09-01 19:00:00+00:00,097XX S DAMEN AVE,281,CRIM SEXUAL ASSAULT,NON-AGGRAVATED,RESIDENCE-GARAGE,False,False,2213,22,19,72,2,,,2016,2018-04-10 15:52:52+00:00,,,
241,11263003,JB194803,2016-09-19 13:05:00+00:00,097XX S DOBSON AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,511,5,8,50,11,,,2016,2018-03-22 15:53:18+00:00,,,
341,11924141,JC548785,2016-06-01 09:00:00+00:00,095XX S WINCHESTER AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,RESIDENCE,False,False,2213,22,19,72,11,,,2016,2019-12-17 15:55:38+00:00,,,
424,11378015,JB346764,2016-06-02 05:30:00+00:00,025XX N NORDICA AVE,1751,OFFENSE INVOLVING CHILDREN,CRIM SEX ABUSE BY FAM MEMBER,RESIDENCE,False,True,2512,25,36,18,20,,,2016,2018-07-13 15:50:04+00:00,,,


#### Check for redundant data that needs to be resolved

In [225]:
# Great! No duplicate objects: the number of rows that repeat an earlier row in every column is 0.
df.duplicated().sum()

0

### Pipelined

In [226]:
# Work on a copy: the pipeline steps modify their argument in place, and the raw df is kept for comparison.
cp = df.copy()

In [227]:
# Mutates cp in place and returns nothing, so this cell produces no output.
etl_pipeline(cp)

In [228]:
# Same preview as for the raw frame: the derived columns are present and the dropped ones are gone.
cp.head()

Unnamed: 0,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,latitude,longitude,location,month,hour,dayofweek,weekend,fbi_code_description
0,2016-01-23 19:25:00+00:00,021XX S ARCHER AVE,420,BATTERY,AGGRAVATED:KNIFE/CUTTING INSTR,RESTAURANT,False,False,914,9,25,34,04B,1174914.0,1889985.0,2016,41.853507,-87.633485,"(41.853506932, -87.63348466)",1,19,5,True,Assault
1,2016-01-25 05:40:00+00:00,062XX S SAYRE AVE,1020,ARSON,BY FIRE,VEHICLE NON-COMMERCIAL,False,False,812,8,23,64,09,1130811.0,1862162.0,2016,41.778028,-87.795999,"(41.778028051, -87.795999196)",1,5,0,False,Forgery
2,2016-01-26 09:15:00+00:00,100XX S WOODLAWN AVE,545,ASSAULT,PRO EMP HANDS NO/MIN INJURY,"SCHOOL, PUBLIC, BUILDING",False,False,511,5,8,50,08A,1186531.0,1838828.0,2016,41.71286,-87.592465,"(41.7128602, -87.592464611)",1,9,1,False,Unidentified
3,2016-01-27 04:09:00+00:00,108XX S BUFFALO AVE,1477,WEAPONS VIOLATION,RECKLESS FIREARM DISCHARGE,PARK PROPERTY,False,False,432,4,10,52,15,1199874.0,1833672.0,2016,41.698386,-87.543772,"(41.698386241, -87.543772373)",1,4,2,False,Domestic Violence
4,2016-01-29 23:40:00+00:00,134XX S BRANDON AVE,496,BATTERY,AGGRAVATED DOMESTIC BATTERY: KNIFE/CUTTING INST,APARTMENT,False,True,433,4,10,55,04B,1199444.0,1816395.0,2016,41.650987,-87.545925,"(41.650987156, -87.54592485)",1,23,4,False,Assault


In [229]:
# After the pipeline every column should report the same non-null count as the number of entries.
cp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266452 entries, 0 to 269428
Data columns (total 24 columns):
date                    266452 non-null datetime64[ns, UTC]
block                   266452 non-null object
iucr                    266452 non-null object
primary_type            266452 non-null object
description             266452 non-null object
location_description    266452 non-null object
arrest                  266452 non-null bool
domestic                266452 non-null bool
beat                    266452 non-null int64
district                266452 non-null int64
ward                    266452 non-null int64
community_area          266452 non-null int64
fbi_code                266452 non-null object
x_coordinate            266452 non-null float64
y_coordinate            266452 non-null float64
year                    266452 non-null int64
latitude                266452 non-null float64
longitude               266452 non-null float64
location                266452 non-

In [230]:
# TODO: export the cleaned data to a csv