# ETL Pipeline - Working Copy

## Imports

In [3]:
# Std lib:
import os
import warnings
from dotenv import load_dotenv

# Querying data:
from google.cloud import bigquery

# Data manipulation:
import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon

# Visualization:
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline
style.use('seaborn')

# Display all columns in Jupyter:
from IPython.display import display
pd.options.display.max_columns = None

# Filter Warnings
warnings.filterwarnings('ignore')

## Configure credentials

In [4]:
# Specify the file path location of your Google project .json key in
# a local .env file (presumably under the GOOGLE_APPLICATION_CREDENTIALS
# variable that is read below).
# The .env file is meant to be kept out of version control via the .gitignore file.
# An example .env file is included in the repo: make your edits and remove the .example file extension.
# If you do not have the python-dotenv module, you will need to install it with:
# pip install python-dotenv  (also should be available for conda)

# Load the variables defined in the .env file into the process environment.
load_dotenv()
# NOTE(review): `key` is not referenced again in the cells shown in this notebook.
key = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')

## Set up client

In [5]:
# Create the BigQuery client with its default configuration.
client = bigquery.Client()

# Client.dataset() is deprecated in google-cloud-bigquery, so the dataset
# reference is constructed directly (arguments are project, then dataset id).
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "chicago_crime")
dataset = client.get_dataset(dataset_ref)

# Reference to, and metadata for, the `crime` table inside that dataset.
table_ref = dataset_ref.table("crime")
table = client.get_table(table_ref)

## Query the Crime Table

In [6]:
# ENTER YOUR QUERY HERE:

# Limit, in GB, on the amount of data the query may process. It is passed as
# `max_gb` to safe_query_to_dataframe further down.
MAX_GB = 4 # Change this if desired

# Selects every column of the crime table for the years 2017 through 2020.
QUERY = """
SELECT *
FROM `bigquery-public-data.chicago_crime.crime`
WHERE year in (2017, 2018, 2019, 2020)
"""

In [7]:
def safe_query_to_dataframe(client, table, sql_query, max_gb=0):
    """
    Wrapper function for bigquery.client.query.  Will throw an error if the query exceeds the desired limit.

    The query is first submitted as a dry run (with the query cache disabled) to
    learn how many bytes it would process.  Only when that estimate is below the
    limit is the query submitted for real and converted to a dataframe.

    params
        > client: a bigquery client object
        > table: a bigquery table object (not used by this function; kept so existing calls keep working)
        > sql_query (string): an SQL query on the table
        > max_gb (int): GB limit of query; with the default of 0 every query is rejected

    returns
        > Dataframe: Dataframe representation of the query

    raises
        > AssertionError: if the dry-run estimate is not below max_gb
    """
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    dry_run_job = client.query(sql_query, job_config=dry_run_config)
    gbs_used = dry_run_job.total_bytes_processed / 1e9

    # Raise explicitly instead of using an `assert` statement: asserts are removed
    # when Python runs with -O, which would silently disable this cost guard.
    # The exception type is kept so existing handlers continue to work.
    if not (gbs_used < max_gb):
        raise AssertionError(
            f"This query will process {gbs_used} GB, which exceeds your desired limit of {max_gb} GB."
        )

    return client.query(sql_query).to_dataframe()

In [8]:
# Run the size-checked query. An AssertionError here means the dry-run estimate was not below MAX_GB.
df = safe_query_to_dataframe(client=client, table=table, sql_query=QUERY, max_gb=MAX_GB)

## Pipeline

In [10]:
# We can remove objects (i.e. rows) with any null values, since we have a huge dataset (close to a million rows for this query)
def remove_null_objects(df):
    """
    Drops every row that contains at least one null value.

    The dataframe is modified in place; nothing is returned.
    """
    df.dropna(axis="index", how="any", inplace=True)
    
    
def add_dt_attributes(df):
    """
    Derives calendar attributes from the `date` column for easier querying.

    Adds, in place, the columns `month`, `hour` and `dayofweek` (each downcast
    to the smallest unsigned integer type) and the boolean column `weekend`,
    which is True where dayofweek is 5 or 6.
    """
    timestamps = df["date"].dt
    weekday = timestamps.dayofweek

    numeric_parts = (
        ("month", timestamps.month),
        ("hour", timestamps.hour),
        ("dayofweek", weekday),
    )
    for column, values in numeric_parts:
        df[column] = pd.to_numeric(values, downcast="unsigned")

    df["weekend"] = weekday >= 5
    

def remove_irrelevant_attributes(df):
    """
    Drops the columns that are not needed for EDA or data mining.

    Removes `unique_key`, `case_number` and `updated_on` from the dataframe
    in place; nothing is returned.
    """
    unneeded = ["unique_key", "case_number", "updated_on"]
    df.drop(labels=unneeded, axis="columns", inplace=True)
    

def add_fbi_code_description(df):
    """
    Add fbi_code_description attribute for details and querying

    The first two characters of `fbi_code` are looked up in a table of
    descriptions.  Codes with no entry in the table, as well as missing codes,
    are labelled "Unidentified".  The new column is stored as a categorical and
    is added to the dataframe in place; nothing is returned.
    """
    # NOTE(review): these labels do not appear to match the offences in the data
    # (in the sample output, ARSON rows with code 09 come out as "Forgery" and
    # WEAPONS VIOLATION rows with code 15 as "Domestic Violence").  The strings are
    # kept as they were because downstream analysis may filter on them -- confirm
    # the intended code list before relying on this column.
    descriptions_by_prefix = {
        "01": "Murder",
        "02": "Rape",
        "03": "Robbery",
        "04": "Assault",
        "05": "Burglary",
        "06": "Theft",
        "07": "Auto Theft",
        "09": "Forgery",
        "10": "Fraud/Embezzlement",
        "11": "Shots Fired",
        "13": "Prostitution",
        "14": "Indecent Exposure",
        "15": "Domestic Violence",
        "16": "Narcotics",
        "17": "Violation of liquor laws",
        "18": "Intoxicated subject",
        "19": "Disorderly conduct",
        "20": "Vagrants",
        "21": "Gambling",
        "22": "DWI",
        "23": "Reckless driving",
        "24": "Suspicious vehicle/person",
        "26": "All others: arson/vandalism",
        "27": "Are you in trouble",
        "28": "Frequent patrol",
        "29": "Dog bite",
        "30": "Suicide",
        "31": "Miscellaneous deaths",
        "32": "Nature call",
        "33": "Welfare check",
        "34": "Affray in progress",
        "35": "Alarm call",
        "40": "En-route to district",
        "41": "En-route to home",
    }

    # Every key is exactly two characters, so a lookup on the two-character
    # prefix is equivalent to the former chain of startswith() tests.
    prefixes = df["fbi_code"].str[:2]

    # Unmapped prefixes and missing codes both come back as NaN from map().
    # With the nested np.where chain a missing code produced a NaN condition,
    # which counts as true, so such rows were labelled "Murder".
    descriptions = prefixes.map(descriptions_by_prefix).fillna("Unidentified")
    df["fbi_code_description"] = descriptions.astype("category")
                                                                     

def convert_coords_to_geometry(df):
    """
    Converts lat/lon attributes to more usable geometry objects

    Adds a `geometry` column of Point objects, each built from a
    (longitude, latitude) pair, to the dataframe in place.

    returns
        > GeoDataFrame: built from df with `geometry` as its geometry column.
          Earlier versions created this object but discarded it, so the
          conversion had no effect beyond the added column.  Callers that
          ignore the return value behave exactly as before.
    """
    # Pair the coordinates as (x, y) = (longitude, latitude), then turn each pair into a Point.
    df['geometry'] = list(zip(df.longitude, df.latitude))
    df['geometry'] = df['geometry'].apply(Point)
    return geopandas.GeoDataFrame(df, geometry='geometry')
    
    

def join_with_econ(df, econ_path="../data/cfnai/cfnai-realtime-revised.xlsx"):
    """
    Joins crime data with revised_econ data

    The workbook's `Month` column is split into numeric `month` and `year`
    columns, and the remaining columns are inner-joined onto df by year and
    month.  Rows of df whose year/month has no match in the workbook are
    therefore absent from the result.

    params
        > df: dataframe with `year` and `month` columns
        > econ_path (string): location of the workbook; defaults to the path
          that was previously hard-coded

    returns
        > Dataframe: a new dataframe; df itself is not modified
    """
    econ_df = pd.read_excel(econ_path)
    econ_df['month'] = pd.to_numeric(econ_df.Month.dt.month, downcast="unsigned")
    econ_df['year'] = econ_df.Month.dt.year
    econ_df = econ_df.drop(columns="Month")

    # Name the join keys explicitly.  Without `on`, merge joins on every column
    # name the two frames share, so an unrelated shared name would silently
    # become part of the key.
    return pd.merge(df, econ_df, on=["year", "month"])


def etl_pipeline(df):
    """
    The complete pipeline.

    Applies each cleaning/enrichment step to the inputted dataframe, in order,
    and then returns the result of joining it with the economic data.
    Add further steps here to keep shaping the data into something we can work with.
    """
    in_place_steps = (
        remove_null_objects,
        add_dt_attributes,
        remove_irrelevant_attributes,
        add_fbi_code_description,
        convert_coords_to_geometry,
    )
    for step in in_place_steps:
        step(df)

    # The join produces a new dataframe, so its result is what gets handed back.
    return join_with_econ(df)

## Check out the Results

### Original

In [11]:
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,11216869,JB125823,2017-01-01 00:01:00+00:00,133XX S AVENUE N,1754,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,False,False,433,4.0,10.0,55,02,,,2017,2018-01-29 15:56:21+00:00,,,
1,11715028,JC291189,2019-06-04 02:06:00+00:00,058XX W HIGGINS AVE,1020,ARSON,BY FIRE,VEHICLE NON-COMMERCIAL,False,False,1622,16.0,45.0,11,09,1136078.0,1932995.0,2019,2019-08-17 15:57:45+00:00,41.972312,-87.775001,"(41.972311647, -87.77500054)"
2,11835381,JC441615,2019-09-14 13:00:00+00:00,096XX S WOODLAWN AVE,1750,OFFENSE INVOLVING CHILDREN,CHILD ABUSE,RESIDENCE,False,True,511,5.0,8.0,50,08B,1185895.0,1841338.0,2019,2019-09-25 15:50:06+00:00,41.719763,-87.594715,"(41.719762924, -87.594714959)"
3,11900491,JC521476,2019-11-23 11:26:00+00:00,058XX W CATALPA AVE,1570,SEX OFFENSE,PUBLIC INDECENCY,RESIDENTIAL YARD (FRONT/BACK),False,False,1622,16.0,45.0,11,17,1136434.0,1936203.0,2019,2019-11-30 15:53:03+00:00,41.981108,-87.773614,"(41.981108308, -87.773614366)"
4,11909810,JC532857,2019-12-03 10:17:00+00:00,026XX N OAK PARK AVE,5000,OTHER OFFENSE,OTHER CRIME AGAINST PERSON,RESIDENCE,False,False,2512,25.0,36.0,18,26,1130641.0,1916715.0,2019,2019-12-10 15:54:50+00:00,41.927733,-87.795369,"(41.927733013, -87.795369474)"


#### Check for missing data to resolve them

In [12]:
# There are null values in location_description, x_coordinate, y_coordinate, latitude, longitude and location
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974710 entries, 0 to 974709
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   unique_key            974710 non-null  int64              
 1   case_number           974710 non-null  object             
 2   date                  974710 non-null  datetime64[ns, UTC]
 3   block                 974710 non-null  object             
 4   iucr                  974710 non-null  object             
 5   primary_type          974710 non-null  object             
 6   description           974710 non-null  object             
 7   location_description  970430 non-null  object             
 8   arrest                974710 non-null  bool               
 9   domestic              974710 non-null  bool               
 10  beat                  974710 non-null  int64              
 11  district              974709 non-null  float64      

In [13]:
# A lot of unknown location_descriptions
df[df.location_description.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
7,11953655,JD117234,2020-01-15 19:50:00+00:00,010XX E 93RD ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,413,4.0,8.0,47,11,1184943.0,1843530.0,2020,2020-01-22 15:52:29+00:00,41.7258,-87.598133,"(41.725800384, -87.598133258)"
11,11983041,JD153707,2018-04-03 15:45:00+00:00,036XX S LAKE PARK AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,212,2.0,4.0,36,11,,,2018,2020-02-17 15:47:53+00:00,,,
48,11024187,JA354113,2017-07-11 00:00:00+00:00,074XX W HORTENSE AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,1612,16.0,41.0,10,11,1125798.0,1941728.0,2017,2018-02-10 15:50:01+00:00,41.996454,-87.812607,"(41.996453514, -87.812607444)"
54,11095247,JA440362,2017-09-19 22:30:00+00:00,016XX E 84TH ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,412,4.0,8.0,45,11,1188616.0,1849610.0,2017,2018-02-10 15:50:01+00:00,41.742398,-87.584485,"(41.742397647, -87.584485252)"
103,11779127,JC373692,2019-07-30 15:00:00+00:00,021XX W 32nd St,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,912,9.0,12.0,59,11,1162781.0,1883439.0,2019,2019-08-06 16:17:28+00:00,41.835807,-87.6782,"(41.835806759, -87.678200257)"


In [14]:
# x_coordinate, y_coordinate, latitude, longitude and location have null values all on same objects
df[df.x_coordinate.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,11216869,JB125823,2017-01-01 00:01:00+00:00,133XX S AVENUE N,1754,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,False,False,433,4.0,10.0,55,2,,,2017,2018-01-29 15:56:21+00:00,,,
11,11983041,JD153707,2018-04-03 15:45:00+00:00,036XX S LAKE PARK AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,212,2.0,4.0,36,11,,,2018,2020-02-17 15:47:53+00:00,,,
32,12199828,JD406835,2020-10-21 21:12:00+00:00,017XX E 87TH ST,1812,NARCOTICS,POSSESS - CANNABIS MORE THAN 30 GRAMS,STREET,True,False,412,4.0,8.0,48,18,,,2020,2020-10-28 15:58:23+00:00,,,
33,12212824,JD421525,2020-11-03 05:30:00+00:00,019XX W 35TH ST,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,LIBRARY,False,False,912,9.0,12.0,59,17,,,2020,2020-11-10 15:54:45+00:00,,,
97,11640612,JC206563,2018-06-01 00:01:00+00:00,067XX W RASCHER AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,1613,16.0,41.0,10,11,,,2018,2019-04-01 16:21:13+00:00,,,


#### Check for redundant data to resolve them

In [15]:
# Great! No duplicate objects
df.duplicated().sum()

0

### Pipelined

In [16]:
# Work on a copy: the pipeline steps modify their input in place, and `df` should keep the raw query result.
cp = df.copy()

In [17]:
# Rebind cp to the return value: the final join step returns a new dataframe rather than modifying cp.
cp = etl_pipeline(cp);

In [18]:
cp.head()

Unnamed: 0,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,latitude,longitude,location,month,hour,dayofweek,weekend,fbi_code_description,geometry,CF,CF3,PI,EUH,CH,SOI
0,2019-06-04 02:06:00+00:00,058XX W HIGGINS AVE,1020,ARSON,BY FIRE,VEHICLE NON-COMMERCIAL,False,False,1622,16.0,45.0,11,9,1136078.0,1932995.0,2019,41.972312,-87.775001,"(41.972311647, -87.77500054)",6,2,1,False,Forgery,POINT (-87.77500 41.97231),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
1,2019-06-02 00:36:00+00:00,084XX S STONY ISLAND AVE,3731,INTERFERENCE WITH PUBLIC OFFICER,OBSTRUCTING IDENTIFICATION,STREET,True,False,412,4.0,8.0,45,24,1188267.0,1849517.0,2019,41.742151,-87.585767,"(41.742150773, -87.585766931)",6,0,6,True,Suspicious vehicle/person,POINT (-87.58577 41.74215),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
2,2019-06-13 22:50:00+00:00,052XX S KOLIN AVE,141A,WEAPONS VIOLATION,UNLAWFUL USE HANDGUN,RESIDENTIAL YARD (FRONT/BACK),False,False,815,8.0,23.0,57,15,1148254.0,1869425.0,2019,41.797642,-87.731865,"(41.797641987, -87.731865163)",6,22,3,False,Domestic Violence,POINT (-87.73187 41.79764),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
3,2019-06-04 10:00:00+00:00,056XX N KERBS AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,1711,17.0,39.0,12,11,1144177.0,1937547.0,2019,41.984654,-87.745104,"(41.984653967, -87.745103783)",6,10,1,False,Shots Fired,POINT (-87.74510 41.98465),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077
4,2019-06-16 17:55:00+00:00,054XX S PULASKI RD,1206,DECEPTIVE PRACTICE,"THEFT BY LESSEE,MOTOR VEH",STREET,True,False,815,8.0,23.0,62,11,1150625.0,1868330.0,2019,41.794591,-87.723199,"(41.794591289, -87.723198732)",6,17,6,True,Shots Fired,POINT (-87.72320 41.79459),0.149051,-0.173627,0.025143,0.018186,-0.021355,0.127077


In [18]:
cp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 923983 entries, 0 to 923982
Data columns (total 31 columns):
date                    923983 non-null datetime64[ns, UTC]
block                   923983 non-null object
iucr                    923983 non-null object
primary_type            923983 non-null object
description             923983 non-null object
location_description    923983 non-null object
arrest                  923983 non-null bool
domestic                923983 non-null bool
beat                    923983 non-null int64
district                923983 non-null float64
ward                    923983 non-null float64
community_area          923983 non-null int64
fbi_code                923983 non-null object
x_coordinate            923983 non-null float64
y_coordinate            923983 non-null float64
year                    923983 non-null int64
latitude                923983 non-null float64
longitude               923983 non-null float64
location                923983 

In [19]:
# Export cleaned data
# cp.to_csv('../data/cleaned_data.csv', index=False) # Uncommenting and running this will overwrite the existing ../data/cleaned_data.csv export