# ETL Pipeline - Working Copy

## Imports

In [1]:
# Std lib:
import os
import warnings
from dotenv import load_dotenv

# Querying data:
from google.cloud import bigquery

# Data manipulation:
import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon

# Visualization:
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline
# Matplotlib 3.6 renamed its bundled 'seaborn' style to 'seaborn-v0_8' and the
# old name was removed in later releases, so use whichever name this install
# provides (newer name first) instead of hard-coding one that may not exist.
for _style_name in ("seaborn-v0_8", "seaborn"):
    if _style_name in style.available:
        style.use(_style_name)
        break

# Display all columns in Jupyter:
from IPython.display import display
pd.options.display.max_columns = None

# Filter Warnings
warnings.filterwarnings('ignore')

## Configure credentials

In [4]:
# Specify the file path of your Google project .json key in
# a local .env file. The .env file is listed in .gitignore, so it stays out of version control.
# An example .env file is included in the repo; edit it and remove the .example extension.
# If you do not have the python-dotenv module, you will need to install it with:
# pip install python-dotenv  (also should be available for conda)

# Load the variables defined in the local .env file into the process environment.
load_dotenv()
# Path to the credentials file, as configured above. NOTE(review): `key` is not
# referenced anywhere else in the visible notebook -- presumably the BigQuery
# client reads GOOGLE_APPLICATION_CREDENTIALS from the environment on its own;
# confirm, and inspect `key` if authentication fails (None means it was not set).
key = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')

## Set up client

In [5]:
# Client used for every BigQuery call in this notebook.
client = bigquery.Client()
# Client.dataset() is deprecated in google-cloud-bigquery; build the reference
# directly (arguments are project first, then dataset id).
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "chicago_crime")
# Fetch the dataset and table objects for the public Chicago crime data.
dataset = client.get_dataset(dataset_ref)
table_ref = dataset_ref.table("crime")
table = client.get_table(table_ref)

## Query the Crime Table

In [6]:
# ENTER YOUR QUERY HERE:

# Upper bound, in GB, on the amount of data the query is allowed to process;
# it is passed as `max_gb` to safe_query_to_dataframe when the query is run.
MAX_GB = 4 # Change this if desired

# Selects every column of every row of the public Chicago crime table.
QUERY = """
SELECT *
FROM `bigquery-public-data.chicago_crime.crime`
"""

In [7]:
def safe_query_to_dataframe(client, table, sql_query, max_gb=0):
    """
    Wrapper function for bigquery.client.query.  Runs a dry run first to learn how
    much data the query would process, and only executes the real query when that
    amount is below the desired limit.

    params
        > client: a bigquery client object
        > table: a bigquery table object (accepted for interface compatibility;
                 it is not used by this function)
        > sql_query (string): an SQL query on the table
        > max_gb (int): GB limit of query.  The comparison is strict, so the
                 default of 0 rejects every query; always pass a positive limit.

    returns
        > Dataframe: Dataframe representation of the query

    raises
        > AssertionError: if the dry run reports a size that is not below max_gb
    """
    # A dry run validates the query and reports the bytes it would process
    # without executing it; the cache is disabled for the estimate.
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    dry_run_job = client.query(sql_query, job_config=job_config)
    gbs_used = dry_run_job.total_bytes_processed / 1e9

    # Raise explicitly rather than via an `assert` statement: asserts are removed
    # when Python runs with -O, which would silently disable this cost guard.
    # The exception type and message are unchanged.
    if not gbs_used < max_gb:
        raise AssertionError(
            f"This query will process {gbs_used} GB, which exceeds your desired limit of {max_gb} GB."
        )

    # Within budget: run the query for real and materialise the result.
    query_job = client.query(sql_query)
    return query_job.to_dataframe()

In [8]:
df = safe_query_to_dataframe(client=client, table=table, sql_query=QUERY, max_gb=MAX_GB)

## Pipeline

In [9]:
# We can drop objects (i.e., rows) that contain any null value because the dataset is huge, with millions of rows
def remove_null_objects(df):
    """
    Drop, in place, every row of ``df`` that holds at least one null value.

    Nothing is returned; the caller's dataframe itself is modified.
    (The original author noted that fewer than 3000 rows are dropped on the
    crime data -- not verified here.)
    """
    df.dropna(axis=0, how="any", inplace=True)
    
    
def add_dt_attributes(df):
    """
    Derive calendar columns from ``df.date`` so later queries are simpler.

    Adds, in place, the columns ``month``, ``hour`` and ``dayofweek`` (each
    downcast to the smallest unsigned integer type that fits) and the boolean
    ``weekend`` (True where dayofweek is 5 or 6).  Nothing is returned.
    """
    stamps = df["date"].dt
    for part in ("month", "hour", "dayofweek"):
        df[part] = pd.to_numeric(getattr(stamps, part), downcast="unsigned")
    df["weekend"] = stamps.dayofweek.ge(5)
    

def remove_irrelevant_attributes(df):
    """
    Drop, in place, the columns that are not needed for EDA or data mining:
    ``unique_key``, ``case_number`` and ``updated_on``.

    Nothing is returned; the caller's dataframe itself is modified.
    """
    unneeded = ["unique_key", "case_number", "updated_on"]
    df.drop(labels=unneeded, axis="columns", inplace=True)
    

def convert_coords_to_geometry(df):
    """
    Converts lat/lon attributes to more usable geometry objects.

    A ``geometry`` column holding one Point per row, built from
    (longitude, latitude), is added to ``df`` in place, exactly as before.

    returns
        > GeoDataFrame: ``df`` wrapped as a geopandas.GeoDataFrame that uses the
          new ``geometry`` column.  Previously this object was bound to a local
          name and thrown away, so callers had no way to obtain it; callers that
          ignore the return value behave exactly as they did before.
    """
    # Points take (x, y), i.e. (longitude, latitude).
    df['geometry'] = list(zip(df.longitude, df.latitude))
    df['geometry'] = df['geometry'].apply(Point)
    # Rebinding the parameter would be invisible to the caller, so hand the
    # GeoDataFrame back instead of discarding it.
    return geopandas.GeoDataFrame(df, geometry='geometry')
    
    

def join_with_econ(df):
    """
    Merge the crime data with the revised CFNAI economic data.

    Reads ../data/cfnai/cfnai-realtime-revised.xlsx, replaces its ``Month``
    datetime column with numeric ``month`` and ``year`` columns, and merges the
    result onto ``df`` using pandas' default merge settings (an inner join on
    the columns the two frames have in common).

    params
        > df: crime dataframe; presumably it carries ``month`` and ``year``
              columns to match on -- confirm against the caller

    returns
        > Dataframe: a new merged dataframe (``df`` itself is not modified)
    """
    indicators = pd.read_excel("../data/cfnai/cfnai-realtime-revised.xlsx")
    period = indicators["Month"]
    indicators = indicators.drop(columns="Month").assign(
        month=pd.to_numeric(period.dt.month, downcast="unsigned"),
        year=period.dt.year,
    )
    # merge() always builds a new frame, so the caller has to rebind the result.
    return pd.merge(left=df, right=indicators)

def join_with_crimetype(df):
    """
    Merge the crime data with the FBI crime-type lookup table.

    Reads ../data/fbi_crime_types/crimetype_SRS.xlsx and merges it onto ``df``
    using pandas' default merge settings (an inner join on the columns the two
    frames have in common -- presumably the FBI code; confirm against the file).

    returns
        > Dataframe: a new merged dataframe (``df`` itself is not modified)
    """
    crime_types = pd.read_excel("../data/fbi_crime_types/crimetype_SRS.xlsx")
    merged = pd.merge(left=df, right=crime_types)
    return merged


def etl_pipeline(df):
    """
    Run the complete ETL pipeline over a crime dataframe.

    The first group of steps modifies ``df`` in place (whatever they return is
    ignored); the join steps each produce a new dataframe, so the value returned
    here is a different object from the one passed in.  Pass a copy if the
    original frame must stay untouched.

    Add further steps to the appropriate group below.
    """
    # Steps that work directly on the frame they are given.
    in_place_steps = (
        remove_null_objects,
        add_dt_attributes,
        remove_irrelevant_attributes,
        convert_coords_to_geometry,
    )
    for step in in_place_steps:
        step(df)

    # Steps that hand back a new frame, which replaces the working one.
    for join in (join_with_econ, join_with_crimetype):
        df = join(df)

    return df

## Check out the Results

### Original

In [10]:
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,7924949,HT155305,2011-02-10 10:20:00+00:00,0000X E 103RD ST,031A,ROBBERY,ARMED: HANDGUN,SMALL RETAIL STORE,False,False,511,5.0,9.0,49.0,3,1178792.0,1836701.0,2011,2018-02-10 15:50:01+00:00,41.707203,-87.620872,"(41.707202662, -87.620871555)"
1,10501136,HZ242137,2016-04-27 12:40:00+00:00,0000X W 95TH ST,031A,ROBBERY,ARMED: HANDGUN,STREET,False,False,511,5.0,21.0,49.0,3,1177523.0,1841973.0,2016,2018-02-10 15:50:01+00:00,41.721698,-87.62536,"(41.72169846, -87.625359863)"
2,7017817,HR425842,2009-07-12 23:30:00+00:00,112XX S HALSTED ST,031A,ROBBERY,ARMED: HANDGUN,SIDEWALK,False,False,2233,22.0,34.0,49.0,3,1172995.0,1830219.0,2009,2018-02-28 15:56:25+00:00,41.689545,-87.642291,"(41.689544717, -87.642290571)"
3,10298425,HY486730,2015-11-02 18:54:00+00:00,001XX W 95TH ST,031A,ROBBERY,ARMED: HANDGUN,RESIDENCE,False,False,511,5.0,21.0,49.0,3,1176832.0,1841957.0,2015,2018-02-10 15:50:01+00:00,41.72167,-87.627891,"(41.72167012, -87.627891343)"
4,11392468,JB365556,2018-07-26 01:41:00+00:00,105XX S WABASH AVE,031A,ROBBERY,ARMED: HANDGUN,DRIVEWAY - RESIDENTIAL,False,False,512,5.0,9.0,49.0,3,1178441.0,1835069.0,2018,2018-08-02 15:53:00+00:00,41.702732,-87.622206,"(41.70273219, -87.622206261)"


#### Check for missing data that needs to be resolved

In [11]:
# There are null values in location_description, x_coordinate, y_coordinate, latitude, longitude and location
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7227300 entries, 0 to 7227299
Data columns (total 22 columns):
 #   Column                Dtype              
---  ------                -----              
 0   unique_key            int64              
 1   case_number           object             
 2   date                  datetime64[ns, UTC]
 3   block                 object             
 4   iucr                  object             
 5   primary_type          object             
 6   description           object             
 7   location_description  object             
 8   arrest                bool               
 9   domestic              bool               
 10  beat                  int64              
 11  district              float64            
 12  ward                  float64            
 13  community_area        float64            
 14  fbi_code              object             
 15  x_coordinate          float64            
 16  y_coordinate          float64       

In [12]:
# A lot of unknown location_descriptions
df[df.location_description.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
377,11953655,JD117234,2020-01-15 19:50:00+00:00,010XX E 93RD ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,413,4.0,8.0,47.0,11,1184943.0,1843530.0,2020,2020-01-22 15:52:29+00:00,41.7258,-87.598133,"(41.725800384, -87.598133258)"
519,10509986,HZ250693,2016-05-02 17:00:00+00:00,109XX S DOTY AVE W,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,513,5.0,9.0,50.0,11,1185440.0,1832668.0,2016,2018-02-10 15:50:01+00:00,41.695982,-87.596653,"(41.695982095, -87.596653142)"
687,10890757,JA201378,2011-03-01 22:45:00+00:00,011XX E 87TH ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,412,4.0,8.0,47.0,11,,,2011,2017-03-27 15:52:22+00:00,,,
745,12066989,JD250755,2020-05-30 14:30:00+00:00,059XX W TOUHY AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1621,16.0,41.0,12.0,11,1136009.0,1947445.0,2020,2020-06-06 15:50:25+00:00,42.011965,-87.774908,"(42.011964859, -87.774907779)"
746,12068098,JD252011,2020-06-02 22:45:00+00:00,025XX E 91ST ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,423,4.0,7.0,48.0,11,1194880.0,1845125.0,2020,2020-06-09 15:49:44+00:00,41.729939,-87.561682,"(41.72993855, -87.561681705)"


In [13]:
# x_coordinate, y_coordinate, latitude, longitude and location have null values all on same objects
df[df.x_coordinate.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
375,4486855,HL788142,2005-12-14 00:22:00+00:00,037XX W 58TH PL,1305,CRIMINAL DAMAGE,CRIMINAL DEFACEMENT,RESIDENCE,True,False,822,8.0,14.0,62.0,14,,,2005,2015-08-17 15:03:40+00:00,,,
423,11246127,JB172644,2015-12-21 00:00:00+00:00,075XX W TOUHY AVE,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,False,1611,16.0,41.0,9.0,26,,,2015,2018-03-04 15:57:39+00:00,,,
424,11608016,JC167065,2018-10-25 09:00:00+00:00,050XX S LAWNDALE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,821,8.0,14.0,62.0,11,,,2018,2019-02-27 16:21:40+00:00,,,
425,12207677,JD415958,2020-10-30 21:00:00+00:00,051XX S WENTWORTH AVE,1350,CRIMINAL TRESPASS,TO STATE SUP LAND,GOVERNMENT BUILDING / PROPERTY,True,False,225,2.0,3.0,37.0,26,,,2020,2020-11-06 15:50:21+00:00,,,
510,11653184,JC221949,2018-10-18 14:11:00+00:00,049XX W 64TH ST,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,False,813,8.0,13.0,64.0,26,,,2018,2019-04-13 16:01:56+00:00,,,


#### Check for redundant data that needs to be resolved

In [14]:
# Great! No duplicate objects
df.duplicated().sum()

0

### Pipelined

In [15]:
cp = df.copy()

In [16]:
cp = etl_pipeline(cp);

In [17]:
cp.head()

Unnamed: 0,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,latitude,longitude,location,month,hour,dayofweek,weekend,geometry,CF,CF3,PI,EUH,CH,SOI,crime_code_category,index_crime,violent_crime,property_crime,crime_against_persons,crime_against_property,crime_against_society
0,2011-02-10 10:20:00+00:00,0000X E 103RD ST,031A,ROBBERY,ARMED: HANDGUN,SMALL RETAIL STORE,False,False,511,5.0,9.0,49.0,3,1178792.0,1836701.0,2011,41.707203,-87.620872,"(41.707202662, -87.620871555)",2,10,3,False,POINT (-87.62087 41.70720),-0.395995,-0.075168,-0.045717,0.188386,-0.250087,-0.288577,Robbery,True,True,False,False,True,False
1,2011-02-11 09:55:00+00:00,059XX S KILPATRICK AVE,031A,ROBBERY,ARMED: HANDGUN,STREET,False,False,813,8.0,13.0,64.0,3,1146129.0,1864908.0,2011,41.785287,-87.739772,"(41.78528715, -87.739772346)",2,9,4,False,POINT (-87.73977 41.78529),-0.395995,-0.075168,-0.045717,0.188386,-0.250087,-0.288577,Robbery,True,True,False,False,True,False
2,2011-02-18 05:40:00+00:00,042XX S WENTWORTH AVE,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,False,935,9.0,3.0,37.0,3,1175638.0,1876724.0,2011,41.817101,-87.631225,"(41.817101454, -87.631225183)",2,5,4,False,POINT (-87.63123 41.81710),-0.395995,-0.075168,-0.045717,0.188386,-0.250087,-0.288577,Robbery,True,True,False,False,True,False
3,2011-02-08 16:05:00+00:00,036XX S WELLS ST,0320,ROBBERY,STRONGARM - NO WEAPON,STREET,False,False,925,9.0,11.0,34.0,3,1175213.0,1880919.0,2011,41.828622,-87.632659,"(41.82862242, -87.632658693)",2,16,1,False,POINT (-87.63266 41.82862),-0.395995,-0.075168,-0.045717,0.188386,-0.250087,-0.288577,Robbery,True,True,False,False,True,False
4,2011-02-08 08:45:00+00:00,002XX W 35TH ST,0320,ROBBERY,STRONGARM - NO WEAPON,CTA PLATFORM,False,False,924,9.0,11.0,34.0,3,1175346.0,1881761.0,2011,41.83093,-87.632146,"(41.830929961, -87.632145515)",2,8,1,False,POINT (-87.63215 41.83093),-0.395995,-0.075168,-0.045717,0.188386,-0.250087,-0.288577,Robbery,True,True,False,False,True,False


In [18]:
cp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6508417 entries, 0 to 6508416
Data columns (total 37 columns):
 #   Column                  Dtype              
---  ------                  -----              
 0   date                    datetime64[ns, UTC]
 1   block                   object             
 2   iucr                    object             
 3   primary_type            object             
 4   description             object             
 5   location_description    object             
 6   arrest                  bool               
 7   domestic                bool               
 8   beat                    int64              
 9   district                float64            
 10  ward                    float64            
 11  community_area          float64            
 12  fbi_code                object             
 13  x_coordinate            float64            
 14  y_coordinate            float64            
 15  year                    int64              
 16  

In [40]:
# Get the external file path for the cleaned data from the CLEAN_DATA environment
# variable (presumably defined in the same .env file as the credentials -- confirm).
# os.environ.get returns None when the variable is unset, and DataFrame.to_csv(None)
# returns the CSV text as a string instead of writing a file, so make sure
# CLEAN_DATA is set before running the export below.
path_to_data = os.environ.get('CLEAN_DATA')

# Export cleaned data - Uncomment to run, will overwrite existing file. 
# cp.to_csv(path_to_data, index=False)