# ETL Pipeline - Working Copy

## Imports

In [3]:
# Std lib:
import os
import warnings

# Querying data:
from google.cloud import bigquery

# Data manipulation:
import numpy as np
import pandas as pd
import geopandas
from shapely.geometry import Point, Polygon

# Visualization:
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline
style.use('seaborn')

# Display all columns in Jupyter:
from IPython.display import display
pd.options.display.max_columns = None

# Filter Warnings
warnings.filterwarnings('ignore')

## Configure credentials

In [4]:
# Path to the Google service-account key file. Do not commit a real path:
# export GOOGLE_APPLICATION_CREDENTIALS before launching Jupyter and it is
# picked up here automatically. To override locally, assign the path to
# key_location instead (and remove it again before committing).
key_location = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', '')
if key_location:
    # Only export a non-empty value so an unset variable is left untouched.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_location

## Set up client

In [5]:
# Client picks up credentials from the environment configured above.
client = bigquery.Client()

# Build the dataset reference directly: Client.dataset() is deprecated in
# google-cloud-bigquery in favour of the DatasetReference constructor
# (arguments are project first, then dataset id).
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "chicago_crime")
dataset = client.get_dataset(dataset_ref)

# Resolve the `crime` table inside that dataset.
table_ref = dataset_ref.table("crime")
table = client.get_table(table_ref)

## Query the Crime Table

In [6]:
# Edit QUERY to change which rows are pulled from the public crime table.
QUERY = """
SELECT *
FROM `bigquery-public-data.chicago_crime.crime`
WHERE year in (2017, 2018, 2019, 2020)
"""

# Limit, in GB, passed as max_gb to the query wrapper (change if desired).
MAX_GB = 4

In [7]:
def safe_query_to_dataframe(client, table, sql_query, max_gb=0):
    """
    Wrapper function for bigquery.client.query that refuses to run a query
    whose estimated size reaches the desired limit.

    The query is first submitted as a dry run to obtain the number of bytes
    it would process; only if that estimate is below the limit is the query
    actually executed.

    params
        > client: a bigquery client object
        > table: a bigquery table object (not used by this function; kept so
          existing callers keep working)
        > sql_query (string): an SQL query on the table
        > max_gb (int): GB limit of query.  The default of 0 rejects every
          query, so callers must pass a positive limit.

    returns
        > Dataframe: Dataframe representation of the query

    raises
        > AssertionError: if the estimated size is not below max_gb
    """
    # Dry run with the cache disabled to get the size estimate for the query.
    dry_run_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    estimate_job = client.query(sql_query, job_config=dry_run_config)
    gbs_used = estimate_job.total_bytes_processed / 1e9

    # Raise explicitly rather than via `assert`, so the guard is still
    # enforced when Python runs with assertions disabled (-O).
    if not gbs_used < max_gb:
        raise AssertionError(
            f"This query will process {gbs_used} GB, which exceeds your desired limit of {max_gb} GB."
        )

    return client.query(sql_query).to_dataframe()

In [8]:
# Run QUERY through the size-checked wrapper, passing MAX_GB as the limit;
# the returned DataFrame is the input to the pipeline below.
df = safe_query_to_dataframe(client=client, table=table, sql_query=QUERY, max_gb=MAX_GB)

## Pipeline

In [9]:
# Rows (objects) with missing values are simply discarded rather than imputed.
def remove_null_objects(df):
    """
    Drops, in place, every row that has a null in at least one column.

    Mutates the DataFrame that is passed in and returns None.
    """
    df.dropna(axis=0, how="any", inplace=True)
    
    
def add_dt_attributes(df):
    """
    Derives calendar columns from the `date` column, in place.

    Adds `month`, `hour` and `dayofweek` (each downcast to the smallest
    unsigned integer type) and a boolean `weekend` flag that is True when
    `dayofweek` is 5 or 6.  Returns None.
    """
    stamps = df["date"].dt
    for part in ("month", "hour", "dayofweek"):
        df[part] = pd.to_numeric(getattr(stamps, part), downcast="unsigned")
    df["weekend"] = stamps.dayofweek >= 5
    

def remove_irrelevant_attributes(df):
    """
    Drops, in place, the columns that are not needed for EDA or data mining.

    Returns None.
    """
    unused_columns = ["unique_key", "case_number", "updated_on"]
    df.drop(unused_columns, axis="columns", inplace=True)
    

# Two-character fbi_code prefix -> description.
# NOTE(review): these labels are carried over unchanged, but they look
# inconsistent with this dataset -- e.g. code "11" appears on DECEPTIVE
# PRACTICE / identity-theft rows yet maps to "Shots Fired".  Verify the table
# against the Chicago Police FBI-code definitions before relying on the column.
_FBI_CODE_DESCRIPTIONS = {
    "01": "Murder",
    "02": "Rape",
    "03": "Robbery",
    "04": "Assault",
    "05": "Burglary",
    "06": "Theft",
    "07": "Auto Theft",
    "09": "Forgery",
    "10": "Fraud/Embezzlement",
    "11": "Shots Fired",
    "13": "Prostitution",
    "14": "Indecent Exposure",
    "15": "Domestic Violence",
    "16": "Narcotics",
    "17": "Violation of liquor laws",
    "18": "Intoxicated subject",
    "19": "Disorderly conduct",
    "20": "Vagrants",
    "21": "Gambling",
    "22": "DWI",
    "23": "Reckless driving",
    "24": "Suspicious vehicle/person",
    "26": "All others: arson/vandalism",
    "27": "Are you in trouble",
    "28": "Frequent patrol",
    "29": "Dog bite",
    "30": "Suicide",
    "31": "Miscellaneous deaths",
    "32": "Nature call",
    "33": "Welfare check",
    "34": "Affray in progress",
    "35": "Alarm call",
    "40": "En-route to district",
    "41": "En-route to home",
}


def add_fbi_code_description(df):
    """
    Add fbi_code_description attribute for details and querying.

    The description is looked up from the first two characters of
    `fbi_code` (so "04A" and "04B" share one description).  Codes with no
    entry in the table, and missing codes, become "Unidentified".  The new
    column is stored as a pandas category.  Mutates `df` in place and
    returns None.
    """
    # One pass over the column instead of one string scan per known code.
    prefixes = df["fbi_code"].str[:2]
    # map() leaves unknown/missing prefixes as NaN; send those to the
    # catch-all label rather than letting them fall into the first branch.
    descriptions = prefixes.map(_FBI_CODE_DESCRIPTIONS).fillna("Unidentified")
    df["fbi_code_description"] = descriptions.astype("category")
                                                                     

def convert_coords_to_geometry(df):
    """
    Converts lat/lon attributes to more usable geometry objects.

    Adds a `geometry` column to `df` in place, holding one Point built from
    each row's (longitude, latitude) pair, and returns a GeoDataFrame
    constructed from `df` with that column as its geometry.  Callers that
    ignore the return value still get the in-place `geometry` column.
    """
    # Point takes (x, y), i.e. longitude first, then latitude.
    df['geometry'] = [Point(xy) for xy in zip(df.longitude, df.latitude)]
    # Return the GeoDataFrame instead of binding it to a local and dropping it.
    return geopandas.GeoDataFrame(df, geometry='geometry')
    
    

def join_with_econ(df, econ_path="../data/cfnai/cfnai-realtime-revised.xlsx"):
    """
    Joins crime data with revised_econ data.

    params
        > df: crime DataFrame containing `year` and `month` columns
        > econ_path (string): location of the economic workbook; it must
          have a datetime `Month` column.  Defaults to the repository copy.

    returns
        > Dataframe: a new DataFrame (the input is not modified) holding the
          rows of `df` inner-joined to the economic columns on year/month.
          Crime rows whose year/month has no economic row are dropped.
    """
    econ_df = pd.read_excel(econ_path)

    # Split the monthly timestamp into the same year/month keys the crime
    # frame carries, then discard the original column.
    period = econ_df["Month"].dt
    econ_df["month"] = pd.to_numeric(period.month, downcast="unsigned")
    econ_df["year"] = period.year
    econ_df = econ_df.drop(columns="Month")

    # Name the keys explicitly so an accidental shared column name cannot
    # silently become part of the join condition.
    return pd.merge(df, econ_df, on=["year", "month"], how="inner")


def etl_pipeline(df):
    """
    Runs the full ETL pipeline on a crime DataFrame.

    The cleaning/enrichment steps listed below each modify `df` in place, so
    pass a copy if the original must be preserved.  The final economic join
    produces a new DataFrame, which is what gets returned.  Add further
    in-place steps to the tuple to extend the pipeline.
    """
    in_place_steps = (
        remove_null_objects,
        add_dt_attributes,
        remove_irrelevant_attributes,
        add_fbi_code_description,
        convert_coords_to_geometry,
    )
    for step in in_place_steps:
        step(df)
    return join_with_econ(df)

## Check out the Results

### Original

In [10]:
df.head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,11262006,JB193516,2017-12-01 00:01:00+00:00,002XX E 132ND ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,False,533,5.0,9.0,54,11,,,2017,2018-03-21 16:01:31+00:00,,,
1,11337629,JB292573,2018-06-04 14:00:00+00:00,023XX S CANAL ST,880,THEFT,PURSE-SNATCHING,SIDEWALK,False,False,914,9.0,25.0,34,06,,,2018,2018-06-11 15:52:24+00:00,,,
2,11427525,JB410819,2017-05-01 00:00:00+00:00,052XX S LAWNDALE AVE,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,RESIDENCE,False,False,822,8.0,23.0,62,17,,,2017,2018-08-28 16:01:15+00:00,,,
3,11699019,JC277471,2019-05-24 22:48:00+00:00,130XX S EXCHANGE AVE,498,BATTERY,AGGRAVATED DOMESTIC BATTERY: HANDS/FIST/FEET S...,RESIDENCE,False,True,433,4.0,10.0,55,04B,1197765.0,1819090.0,2019,2019-06-30 15:56:27+00:00,41.658424,-87.551978,"(41.658424474, -87.551978488)"
4,11699335,JC275733,2019-04-26 17:21:00+00:00,038XX W 47TH ST,1754,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,False,False,821,8.0,14.0,57,02,1151639.0,1873196.0,2019,2019-06-30 15:56:27+00:00,41.807924,-87.719353,"(41.807924493, -87.719352913)"


#### Check for missing data that needs to be resolved

In [11]:
# Null values appear in location_description, district, ward, x_coordinate, y_coordinate, latitude, longitude and location
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973516 entries, 0 to 973515
Data columns (total 22 columns):
unique_key              973516 non-null int64
case_number             973516 non-null object
date                    973516 non-null datetime64[ns, UTC]
block                   973516 non-null object
iucr                    973516 non-null object
primary_type            973516 non-null object
description             973516 non-null object
location_description    969244 non-null object
arrest                  973516 non-null bool
domestic                973516 non-null bool
beat                    973516 non-null int64
district                973515 non-null float64
ward                    973488 non-null float64
community_area          973516 non-null int64
fbi_code                973516 non-null object
x_coordinate            951616 non-null float64
y_coordinate            951616 non-null float64
year                    973516 non-null int64
updated_on              973516 non

In [12]:
# A lot of unknown location_descriptions
df[df.location_description.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
25,12036675,JD215832,2020-04-22 10:00:00+00:00,049XX W CARMEN AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1623,16.0,45.0,12,11,1142410.0,1933454.0,2020,2020-04-29 15:55:20+00:00,41.973456,-87.751705,"(41.973455578, -87.751704899)"
63,11489624,JB492634,2018-10-26 15:05:00+00:00,062XX N HAMLIN AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1711,17.0,39.0,13,11,1149906.0,1941124.0,2018,2018-11-02 16:18:21+00:00,41.99436,-87.723939,"(41.99435956, -87.723939394)"
83,11216825,JB133133,2017-08-12 00:00:00+00:00,111XX S Avenue G,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,433,4.0,10.0,52,11,,,2017,2018-01-29 15:56:21+00:00,,,
108,12055419,JD237296,2020-05-19 19:20:00+00:00,063XX N LENOX AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,,False,False,1621,16.0,39.0,12,11,1141943.0,1941689.0,2020,2020-05-26 15:49:34+00:00,41.996062,-87.753217,"(41.996061721, -87.753217084)"
115,12175069,JD377655,2020-09-01 13:00:00+00:00,057XX N KARLOV AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,1711,17.0,39.0,13,11,1148036.0,1937484.0,2020,2020-09-27 15:48:13+00:00,41.984407,-87.730912,"(41.984407487, -87.730912324)"


In [13]:
# x_coordinate, y_coordinate, latitude, longitude and location have null values all on same objects
df[df.x_coordinate.isnull()].head()

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,11262006,JB193516,2017-12-01 00:01:00+00:00,002XX E 132ND ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,APARTMENT,False,False,533,5.0,9.0,54,11,,,2017,2018-03-21 16:01:31+00:00,,,
1,11337629,JB292573,2018-06-04 14:00:00+00:00,023XX S CANAL ST,880,THEFT,PURSE-SNATCHING,SIDEWALK,False,False,914,9.0,25.0,34,06,,,2018,2018-06-11 15:52:24+00:00,,,
2,11427525,JB410819,2017-05-01 00:00:00+00:00,052XX S LAWNDALE AVE,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,RESIDENCE,False,False,822,8.0,23.0,62,17,,,2017,2018-08-28 16:01:15+00:00,,,
36,12199115,JD405872,2020-10-21 02:24:00+00:00,061XX W HIGGINS AVE,454,BATTERY,"AGGRAVATED P.O. - HANDS, FISTS, FEET, NO / MIN...",STREET,False,True,1622,16.0,45.0,11,08B,,,2020,2020-10-28 15:58:23+00:00,,,
37,12208113,JD416338,2020-10-30 12:30:00+00:00,051XX N NAGLE AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,RESIDENCE,False,False,1622,16.0,45.0,11,11,,,2020,2020-11-06 15:50:21+00:00,,,


#### Check for duplicate rows that need to be resolved

In [14]:
# Great! No duplicate objects
df.duplicated().sum()

0

### Pipelined

In [15]:
# Work on a copy: the pipeline steps modify their input in place, and the raw
# query result in df should stay intact for comparison.
cp = df.copy()

In [16]:
cp = etl_pipeline(cp);

In [17]:
cp.head()

Unnamed: 0,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,latitude,longitude,location,month,hour,dayofweek,weekend,fbi_code_description,geometry,CF,CF3,PI,EUH,CH,SOI
0,2019-05-24 22:48:00+00:00,130XX S EXCHANGE AVE,498,BATTERY,AGGRAVATED DOMESTIC BATTERY: HANDS/FIST/FEET S...,RESIDENCE,False,True,433,4.0,10.0,55,04B,1197765.0,1819090.0,2019,41.658424,-87.551978,"(41.658424474, -87.551978488)",5,22,4,False,Assault,POINT (-87.551978488 41.658424474),-0.038356,-0.212009,0.00874,-0.014736,-0.027105,-0.005254
1,2019-05-18 06:00:00+00:00,008XX E 100TH PL,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,OTHER,False,False,511,5.0,8.0,50,11,1183760.0,1838243.0,2019,41.71132,-87.602631,"(41.711319868, -87.602630978)",5,6,5,True,Shots Fired,POINT (-87.60263097799999 41.711319868),-0.038356,-0.212009,0.00874,-0.014736,-0.027105,-0.005254
2,2019-05-10 12:00:00+00:00,059XX N OCONTO AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,OTHER,False,False,1612,16.0,41.0,10,11,1126974.0,1938858.0,2019,41.988558,-87.808346,"(41.988558252, -87.80834608)",5,12,4,False,Shots Fired,POINT (-87.80834608000001 41.988558252),-0.038356,-0.212009,0.00874,-0.014736,-0.027105,-0.005254
3,2019-05-18 09:30:00+00:00,052XX S ARCHER AVE,1206,DECEPTIVE PRACTICE,"THEFT BY LESSEE,MOTOR VEH",OTHER,True,False,815,8.0,23.0,57,11,1146707.0,1870103.0,2019,41.799532,-87.737521,"(41.799532089, -87.737521099)",5,9,5,True,Shots Fired,POINT (-87.73752109900001 41.799532089),-0.038356,-0.212009,0.00874,-0.014736,-0.027105,-0.005254
4,2019-05-09 09:00:00+00:00,051XX S WENTWORTH AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,POLICE FACILITY/VEH PARKING LOT,False,False,225,2.0,3.0,37,11,1175826.0,1871120.0,2019,41.801719,-87.630704,"(41.80171934, -87.630703621)",5,9,3,False,Shots Fired,POINT (-87.63070362099999 41.80171934),-0.038356,-0.212009,0.00874,-0.014736,-0.027105,-0.005254


In [18]:
cp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 923983 entries, 0 to 923982
Data columns (total 31 columns):
date                    923983 non-null datetime64[ns, UTC]
block                   923983 non-null object
iucr                    923983 non-null object
primary_type            923983 non-null object
description             923983 non-null object
location_description    923983 non-null object
arrest                  923983 non-null bool
domestic                923983 non-null bool
beat                    923983 non-null int64
district                923983 non-null float64
ward                    923983 non-null float64
community_area          923983 non-null int64
fbi_code                923983 non-null object
x_coordinate            923983 non-null float64
y_coordinate            923983 non-null float64
year                    923983 non-null int64
latitude                923983 non-null float64
longitude               923983 non-null float64
location                923983 

In [19]:
# Export cleaned data
# cp.to_csv('../data/cleaned_data.csv', index=False) # Uncommenting and running it will overwrite the DW