In [1]:
# default_exp StatsCanada

# Merge Stats Canada

> Merge scraped Stats Canada data based on region.

In [2]:
#hide
from nbdev.showdoc import *
from nbdev import *

# Config Script

Reading in the config parameters as variables. \
When building the package with `nbdev_build`, any cells with the `#export` tag will be exported to the default script, which is `StatsCanada.py`.  
Otherwise a module name can be specified. 

Note: the module being exported to must already exist. However it can be just a template.

In [3]:
# export config
from pathlib import Path
import json

# Load the pipeline settings once; later cells read their paths and region
# lists out of the resulting `config` dict. The path is relative, so it is
# resolved against the current working directory.
config_file_path = Path("config.json")
# The context manager closes the handle as soon as parsing finishes instead of
# leaving it open for the lifetime of the kernel/module. `f` stays bound (to
# the closed file) so any code that still references the name keeps working.
with open(config_file_path,"r") as f:
    config = json.load(f)

In [4]:
# export config
# Settings nested under the "stats_canada" section of config.json.
stats_can_regions = config["stats_canada"]["stats_can_regions"]
stats_can_feature_by_ids_dir = config["stats_canada"]["stats_can_feature_by_ids_dir"]

# Top-level settings. `target_path`, `polygon_path` and `target_df_regions`
# are the values handed to get_target_df in the exploratory cells further down.
target_path = config["target_path"]
target_features = config["target_features"]
feature_encoding_map_dir = config["feature_encoding_map_dir"]
polygon_path = config["polygon_path"]
target_df_regions = config["target_df_regions"]

# Preparing Logger

`structlog` is used to log the progress of the merging pipeline.

In [5]:
# export logging
import datetime
import logging
import sys
from structlog import wrap_logger
from structlog.processors import JSONRenderer
from structlog.stdlib import filter_by_level

def add_timestamp(_, __, event_dict):
    """
    Add a UTC timestamp to a structlog entry.

    Written as a structlog processor: the first two positional arguments
    (the logger and the method name) are accepted but not used.

    Args:
        event_dict: structlog event_dict

    Returns:
        event_dict: the same dict, modified in place, now including a
            "timestamp" key holding an ISO-8601 string in UTC
    """
    # Store a string, not a datetime object: a datetime is not JSON
    # serializable, so the JSON renderer fell back to its repr and the log
    # lines read "datetime.datetime(2021, 2, 12, ...)". An ISO-8601 string is
    # rendered as-is and can be parsed by log tooling.
    # now(timezone.utc) replaces the deprecated, timezone-naive utcnow().
    event_dict["timestamp"] = datetime.datetime.now(datetime.timezone.utc).isoformat()
    return event_dict


def logging_setup(log_level='INFO'):
    """
    Configure stdlib logging and return a structlog-wrapped logger.

    Args:
        log_level: logging level handed to logging.basicConfig,
            e.g. 'INFO' or 'WARNING'

    Returns:
        logger: structlog logger wrapping this module's stdlib logger
    """
    # The stdlib side only prints the already-rendered message to stdout;
    # all formatting is done by the structlog processors below.
    logging.basicConfig(level=log_level, format="%(message)s", stream=sys.stdout)

    stdlib_logger = logging.getLogger(__name__)

    # Processors run in order: drop entries below the configured level,
    # stamp the entry, then render it as indented, key-sorted JSON.
    processor_chain = [
        filter_by_level,
        add_timestamp,
        JSONRenderer(indent=1, sort_keys=True),
    ]

    return wrap_logger(stdlib_logger, processors=processor_chain)

# Preprocessing of Target Values

Reading in the target values and mapping the target locations to polygons.\
Returns a dictionary of geo dataframes

In [6]:
# export target_values_preprocessing

import pandas as pd
import numpy as np
import os
import re
import geopandas as gp
import warnings 
# Silences every warning for the rest of the process: no category or module
# filter is given, so this is not limited to one library.
# NOTE(review): presumably added to quiet pandas/geopandas noise in the
# notebook output; confirm this is wanted in the exported module too.
warnings.filterwarnings("ignore")

In [7]:
# export target_values_preprocessing

def _map_target_polygons(target,target_file,poly_file,target_df_regions):
    '''
    Read one target csv and one polygon file and join them into a single geo dataframe.

    parameters:
    ---------
    target -> str, name of the polygon-file column used as the first join key
    target_file -> path to the target csv (read with pd.read_csv)
    poly_file -> path to the polygon file (read with gp.read_file)
    target_df_regions -> list[str], csv columns used as join keys; paired in order
                         with [target, 'PRNAME'] on the polygon side, so two names are expected

    output:
    --------
    mapped_df -> gp.GeoDataFrame with 'geometry' as its geometry column

    Rows whose geometry is missing after the left join are dropped, and the
    polygon-side key columns (target and 'PRNAME') are removed from the result.
    '''
    targets_df = pd.read_csv(target_file)
    polygons_gdf = gp.read_file(poly_file)

    polygon_keys = [target, 'PRNAME']

    # Left join keeps every csv row; the dropna that follows then discards the
    # rows that did not pick up a geometry.
    joined = targets_df.merge(
        polygons_gdf,
        how='left',
        left_on=target_df_regions,
        right_on=polygon_keys,
    )
    joined = joined.dropna(subset=['geometry']).drop(columns=polygon_keys)

    return gp.GeoDataFrame(joined, geometry='geometry')

In [8]:
# export target_values_preprocessing

def get_target_df(target_features,target_path,polygon_path,target_df_regions,logger):
    '''
    Build one polygon-mapped geo dataframe per target and return them keyed by target name.

    parameters:
    ------------
    target_features -> accepted but not read by this function
                       NOTE(review): the targets below are hard-coded; confirm whether
                       this argument was meant to drive the loop.
    target_path -> directory containing the "<target>_targets_final.csv" files
    polygon_path -> directory containing the "<target>_target_polygons.gpkg" files
    target_df_regions -> list[str], forwarded unchanged to _map_target_polygons
    logger -> object with an .info(msg) method, used for progress messages

    returns:
    ---------
    mapped_targets -> dict[str,gp.GeoDataFrame] with keys "homeless", "suicide", "violence"
    '''
    logger.info("Preprocessing target features")

    mapped_targets = {}
    for target in ("homeless", "suicide", "violence"):
        logger.info(f"mapping {target} polygon")

        polygons_gpkg = Path(polygon_path) / f"{target}_target_polygons.gpkg"
        targets_csv = Path(target_path) / f"{target}_targets_final.csv"

        mapped_targets[target] = _map_target_polygons(
            target, targets_csv, polygons_gpkg, target_df_regions
        )
        logger.info(f"{target} polygon mapping completed")

    return mapped_targets

Example of exploratory testing \
\
We are able to experiment with and visually validate the modules, similarly to EDA code where the ML dev is constantly iterating. This will not be exported to a `.py` file during the build.

In [9]:
# Logger for the exploratory cells below; this cell has no export tag.
logger = logging_setup(log_level='INFO')

In [10]:
# Run the target preprocessing end to end with the values read from config.json;
# the result is a dict of geo dataframes keyed by target name.
mapped_targets = get_target_df(target_features,
                               target_path,
                               polygon_path,
                               target_df_regions,
                               logger)

{
 "event": "Preprocessing target features",
 "timestamp": "datetime.datetime(2021, 2, 12, 5, 59, 30, 32083)"
}
{
 "event": "mapping homeless polygon",
 "timestamp": "datetime.datetime(2021, 2, 12, 5, 59, 30, 33108)"
}
Failed to auto identify EPSG: 7
{
 "event": "homeless polygon mapping completed",
 "timestamp": "datetime.datetime(2021, 2, 12, 5, 59, 30, 119878)"
}
{
 "event": "mapping suicide polygon",
 "timestamp": "datetime.datetime(2021, 2, 12, 5, 59, 30, 120326)"
}
Failed to auto identify EPSG: 7
{
 "event": "suicide polygon mapping completed",
 "timestamp": "datetime.datetime(2021, 2, 12, 5, 59, 30, 213081)"
}
{
 "event": "mapping violence polygon",
 "timestamp": "datetime.datetime(2021, 2, 12, 5, 59, 30, 214078)"
}
Failed to auto identify EPSG: 7
{
 "event": "violence polygon mapping completed",
 "timestamp": "datetime.datetime(2021, 2, 12, 5, 59, 30, 288904)"
}


In [11]:
# Visual check: first rows of the homeless targets after the polygon join.
mapped_targets['homeless'].head()

Unnamed: 0,Year,Province,Community,PIT Total,PIT-Sheltered,PIT-Unsheltered,DV_target,CSDNAME,geometry
0,2002,British Columbia,Burnaby,18,14,44,Burnaby,Burnaby,"POLYGON ((4026984.377 2004001.803, 4027065.283..."
1,2002,British Columbia,Delta/White Rock,11,0,0,,Delta,"MULTIPOLYGON (((4027416.129 1971099.869, 40275..."
2,2002,British Columbia,Langley,18,0,0,,Langley,"POLYGON ((4044035.820 1983268.574, 4044044.823..."
3,2002,British Columbia,Maple Ridge/Pitt Meadows,66,0,0,,Maple Ridge,"POLYGON ((4060993.469 1993881.186, 4061981.717..."
4,2002,British Columbia,New Westminster,74,0,0,New Westminster,New Westminster,"POLYGON ((4031000.411 1995145.954, 4031031.369..."


In [12]:
# Visual check: first rows of the suicide targets after the polygon join.
mapped_targets['suicide'].head()

Unnamed: 0,Year,Community,Year_mapping,suicide_value,Province,CSDNAME,geometry
0,2000,St. John's,2000/2002,35,Newfoundland and Labrador,Bauline,"MULTIPOLYGON (((8991919.260 2135215.551, 89918..."
1,2000,Halifax,2000/2002,80,Nova Scotia,Beaver Lake 17,"MULTIPOLYGON (((8466571.394 1473434.837, 84665..."
2,2000,Moncton,2000/2002,35,New Brunswick,Coverdale,"POLYGON ((8226217.329 1576063.031, 8226255.246..."
3,2000,Saint John,2000/2002,35,New Brunswick,Grand Bay-Westfield,"MULTIPOLYGON (((8201256.689 1415099.943, 82012..."
4,2000,Saguenay,2000/2002,95,Quebec,Begin,"POLYGON ((7718195.920 1638696.611, 7718179.649..."


In [13]:
# Visual check: first rows of the violence targets after the polygon join.
mapped_targets['violence'].head()

Unnamed: 0,Year,Province,Community,violence_rate,nat_avg,HOMELESS_T,CSDNAME,geometry
0,2008,British Columbia,Abbotsford,1188,1357.159959,Abbotsford,Abbotsford,"POLYGON ((4059954.206 1963195.571, 4059965.434..."
1,2008,Ontario,Guelph,724,1357.159959,Guelph-Wellington,Guelph,"POLYGON ((7156320.820 906917.497, 7156559.923 ..."
2,2008,New Brunswick,Saint John,2271,1357.159959,Saint John,Saint John,"MULTIPOLYGON (((8201256.689 1415099.943, 82012..."
3,2008,Saskatchewan,Saskatoon,2002,1357.159959,Saskatoon,Saskatoon,"POLYGON ((5208941.440 1902191.771, 5208830.854..."
4,2008,Ontario,Greater Sudbury,1087,1357.159959,Greater Sudbury,Greater Sudbury,"MULTIPOLYGON (((7464186.254 1057517.597, 74641..."
