# IML 2018 Modeled

In [1]:
# Name of the BigQuery dataset that holds the modeled IML 2018 tables;
# interpolated into the `bq mk` command in the next cell.
dataset_id = "iml_2018_modeled"

In [2]:
# Create the dataset named by `dataset_id` in the US location via the bq CLI.
!bq --location=US mk --dataset {dataset_id}

Dataset 'sashimi-266523:iml_2018_modeled' successfully created.


## Tables from Original Public Resources Table

In [11]:
%%bigquery
-- Build the modeled public_resources table from the staging copy:
--   * keep the identifying / address / contact / location columns,
--   * expose ADZIP5 as a STRING column named ZIPCODE,
--   * convert the 2015 income and revenue text columns to FLOAT64.
create table iml_2018_modeled.public_resources as
select
  MID,
  DISCIPL,
  COMMONNAME,
  LEGALNAME,
  ADSTREET,
  ADCITY,
  ADSTATE,
  cast(ADZIP5 as STRING) as ZIPCODE,
  PHONE,
  WEBURL,
  -- Blank (whitespace-only) amounts are stored as 0; NULL stays NULL.
  -- The value is trimmed instead of using replace(x, ' ', '0'), because
  -- replacing every space with a digit silently corrupts padded values
  -- (e.g. '123 ' would become 1230).
  case
    when trim(INCOME15) = '' then 0.0
    else cast(trim(INCOME15) as FLOAT64)
  end as INCOME,
  case
    when trim(REVENUE15) = '' then 0.0
    else cast(trim(REVENUE15) as FLOAT64)
  end as REVENUE,
  LONGITUDE,
  LATITUDE
from `iml_2018_staging.public_resources`

In [66]:
%%bigquery
-- Total number of rows in the modeled table.
select
  count(*)
from
  `iml_2018_modeled.public_resources`

Unnamed: 0,f0_
0,7429


In [67]:
%%bigquery
-- Number of distinct MID values; equal to the row count when MID is unique.
select
  count(distinct MID)
from
  `iml_2018_modeled.public_resources`

Unnamed: 0,f0_
0,7429


# Process Data with Beam

In [2]:
# Execute the Beam pipeline script in this kernel (the log below shows it
# running with the DirectRunner).
% run Public_Resources_Beam.py

  experiments = p.options.view_as(DebugOptions).experiments or []
INFO:apache_beam.runners.direct.direct_runner:Running pipeline with DirectRunner.
INFO:apache_beam.io.gcp.bigquery_tools:Using location 'US' from table <TableReference
 datasetId: 'iml_2018_modeled'
 projectId: 'sashimi-266523'
 tableId: 'public_resources'> referenced by query SELECT * FROM iml_2018_modeled.public_resources limit 50
INFO:apache_beam.io.gcp.bigquery_tools:Created table sashimi-266523.iml_2018_modeled.Public_Resources_Beam with schema <TableSchema
 fields: [<TableFieldSchema
 fields: []
 mode: 'NULLABLE'
 name: 'MID'
 type: 'INTEGER'>, <TableFieldSchema
 fields: []
 mode: 'NULLABLE'
 name: 'DISCIPL'
 type: 'STRING'>, <TableFieldSchema
 fields: []
 mode: 'NULLABLE'
 name: 'COMMONNAME'
 type: 'STRING'>, <TableFieldSchema
 fields: []
 mode: 'NULLABLE'
 name: 'LEGALNAME'
 type: 'STRING'>, <TableFieldSchema
 fields: []
 mode: 'NULLABLE'
 name: 'ADSTREET'
 type: 'STRING'>, <TableFieldSchema
 fields: []
 mode: 'N

In [4]:
# Execute the second pipeline script in this kernel.
# NOTE(review): presumably the Dataflow-runner variant that writes
# Public_Resources_Beam_DF -- confirm against the script itself.
% run Public_Resources_beam_dataflow.py

  kms_key=transform.kms_key))


# Verify Data Integrity

In [14]:
# Check Primary Key Constraint

In [8]:
%%bigquery
-- Primary-key check, part 1: total number of rows.
select
  count(*)
from
  `iml_2018_modeled.Public_Resources_Beam_DF`

Unnamed: 0,f0_
0,7429


In [9]:
%%bigquery
-- Primary-key check, part 2: number of distinct MID values.
-- MID is unique when this equals the total row count.
select
  count(distinct MID)
from
  `iml_2018_modeled.Public_Resources_Beam_DF`

Unnamed: 0,f0_
0,7429


In [15]:
# Check that every zipcode has 5 digits

In [13]:
%%bigquery
-- Count rows whose ZIPCODE is not made of exactly five digits.
-- A plain `length(ZIPCODE) < 5` test only catches short values; this also
-- catches values that are too long or contain non-digit characters.
-- Rows with a NULL ZIPCODE are not counted (both predicates evaluate to NULL).
select
  count(*)
from
  `iml_2018_modeled.Public_Resources_Beam_DF`
where
  length(ZIPCODE) != 5
  or regexp_contains(ZIPCODE, r'[^0-9]')

Unnamed: 0,f0_
0,0


In [22]:
# Check Foreign Key Constraint: ZIPCODE must exist in the ZIP-to-ZCTA5 mapping
# 14 violations

In [23]:
%%bigquery
-- List every resource row whose ZIPCODE has no matching row in the
-- ZIP-to-ZCTA5 mapping table (the mapping columns come back NULL).
select
  *
from
  `iml_2018_modeled.Public_Resources_Beam_DF` as p
left join
  `uds_mapper_modeled.ZIP_To_ZCTA5_Beam_DF` as m
  using (ZIPCODE)
where
  m.ZIPCODE is null

Unnamed: 0,ZIPCODE,MID,DISCIPL,COMMONNAME,LEGALNAME,ADSTREET,ADCITY,ADSTATE,PHONE,WEBURL,INCOME,REVENUE,LONGITUDE,LATITUDE,STATE,ZCTA5
0,82702,8409504268,ART,UNIVERSITY OF WYOMING ART MUSEUM,UNIVERSITY OF WYOMING,2111 EAST WILLETT DRIVE,LARAMIE,WY,3077666622,HTTP://WWW.UWYO.EDU/ARTMUSEUM/,0.0,0.0,-105.60719,41.42252,,
1,2147,8402500451,ART,MASSACHUSETTS MUSEUM OF CONTEMPORARY ART,MASSACHUSETTS MUSEUM OF CONTEMPORARY ART FOUND...,87 MARSHALL ST NORTH ADAMS,NORTH ADAMS,MA,4136622111,HTTP://WWW.MASSMOCA.ORG,9169801.0,7912229.0,-73.11297,42.70469,,
2,3039,8401300366,ART,HIGH MUSEUM OF ART,ROBERT W WOODRUFF ARTS CENTER INC,1280 PEACHTREE ST NE,ATLANTA,GA,4047334400,HTTP://WWW.HIGH.ORG,304880380.0,105256535.0,-84.38811,33.7897,,
3,34326,8409504032,ART,SELBY GALLERY,RINGLING COLLEGE OF ART AND DESIGN INC,2700 NORTH TAMIAMI TRAIL,SARASOTA,FL,9413597563,HTTP://WWW.RINGLING.EDU/INDEX.PHP?ID=171,90400870.0,73460825.0,-82.54901,27.35991,,
4,92634,8400600449,ART,FULLERTON ART GALLERY,FULLERTON MUSEUM CENTER ASSOCIATION,301 N POMONA AVE,FULLERTON,CA,7147386545,HTTP://WWW.CITYOFFULLERTON.COM,251845.0,185533.0,-117.92217,33.87254,,
5,85273,8409501998,BOT,BOYCE THOMPSON SOUTHWESTERN ARBORETUM,BOYCE THOMPSON SOUTHWESTERN ARBORETUM,37615 U S 60,SUPERIOR,AZ,5206892723,HTTP://AZSTATEPARKS.COM/PARKS/BOTH/,0.0,0.0,-111.10269,33.28859,,
6,26503,8405400079,HST,MORGANTOWN HISTORY MUSEUM,FRIENDS OF MORGANTOWN HISTORYMUSEUM,111 HIGH STREET,MORGATOWN,WV,3043191800,HTTP://WWW.MORGANTOWNHISTORYMUSEUM.ORG/,0.0,0.0,-79.9569,39.62767,,
7,95000,8400601553,ZAW,MARINE MAMMAL CENTER,MARINE MAMMAL CENTER,2000 BUNKER ROAD FORT C,SAUSALITO,CA,4152897325,HTTP://WWW.MARINEMAMMALCENTER.ORG,10278218.0,9448577.0,-122.49982,37.86264,,
8,3139,8409502883,ART,BRENAU UNIVERSITY GALLERIES,BRENAU UNIVERSITY INC,BRENAU UNIVERSITY NORTH ATLANTA CAMPUS,GAINESVILLE,GA,7705346299,,54576928.0,53787311.0,-83.82195,34.30214,,
9,12676,8409401014,CMU,NORTH COUNTRY CHILDREN'S MUSEUM,NORTH COUNTRY CHILDREN'S MUSEUM,41 ELM STREET SUITE 106,POTSDAM,NY,3152614604,,0.0,0.0,-74.97721,44.66975,,


In [24]:
%%bigquery
-- Start from the income rows, attach the ZIP mapping by ZCTA5, then attach
-- public resources by ZIPCODE. Both joins are left joins, so income rows
-- without a match are kept with NULLs in the joined columns.
select
  *
from
  `acs_2018_modeled.Income_Beam_DF` as i
left join
  `uds_mapper_modeled.ZIP_To_ZCTA5_Beam_DF` as m
  using (ZCTA5)
left join
  `iml_2018_modeled.Public_Resources_Beam_DF` as p
  using (ZIPCODE)


Unnamed: 0,ZIPCODE,ZCTA5,Income_Less_10k,Income_10k_14k,Income_15k_24k,Income_25k_34k,Income_35k_49k,Income_50k_74k,Income_75k_99k,Income_100k_149k,...,LEGALNAME,ADSTREET,ADCITY,ADSTATE,PHONE,WEBURL,INCOME,REVENUE,LONGITUDE,LATITUDE
0,02641,02641,15.2,0.0,0.0,23.2,11.6,0.0,7.1,0.0,...,,,,,,,,,,
1,26342,26342,30.0,0.0,16.0,0.0,19.0,18.0,9.0,0.0,...,,,,,,,,,,
2,12736,12736,31.3,0.0,12.5,0.0,31.3,0.0,15.6,0.0,...,,,,,,,,,,
3,16941,16941,13.2,0.0,23.7,5.3,31.6,0.0,18.4,0.0,...,,,,,,,,,,
4,99733,99733,20.0,20.0,12.0,12.0,24.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6823,81125,81125,11.5,8.4,8.5,16.8,15.7,16.2,12.6,4.6,...,,,,,,,,,,
6824,65793,65793,11.2,6.6,10.6,20.1,12.9,14.8,9.3,11.1,...,,,,,,,,,,
6825,70514,70514,10.8,1.5,13.8,13.9,17.5,15.4,13.0,7.0,...,,,,,,,,,,
6826,30083,30083,10.4,6.3,13.1,13.6,18.1,16.4,10.9,8.2,...,,,,,,,,,,
