In [1]:
# initialize
from datamart_isi.entries import Datamart, DatamartSearchResult
from d3m.container.dataset import Dataset, D3MDatasetLoader
from common_primitives.denormalize import Hyperparams as hyper_denormalize, DenormalizePrimitive
from d3m.base import utils as d3m_utils
import os
import pandas as pd

# Optional logging configuration: grab the root logger and set its threshold
# to INFO. These lines exist only for logging and can be removed (or the level
# raised) if the extra messages are not wanted.
import logging
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

In [2]:
# Connect to the ISI datamart. This SPARQL endpoint is the current location
# and may change in the future.
isi_datamart_url = "http://dsbox02.isi.edu:9001/blazegraph/namespace/datamart4/sparql"
a = Datamart(connection_url=isi_datamart_url)

# Load the D3M dataset; "DA_poverty_estimation" is used as the example here.
# The location of datasetDoc.json is machine-specific, so it can be overridden
# through the D3M_DATASET_DOC environment variable without editing this cell.
# When the variable is unset, the original example location is used.
loader = D3MDatasetLoader()
path = os.environ.get(
    "D3M_DATASET_DOC",
    "/Users/claire/Documents/ISI/datamart/datamart-userend/examples/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json",
)
json_file = os.path.abspath(path)
# Fail early with a clear message instead of a less obvious error from the loader.
if not os.path.isfile(json_file):
    raise FileNotFoundError(
        "datasetDoc.json not found at {!r}; set D3M_DATASET_DOC to your dataset path".format(json_file)
    )
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)

# Run the denormalize primitive with its default hyperparameters and keep the
# `.value` of the produce() result as the working dataset.
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs=all_dataset).value

In [3]:
# Preview the first five rows of the 'learningData' entry of the loaded dataset.
all_dataset['learningData'].head(n=5)

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016
0,1,13297,GA,Walton County,1,11385
1,2,13137,GA,Habersham County,6,6500
2,3,54017,WV,Doddridge County,9,1460
3,4,55055,WI,Jefferson County,4,7618
4,5,38065,ND,Oliver County,8,191


In [4]:
"""
start search, run search with data function.
Here because the dataset do not have any "Text" semantic type columns,
the system will said that no columns can be augment
"""
search_res = a.search_with_data(query=None, supplied_data=all_dataset)

No columns can be augment with datamart!


In [5]:
"""
So we need to run wikifier first to get corresponding Q node columns
"""
search_result_wikifier = DatamartSearchResult(search_result={}, supplied_data=None, query_json={}, search_type="wikifier")
wikifiered_result = search_result_wikifier.augment(supplied_data=all_dataset)

No metadata can provide for wikifier augment


Current column: FIPS
The best matching P node is P882
Current column: State
The best matching P node is P5086
Current column: Area
[ERROR] No candidate P nodes found for input column : [Area]
Current column: RUCCode
A columns with all numerical values and useless detected, skipped


In [6]:
# Preview the wikified table; the new "*_wikidata" columns hold the Q nodes.
wikifiered_result['learningData'].head(n=5)

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata
0,1,13297,GA,Walton County,1,11385,Q498312,Q1428
1,2,13137,GA,Habersham County,6,6500,Q501096,Q1428
2,3,54017,WV,Doddridge County,9,1460,Q494081,Q1371
3,4,55055,WI,Jefferson County,4,7618,Q500958,Q1537
4,5,38065,ND,Oliver County,8,191,Q48933,Q1207


In [7]:
"""
Now we can search again to see what happened
run get next page, we will get real search results, it will only have 2 wikidata search results
Because searching on wikidata with large amount of Q nodes, it will take about 3-5 minutes or more to finish
"""
search_res = a.search_with_data(query=None, supplied_data=wikifiered_result)
s1 = search_res.get_next_page()

In [8]:
# Show the details of every search result on the first page as one table.
# Note: wikidata results have no dynamic score.
#
# The per-result frames returned by display() are collected first and then
# concatenated once. DataFrame.append was deprecated in pandas 1.4 and removed
# in pandas 2.0, and appending inside a loop re-copies the accumulated frame
# on every iteration.
# Assumes display() returns a one-row DataFrame, as the rendered output suggests.
result_frames = [each.display() for each in s1]
# pd.concat raises on an empty list; keep the empty-frame result the loop gave.
output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
output_df

Unnamed: 0,title,columns,join columns,score
0,wikidata search result for FIPS_wikidata,"population, area, inception",FIPS_wikidata,1.0
0,wikidata search result for State_wikidata,"population, nickname, motto text, demonym, nat...",State_wikidata,1.0
0,population.csv,FIPS State Area_Name Rural-urban_Continuum Cod...,FIPS_wikidata,0.705378
0,poverty.csv,FIPStxt State Area_Name Rural-urban_Continuum_...,FIPStxt_wikidata,0.705378
0,Unemployment.csv,FIPStxt State Area_name Rural_urban_continuum_...,FIPStxt_wikidata,0.705378
0,educate.csv,FIPS Code State Area name 2003 Rural-urban Con...,FIPS Code_wikidata,0.705378
0,poverty.csv,FIPStxt State Area_Name Rural-urban_Continuum_...,State_wikidata,0.883883
0,Unemployment.csv,FIPStxt State Area_name Rural_urban_continuum_...,State_wikidata,0.883883
0,educate.csv,FIPS Code State Area name 2003 Rural-urban Con...,State_wikidata,0.883883
0,population.csv,FIPS State Area_Name Rural-urban_Continuum Cod...,State_wikidata,0.883883


In [9]:
"""
We can search to get next page if you want
"""
s2 = search_res.get_next_page()
# show the search results details
# wikidata has no dynamic score
output_df = pd.DataFrame()
for each in s2:
    col1 = each.display()
    output_df = output_df.append(col1)
output_df

Unnamed: 0,title,columns,join columns,score
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456


In [10]:
# Augment with the two wikidata search results (the first two entries of s1).
# The augmentations are chained: the dataset produced by the first one is the
# input of the second.
wikidata_input = search_res.supplied_data
aug1 = s1[0].augment(supplied_data=wikidata_input)
aug2 = s1[1].augment(supplied_data=aug1)

In [11]:
# The augmented table gained a number of new columns
# (original note: 21 columns, compared with the original 8).
aug2['learningData'].head(n=5)

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,area_for_FIPS_wikidata,inception_for_FIPS_wikidata,...,area_for_State_wikidata,elevation above sea level_for_State_wikidata,inception_for_State_wikidata,maximum temperature record_for_State_wikidata,motto text_for_State_wikidata,native label_for_State_wikidata,nickname_for_State_wikidata,population_for_State_wikidata,short name_for_State_wikidata,water as percent of area_for_State_wikidata
1,868,13109,GA,Evans County,6,2500,Q493044,Q1428,484,1914-01-01T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
2,1045,13119,GA,Franklin County,8,4614,Q385931,Q1428,690,1784-02-25T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
3,400,13099,GA,Early County,6,3210,Q486757,Q1428,1337,1818-01-01T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
4,1201,13003,GA,Atkinson County,9,2176,Q488194,Q1428,891,1917-08-15T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
5,310,13071,GA,Colquitt County,6,11252,Q113005,Q1428,1441,1856-01-01T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22


In [12]:
"""
run download function on datamart general search results
You can get the materialized dataset with one extra column "join hints"
This column will give suggestions whether this row can be used to augment the original supplied data
If no row can be joined, this cell will be an empty list
Otherwise it will be a list of numbers which represent the row number of supplied data
"""
download_res = s1[3].download(supplied_data=aug2)
download_res['learningData'].head()

d3m.primitives.data_preprocessing.ensemble_voting.DSBOX: Primitive's Python path does not adhere to d3m.primitives namespace specification. Reason: primitive family segment must match primitive's primitive family.
d3m.primitives.data_preprocessing.ensemble_voting.DSBOX: Primitive's Python path does not adhere to d3m.primitives namespace specification. Reason: must have a known primitive name segment.


Current column: FIPStxt
The best matching P node is P882
Current column: State
The best matching P node is P5086
Current column: Area_Name
The best matching P node is P2390
Current column: Rural-urban_Continuum_Code_2003
A columns with all numerical values and useless detected, skipped
Current column: Urban_Influence_Code_2003
A columns with all numerical values and useless detected, skipped
Current column: Rural-urban_Continuum_Code_2013
A columns with all numerical values and useless detected, skipped
Current column: Urban_Influence_Code_2013
A columns with all numerical values and useless detected, skipped
Current column: POVALL_2017
Current column: CI90LBAll_2017
Current column: CI90UBALL_2017
Current column: PCTPOVALL_2017
A columns with all numerical values and useless detected, skipped
Current column: CI90LBALLP_2017
A columns with all numerical values and useless detected, skipped
Current column: CI90UBALLP_2017
A columns with all numerical values and useless detected, skipped


Unnamed: 0,FIPStxt,State,Area_Name,Rural-urban_Continuum_Code_2003,Urban_Influence_Code_2003,Rural-urban_Continuum_Code_2013,Urban_Influence_Code_2013,POVALL_2017,CI90LBAll_2017,CI90UBALL_2017,...,CI90UBINC_2017,POV04_2017,CI90LB04_2017,CI90UB04_2017,PCTPOV04_2017,CI90LB04P_2017,CI90UB04P_2017,FIPStxt_wikidata,State_wikidata,joining_pairs
0,0,US,United States,,,,,,,,...,60422,3932969.0,3880645.0,3985293.0,20.2,19.9,20.5,,,[]
1,1000,AL,Alabama,,,,,,,,...,48935,78986.0,75009.0,82963.0,27.7,26.3,29.1,,Q173,[]
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,,,,...,64565,,,,,,,Q156168,Q173,[]
3,1003,AL,Baldwin County,4.0,5.0,3.0,2.0,,,,...,60775,,,,,,,Q156163,Q173,[939]
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,,,,...,35762,,,,,,,Q109437,Q173,[942]


In [13]:
# Augmenting with a general search result works the same way as before.
# Per the original note, the result ends up as a 55-column dataframe.
aug3 = s1[3].augment(supplied_data=aug2)
aug3['learningData'].head(n=5)

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,area_for_FIPS_wikidata,inception_for_FIPS_wikidata,...,PCTPOV517_2017,PCTPOVALL_2017,POV017_2017,POV04_2017,POV517_2017,POVALL_2017,Rural-urban_Continuum_Code_2003,Rural-urban_Continuum_Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013
1,868,13109,GA,Evans County,6,2500,Q493044,Q1428,484,1914-01-01T00:00:00Z,...,38.4,28.0,,,799.0,,6.0,6.0,7.0,6.0
2,1045,13119,GA,Franklin County,8,4614,Q385931,Q1428,690,1784-02-25T00:00:00Z,...,24.7,18.7,,,885.0,,8.0,8.0,7.0,6.0
3,400,13099,GA,Early County,6,3210,Q486757,Q1428,1337,1818-01-01T00:00:00Z,...,37.7,26.7,,,726.0,,6.0,6.0,6.0,6.0
4,1201,13003,GA,Atkinson County,9,2176,Q488194,Q1428,891,1917-08-15T00:00:00Z,...,35.7,24.2,819.0,,572.0,,9.0,9.0,8.0,10.0
5,310,13071,GA,Colquitt County,6,11252,Q113005,Q1428,1441,1856-01-01T00:00:00Z,...,32.3,25.6,,,,,6.0,6.0,5.0,5.0
