In [1]:
# initialize
from datamart_isi.entries import Datamart
from d3m.container.dataset import Dataset, D3MDatasetLoader
from common_primitives.denormalize import Hyperparams as hyper_denormalize, DenormalizePrimitive
from d3m.base import utils as d3m_utils
import os
import pandas as pd

In [2]:
# Connect to the ISI datamart. The default endpoint below is the current one and may
# change in the future; set ISI_DATAMART_URL to override it without editing the notebook.
isi_datamart_url = os.environ.get(
    "ISI_DATAMART_URL",
    "http://dsbox02.isi.edu:9999/blazegraph/namespace/datamart3/sparql",
)
a = Datamart(connection_url=isi_datamart_url)

# Load the D3M dataset; "DA_poverty_estimation" is used as the example here.
# Point DA_POVERTY_DATASET_DOC at your own datasetDoc.json; the original author's
# local path is kept as the fallback so existing setups keep working.
loader = D3MDatasetLoader()
path = os.environ.get(
    "DA_POVERTY_DATASET_DOC",
    "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json",
)
json_file = os.path.abspath(path)
# Fail early with a readable message instead of an error from inside the loader.
if not os.path.isfile(json_file):
    raise FileNotFoundError(
        "datasetDoc.json not found at {}; set DA_POVERTY_DATASET_DOC to your dataset path".format(json_file)
    )
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)

# Run the denormalize primitive with its default hyperparameters and keep the
# produced value as the dataset used by the rest of the notebook.
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs=all_dataset).value


In [3]:
# Preview the first rows of the 'learningData' table of the loaded dataset.
all_dataset['learningData'].head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016
0,1,13297,GA,Walton County,1,11385
1,2,13137,GA,Habersham County,6,6500
2,3,54017,WV,Doddridge County,9,1460
3,4,55055,WI,Jefferson County,4,7618
4,5,38065,ND,Oliver County,8,191


In [4]:
"""
Start the search by running the search_with_data function.
Because the dataset does not have any columns with the "Text" semantic type,
the system will report that no columns can be augmented.
"""
# No explicit query is passed (query=None); only the supplied dataset is given to the search.
search_res = a.search_with_data(query=None, supplied_data=all_dataset)

No columns can be augment with datamart!


[INFO] No Q nodes columns found in input data, will run wikifier.


In [5]:
"""
Run get_next_page to get the real search results; there will be only 2 wikidata search results.
Explanation:
No columns with the "Qnodes" semantic type were found, so the wikifier is run before searching the wikidata database.
It generates 2 Q-node columns, one for FIPS and one for State.
These 2 columns can then be used to search the wikidata database.
Because searching wikidata with a large number of Q nodes is slow, this takes about 3 minutes or more to finish.
"""
# Fetch the first page of search results; s1 is iterated and indexed in the cells below.
s1 = search_res.get_next_page()

d3m.primitives.data_cleaning.cleaning_featurizer.DSBOX: Primitive is not providing a description through its docstring.
d3m.primitives.data_cleaning.column_fold.DSBOX: Primitive is not providing a description through its docstring.


warn from part 4 d3m.primitives.data_cleaning.labeler.DSBOX
{1, 2, 3, 4}
Current column: FIPS
The best matching P node is P882
Current column: State
The best matching P node is P5086
Current column: Area
[ERROR] No candidate P nodes found for input column : [Area]
Current column: RUCCode
A columns with all numerical values and useless detected, skipped
Wikidata Q nodes inputs detected! Will search with it.
Totally 2 Q nodes columns detected!


In [6]:
# because no more search results are found, s2 will be None
s2 = search_res.get_next_page()

In [7]:
# Show the details of the first-page search results in one table.
# Wikidata results have no dynamic score.
# Each result's display() frame is collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the growing frame on every iteration.
result_frames = [each.display() for each in s1]
# pd.concat rejects an empty list, so keep the original empty-frame result in that case.
output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
output_df

Unnamed: 0,title,columns,join columns,score
0,wikidata search result for FIPS_wikidata,"population, area, inception",FIPS_wikidata,1
0,wikidata search result for State_wikidata,"population, nickname, motto text, demonym, nat...",State_wikidata,1


In [8]:
# Augment with both wikidata search results: the first is applied to the
# supplied data of the search, the second is chained onto the first's output.
first_hit, second_hit = s1[0], s1[1]
aug1 = first_hit.augment(supplied_data=search_res.supplied_data)
aug2 = second_hit.augment(supplied_data=aug1)

In [9]:
# The returned aug2 is a d3m.container.dataset; pull its tabular resource
# out as (resource id, dataframe) and preview the first rows.
tabular_resource = d3m_utils.get_tabular_resource(dataset=aug2, resource_id=None)
res_id, aug2_dataframe = tabular_resource
aug2_dataframe.head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,area_for_FIPS_wikidata,population_for_FIPS_wikidata,area_for_State_wikidata,elevation above sea level_for_State_wikidata,inception_for_State_wikidata,motto text_for_State_wikidata,native label_for_State_wikidata,nickname_for_State_wikidata,population_for_State_wikidata,short name_for_State_wikidata,water as percent of area_for_State_wikidata
0,1570,13309,GA,Wheeler County,9,2111,Q498332,Q1428,,7909,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
1,1065,13301,GA,Warren County,8,1511,Q491529,Q1428,287.0,5558,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
2,2328,13185,GA,Lowndes County,3,23936,Q493134,Q1428,1323.0,112916,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
3,2595,13029,GA,Bryan County,2,3456,Q486848,Q1428,1177.0,33157,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
4,2820,13177,GA,Lee County,3,3190,Q491508,Q1428,938.0,29071,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22


In [10]:
# Display aug2_dataframe itself as the cell output (the previous cell showed only .head()).
aug2_dataframe

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,area_for_FIPS_wikidata,population_for_FIPS_wikidata,area_for_State_wikidata,elevation above sea level_for_State_wikidata,inception_for_State_wikidata,motto text_for_State_wikidata,native label_for_State_wikidata,nickname_for_State_wikidata,population_for_State_wikidata,short name_for_State_wikidata,water as percent of area_for_State_wikidata
0,1570,13309,GA,Wheeler County,9,2111,Q498332,Q1428,,7909,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
1,1065,13301,GA,Warren County,8,1511,Q491529,Q1428,287,5558,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
2,2328,13185,GA,Lowndes County,3,23936,Q493134,Q1428,1323,112916,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
3,2595,13029,GA,Bryan County,2,3456,Q486848,Q1428,1177,33157,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
4,2820,13177,GA,Lee County,3,3190,Q491508,Q1428,938,29071,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
5,939,13199,GA,Meriwether County,1,4417,Q501151,Q1428,1309,21232,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
6,1445,13009,GA,Baldwin County,4,10273,Q488206,Q1428,693,46039,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
7,548,13311,GA,White County,8,4030,Q389365,Q1428,242,27797,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
8,6,13059,GA,Clarke County,3,31950,Q112061,Q1428,314,121265,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
9,2856,13045,GA,Carroll County,1,16713,Q493088,Q1428,1305,112355,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22


In [11]:
"""
After the first search we have more columns (including the Q nodes), so we can search again to get more results.
Because the supplied data already contains Q nodes this time, the wikifier step is skipped (the wikidata search itself still runs on the existing Q-node columns).
"""
# Second search, this time on the augmented dataset aug2 (again with query=None).
search_res2 = a.search_with_data(query=None, supplied_data=aug2)

[INFO] Q nodes columns found in input data, will not run wikifier.


In [12]:
# Run get_next_page twice to fetch two consecutive pages.
# s3 is the page inspected in the following cells; s4 is not used in the cells shown here.
s3 = search_res2.get_next_page()
s4 = search_res2.get_next_page()

Wikidata Q nodes inputs detected! Will search with it.
Totally 2 Q nodes columns detected!


In [20]:
# Show the details of the search results on page s3 in one table.
# Wikidata results have no dynamic score.
# Each result's display() frame is collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the growing frame on every iteration.
result_frames = [each.display() for each in s3]
# pd.concat rejects an empty list, so keep the original empty-frame result in that case.
output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
# reset_index() (without drop=True) keeps the per-result index as an 'index' column
# and gives each row a unique positional label matching its position in s3.
output_df = output_df.reset_index()
output_df

Unnamed: 0,index,title,columns,join columns,score
0,0,wikidata search result for FIPS_wikidata,"population, area, inception",FIPS_wikidata,1.0
1,0,wikidata search result for State_wikidata,"population, nickname, motto text, demonym, nat...",State_wikidata,1.0
2,0,Unemployment.csv,FIPStxt State Area_name Rural_urban_continuum_...,FIPStxt_wikidata,0.220971
3,0,educate.csv,FIPS Code State Area name 2003 Rural-urban Con...,FIPS Code_wikidata,0.220971
4,0,poverty.csv,FIPStxt State Area_Name Rural-urban_Continuum_...,FIPStxt_wikidata,0.220971
5,0,population.csv,FIPS State Area_Name Rural-urban_Continuum Cod...,FIPS_wikidata,0.220971
6,0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.331456
7,0,poverty.csv,FIPStxt State Area_Name Rural-urban_Continuum_...,State_wikidata,0.883883
8,0,Unemployment.csv,FIPStxt State Area_name Rural_urban_continuum_...,State_wikidata,0.883883
9,0,educate.csv,FIPS Code State Area name 2003 Rural-urban Con...,State_wikidata,0.883883


In [14]:
# Run the download function on one search result (s3[4], the poverty.csv row joined on FIPStxt_wikidata in the table above).
# NOTE(review): supplied_data comes from search_res (the first search), not search_res2 — confirm this is intended.
download_res = s3[4].download(supplied_data=search_res.supplied_data)

Current column: FIPStxt
The best matching P node is P882
Current column: State
The best matching P node is P5086
Current column: Area_Name
The best matching P node is P2390
Current column: Rural-urban_Continuum_Code_2003
A columns with all numerical values and useless detected, skipped
Current column: Urban_Influence_Code_2003
A columns with all numerical values and useless detected, skipped
Current column: Rural-urban_Continuum_Code_2013
A columns with all numerical values and useless detected, skipped
Current column: Urban_Influence_Code_2013
A columns with all numerical values and useless detected, skipped
Current column: POVALL_2017
Current column: CI90LBAll_2017
Current column: CI90UBALL_2017
Current column: PCTPOVALL_2017
A columns with all numerical values and useless detected, skipped
Current column: CI90LBALLP_2017
A columns with all numerical values and useless detected, skipped
Current column: CI90UBALLP_2017
A columns with all numerical values and useless detected, skipped


In [15]:
# Run the augment function on one search result (s3[3], the educate.csv row joined on "FIPS Code_wikidata" in the table above).
# NOTE(review): supplied_data comes from search_res (the first search), not search_res2 — confirm this is intended.
aug_res = s3[3].augment(supplied_data=search_res.supplied_data)

Current column: FIPS Code
The best matching P node is P882
Current column: State
The best matching P node is P5086
Current column: Area name
The best matching P node is P2390
Current column: 2003 Rural-urban Continuum Code
A columns with all numerical values and useless detected, skipped
Current column: 2003 Urban Influence Code
A columns with all numerical values and useless detected, skipped
Current column: 2013 Rural-urban Continuum Code
A columns with all numerical values and useless detected, skipped
Current column: 2013 Urban Influence Code
A columns with all numerical values and useless detected, skipped
Current column: Less than a high school diploma, 1970
The best matching P node is P3060
Current column: High school diploma only, 1970
The best matching P node is P3060
Current column: Some college (1-3 years), 1970
The best matching P node is P3060
Current column: Four years of college or higher, 1970
The best matching P node is P3060
Current column: Percent of adults with less

In [16]:
# Preview the first rows of the 'augmentData' table returned by download().
download_res['augmentData'].head()

Unnamed: 0,FIPStxt,State,Area_Name,Rural-urban_Continuum_Code_2003,Urban_Influence_Code_2003,Rural-urban_Continuum_Code_2013,Urban_Influence_Code_2013,POVALL_2017,CI90LBAll_2017,CI90UBALL_2017,...,CI90UBINC_2017,POV04_2017,CI90LB04_2017,CI90UB04_2017,PCTPOV04_2017,CI90LB04P_2017,CI90UB04P_2017,FIPStxt_wikidata,State_wikidata,joining_pairs
0,0,US,United States,,,,,,,,...,60422,3932969.0,3880645.0,3985293.0,20.2,19.9,20.5,,,[]
1,1000,AL,Alabama,,,,,,,,...,48935,78986.0,75009.0,82963.0,27.7,26.3,29.1,,Q173,[]
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,,,,...,64565,,,,,,,Q156168,Q173,[]
3,1003,AL,Baldwin County,4.0,5.0,3.0,2.0,,,,...,60775,,,,,,,Q156163,Q173,[1898]
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,,,,...,35762,,,,,,,Q109437,Q173,[1107]


In [17]:
# Preview the first rows of the 'augmentData' table returned by augment().
aug_res['augmentData'].head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,2003 Rural-urban Continuum Code,2003 Urban Influence Code,...,"Percent of adults with less than a high school diploma, 1970","Percent of adults with less than a high school diploma, 1980","Percent of adults with less than a high school diploma, 1990","Percent of adults with less than a high school diploma, 2000","Percent of adults with less than a high school diploma, 2013-17","Some college (1-3 years), 1970","Some college (1-3 years), 1980","Some college or associate's degree, 1990","Some college or associate's degree, 2000","Some college or associate's degree, 2013-17"
0,1,13297,GA,Walton County,1,11385,Q498312,Q1428,1.0,1.0,...,74.7,62.0,42.1,26.5,13.2,493.0,1238.0,4028.0,9564.0,17334.0
1,2,13137,GA,Habersham County,6,6500,Q501096,Q1428,6.0,5.0,...,67.7,56.5,41.0,29.1,22.0,733.0,1354.0,2958.0,5006.0,7621.0
2,3,54017,WV,Doddridge County,9,1460,Q494081,Q1371,9.0,8.0,...,69.6,48.2,35.4,30.6,19.8,143.0,331.0,659.0,878.0,1385.0
3,4,55055,WI,Jefferson County,4,7618,Q500958,Q1537,4.0,3.0,...,48.0,32.3,23.0,15.3,8.6,2996.0,4961.0,9318.0,14135.0,19174.0
4,5,38065,ND,Oliver County,8,191,Q48933,Q1207,8.0,7.0,...,71.6,45.6,31.8,20.1,11.3,88.0,234.0,357.0,457.0,550.0


In [18]:
import pickle

# Serialize one search result to bytes; this pickled form is the one used for the primitive.
chosen_result = s3[4]
temp = pickle.dumps(chosen_result)
temp

b'\x80\x03cdatamart_isi.entries\nDatamartSearchResult\nq\x00)\x81q\x01}q\x02(X\r\x00\x00\x00search_resultq\x03}q\x04(X\x05\x00\x00\x00scoreq\x05}q\x06(X\x08\x00\x00\x00datatypeq\x07X\'\x00\x00\x00http://www.w3.org/2001/XMLSchema#doubleq\x08X\x04\x00\x00\x00typeq\tX\x07\x00\x00\x00literalq\nX\x05\x00\x00\x00valueq\x0bX\x13\x00\x00\x000.22097086912079575q\x0cuX\x04\x00\x00\x00rankq\r}q\x0e(h\x07X$\x00\x00\x00http://www.w3.org/2001/XMLSchema#intq\x0fh\tX\x07\x00\x00\x00literalq\x10h\x0bX\x01\x00\x00\x002q\x11uX\x08\x00\x00\x00variableq\x12}q\x13(h\tX\x03\x00\x00\x00uriq\x14h\x0bXV\x00\x00\x00http://www.wikidata.org/entity/statement/D1000006-d547afad-3309-46b4-990d-946df3efbabdq\x15uX\x07\x00\x00\x00datasetq\x16}q\x17(h\tX\x03\x00\x00\x00uriq\x18h\x0bX\'\x00\x00\x00http://www.wikidata.org/entity/D1000006q\x19uX\x03\x00\x00\x00urlq\x1a}q\x1b(h\tX\x03\x00\x00\x00uriq\x1ch\x0bXa\x00\x00\x00https://raw.githubusercontent.com/usc-isi-i2/datamart-userend/master/example_datasets/poverty.csvq\x1duX

In [19]:
# Show the search_type attribute of search result s3[3].
s3[3].search_type

'general'