In [1]:
# initialize
from datamart.entries import Datamart
from d3m.container.dataset import Dataset, D3MDatasetLoader
from common_primitives.denormalize import Hyperparams as hyper_denormalize, DenormalizePrimitive
from d3m.base import utils as d3m_utils
import os
import pandas as pd

In [2]:
# Connect to the ISI datamart. This is the current endpoint URL; it may change in the future.
isi_datamart_url = "http://dsbox02.isi.edu:9999/blazegraph/namespace/datamart3/sparql"
a = Datamart(connection_url=isi_datamart_url)

# Load the D3M dataset; "DA_poverty_estimation" is used as the example here.
# Point the D3M_DATASET_DOC environment variable at the datasetDoc.json of your
# own dataset copy. When the variable is unset, the original author's local
# path is used as the fallback, so the previous behaviour is unchanged.
loader = D3MDatasetLoader()
path = os.environ.get(
    "D3M_DATASET_DOC",
    "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json",
)
json_file = os.path.abspath(path)
if not os.path.isfile(json_file):
    # Fail early with an actionable message instead of an error from inside the loader.
    raise FileNotFoundError(
        "datasetDoc.json not found at {!r}; set D3M_DATASET_DOC to your dataset path".format(json_file)
    )
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)

# Run the denormalize primitive (default hyperparameters) on the loaded dataset
# and keep the produced value as the dataset used by the rest of the notebook.
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs=all_dataset).value


In [3]:
# Preview the first rows of the 'learningData' resource of the loaded dataset.
all_dataset['learningData'].head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016
0,1,13297,GA,Walton County,1,11385
1,2,13137,GA,Habersham County,6,6500
2,3,54017,WV,Doddridge County,9,1460
3,4,55055,WI,Jefferson County,4,7618
4,5,38065,ND,Oliver County,8,191


In [4]:
# Start the search by running search_with_data on the loaded dataset.
# The dataset does not have any columns with the "Text" semantic type,
# so the system will report that no columns can be augmented.
search_res = a.search_with_data(
    query=None,
    supplied_data=all_dataset,
)

No columns can be augment with datamart!


[INFO] No Q nodes columns found in input data, will run wikifier.


In [5]:
# Fetch the next page to get the real search results; it will only contain
# 2 wikidata search results.
# Explanation:
#   No column with the "Qnodes" semantic type is found, so the wikifier is run
#   before searching the wikidata database. It generates 2 Q node columns,
#   one for FIPS and one for State, which can then be used to search wikidata.
#   Searching wikidata with a large number of Q nodes is slow: expect this
#   call to take about 3 minutes or more.
s1 = search_res.get_next_page()

d3m.primitives.data_cleaning.cleaning_featurizer.DSBOX: Primitive is not providing a description through its docstring.
d3m.primitives.data_cleaning.column_fold.DSBOX: Primitive is not providing a description through its docstring.


warn from part 4 d3m.primitives.data_cleaning.labeler.DSBOX
{1, 2, 3, 4}
Current column: FIPS
The best matching P node is P5736
Current column: State
The best matching P node is P5086
Current column: Area
[ERROR] No candidate P nodes found for input column : [Area]
Current column: RUCCode
A columns with all numerical values and useless detected, skipped
Wikidata Q nodes inputs detected! Will search with it.
Totally 2 Q nodes columns detected!


In [6]:
# Because no more search results are found, s2 will be None.
s2 = search_res.get_next_page()

In [7]:
# Show the details of the search results in s1.
# Wikidata results have no dynamic score.
# Each result's display() frame is collected first and concatenated once:
# DataFrame.append is deprecated (removed in pandas 2.0), and appending inside
# a loop re-copies the accumulated frame on every iteration.
# `s1 or []` keeps the cell working when a page is None (no results).
result_frames = [each.display() for each in (s1 or [])]
output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
output_df

Unnamed: 0,title,columns,join columns,score
0,wikidata search result for FIPS_wikidata,"orbital eccentricity, absolute magnitude, orbi...",FIPS_wikidata,1
0,wikidata search result for State_wikidata,"population, nickname, motto text, demonym, nat...",State_wikidata,1


In [8]:
# Augment with these 2 wikidata search results, one after the other:
# the first augmentation runs on the search's supplied data, and its output
# (aug1) is passed as the supplied data of the second one.
aug1 = s1[0].augment(supplied_data=search_res.supplied_data)
aug2 = s1[1].augment(supplied_data=aug1)

In [9]:
# The returned aug2 is a d3m.container.dataset, so extract its tabular resource
# as a dataframe and preview the first rows.
# NOTE(review): resource_id=None presumably lets d3m choose the tabular
# resource itself - confirm against the d3m documentation.
res_id, aug2_dataframe = d3m_utils.get_tabular_resource(dataset=aug2, resource_id=None)
aug2_dataframe.head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,absolute magnitude,apoapsis,...,semi-major axis,area,elevation above sea level,inception,motto text,native label,nickname,population,short name,water as percent of area
0,143,13165,GA,Jenkins County,6,2606,Q4541201,Q1428,14.0,2.656,...,2.2341322,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
1,1046,13257,GA,Stephens County,7,5212,Q1480402,Q1428,13.7,3.003,...,2.6870308,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
2,713,13069,GA,Coffee County,7,9153,Q259449,Q1428,14.2,2.961,...,2.3721852,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
3,3038,13225,GA,Peach County,6,5522,Q2624194,Q1428,13.8,3.109,...,2.9185144,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
4,6,13059,GA,Clarke County,3,31950,Q1084177,Q1428,13.5,2.833,...,2.5879621,153909,180,1788-01-02T00:00:00Z,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22


In [10]:
# After the first search we have more columns (including the Q nodes),
# so we can search again to get more results.
# Because the supplied data already contains Q nodes this time, the
# wikifier step is skipped.
search_res2 = a.search_with_data(
    query=None,
    supplied_data=aug2,
)

[INFO] Q nodes columns found in input data, will not run wikifier.


In [11]:
# Call get_next_page twice on the second search; s3 and s4 hold the two
# returned pages, in call order.
s3 = search_res2.get_next_page()
s4 = search_res2.get_next_page()

Wikidata Q nodes inputs detected! Will search with it.
Totally 2 Q nodes columns detected!


In [20]:
# Show the details of the search results in s3.
# Wikidata results have no dynamic score.
# Each result's display() frame is collected first and concatenated once:
# DataFrame.append is deprecated (removed in pandas 2.0), and appending inside
# a loop re-copies the accumulated frame on every iteration.
# `s3 or []` keeps the cell working when a page is None (no results).
result_frames = [each.display() for each in (s3 or [])]
output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
# reset_index() keeps each result's original index as an "index" column and
# renumbers the rows, so a result can be referred to by its row position.
output_df = output_df.reset_index()
# End the cell on the frame itself; an assignment alone renders no table.
output_df

Unnamed: 0,index,title,columns,join columns,score
0,0,wikidata search result for FIPS_wikidata,"orbital eccentricity, absolute magnitude, orbi...",FIPS_wikidata,1.0
1,0,wikidata search result for State_wikidata,"population, nickname, motto text, demonym, nat...",State_wikidata,1.0
2,0,Most-Recent-Cohorts-Scorecard-Elements.csv,UNITID OPEID OPEID6 INSTNM CITY STABBR INSTURL...,OPEID6_wikidata,0.010772
3,0,Unemployment.csv,FIPStxt State Area_name Rural_urban_continuum_...,FIPStxt_wikidata,0.220971
4,0,population.csv,FIPS State Area_Name Rural-urban_Continuum Cod...,FIPS_wikidata,0.220971
5,0,poverty.csv,FIPStxt State Area_Name Rural-urban_Continuum_...,FIPStxt_wikidata,0.220971
6,0,educate.csv,FIPS Code State Area name 2003 Rural-urban Con...,FIPS Code_wikidata,0.220971
7,0,List_of_United_States_counties_by_per_capita_i...,Rank County-area State Per capita income Media...,State_wikidata,0.883883
8,0,Most-Recent-Cohorts-Scorecard-Elements.csv,UNITID OPEID OPEID6 INSTNM CITY STABBR INSTURL...,STABBR_wikidata,0.883883
9,0,Unemployment.csv,FIPStxt State Area_name Rural_urban_continuum_...,State_wikidata,0.883883


In [21]:
# Run the download function on one search result (index 4 of s3).
# NOTE(review): s3 comes from search_res2, but the supplied data passed here is
# search_res.supplied_data from the first search - confirm this is intended.
download_res = s3[4].download(supplied_data=search_res.supplied_data)

Current column: FIPS
The best matching P node is P5736
Current column: State
The best matching P node is P5086
Current column: Area_Name
The best matching P node is P3984
Current column: Rural-urban_Continuum Code_2003
A columns with all numerical values and useless detected, skipped
Current column: Rural-urban_Continuum Code_2013
A columns with all numerical values and useless detected, skipped
Current column: Urban_Influence_Code_2003
A columns with all numerical values and useless detected, skipped
Current column: Urban_Influence_Code_2013
A columns with all numerical values and useless detected, skipped
Current column: Economic_typology_2015
A columns with all numerical values and useless detected, skipped
Current column: CENSUS_2010_POP
The best matching P node is P698
Current column: ESTIMATES_BASE_2010
The best matching P node is P698
Current column: POP_ESTIMATE_2010
The best matching P node is P698
Current column: POP_ESTIMATE_2011
The best matching P node is P698
Current colu

The best matching P node is P3060
Current column: NET_MIG_2010
The best matching P node is P3060
Current column: NET_MIG_2011
The best matching P node is P3060
Current column: NET_MIG_2012
The best matching P node is P3060
Current column: NET_MIG_2013
The best matching P node is P3060
Current column: NET_MIG_2014
The best matching P node is P3060
Current column: NET_MIG_2015
The best matching P node is P3060
Current column: NET_MIG_2016
The best matching P node is P3060
Current column: NET_MIG_2017
The best matching P node is P3060
Current column: RESIDUAL_2010
The best matching P node is P3060
Current column: RESIDUAL_2011
The best matching P node is P3060
Current column: RESIDUAL_2012
The best matching P node is P3060
Current column: RESIDUAL_2013
The best matching P node is P3060
Current column: RESIDUAL_2014
The best matching P node is P3060
Current column: RESIDUAL_2015
The best matching P node is P3060
Current column: RESIDUAL_2016
The best matching P node is P3743
Current column

The best matching P node is P305
Current column: R_NET_MIG_2012
The best matching P node is P305
Current column: R_NET_MIG_2013
The best matching P node is P305
Current column: R_NET_MIG_2014
The best matching P node is P305
Current column: R_NET_MIG_2015
The best matching P node is P305
Current column: R_NET_MIG_2016
The best matching P node is P305
Current column: R_NET_MIG_2017
The best matching P node is P305
 - start getting pairs for ([6], [133])


AttributeError: 'Dataset' object has no attribute 'head'

In [27]:
# Run the augment function on one search result (index 3 of s3).
# NOTE(review): s3 comes from search_res2, but the supplied data passed here is
# search_res.supplied_data from the first search - confirm this is intended.
# NOTE(review): execution counts are out of order around this cell (In [27]
# appears before In [25]); re-run with Restart & Run All before sharing.
aug_res = s3[3].augment(supplied_data=search_res.supplied_data)

[INFO] Find downloaded data from previous time, will use that.
 - start getting pairs for ([6], [52])


In [25]:
# Display the 'augmentData' resource of the downloaded search result.
download_res['augmentData']

Unnamed: 0,FIPS,State,Area_Name,Rural-urban_Continuum Code_2003,Rural-urban_Continuum Code_2013,Urban_Influence_Code_2003,Urban_Influence_Code_2013,Economic_typology_2015,CENSUS_2010_POP,ESTIMATES_BASE_2010,...,ESTIMATES_BASE_2010_wikidata,POP_ESTIMATE_2010_wikidata,POP_ESTIMATE_2011_wikidata,POP_ESTIMATE_2012_wikidata,POP_ESTIMATE_2013_wikidata,POP_ESTIMATE_2014_wikidata,POP_ESTIMATE_2015_wikidata,POP_ESTIMATE_2016_wikidata,POP_ESTIMATE_2017_wikidata,joining_pairs
0,0,US,United States,,,,,,308745538,308758105,...,,,,,,,,,,[]
1,1000,AL,Alabama,,,,,,4779736,4780135,...,Q53946605,,Q47905025,,Q54548208,,Q36532960,Q39270477,Q52465184,[]
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,0.0,54571,54571,...,Q39362039,,,Q39364001,Q39362736,Q39363661,,Q39365771,Q39367019,[]
3,1003,AL,Baldwin County,4.0,3.0,5.0,2.0,5.0,182265,182265,...,Q40006304,Q40010225,Q40027597,Q40048846,,Q40103471,Q40125383,Q40151240,Q40179752,[1898]
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,3.0,27457,27457,...,Q34445080,Q39200018,Q39200101,Q39199189,Q39198004,Q39196969,Q39194280,Q39191480,Q39188661,[1107]
5,1007,AL,Bibb County,1.0,1.0,1.0,1.0,0.0,22915,22919,...,Q39177246,Q39177034,Q39176409,Q39176030,Q39175348,Q39175474,,Q39175915,Q39176067,[1464]
6,1009,AL,Blount County,1.0,1.0,1.0,1.0,0.0,57322,57324,...,Q39376858,Q39377211,Q39378257,Q39378471,Q35779937,,Q39378444,Q39378257,,[]
7,1011,AL,Bullock County,6.0,6.0,6.0,6.0,3.0,10914,10911,...,Q39120872,Q39120730,Q39119681,Q33421350,Q39119056,Q39119674,Q39118369,Q33870360,Q36198012,[]
8,1013,AL,Butler County,6.0,6.0,6.0,6.0,0.0,20947,20946,...,Q39167865,Q28316820,Q39167561,Q39166689,Q39164954,Q28316797,Q39163770,Q39162874,Q39162182,[1303]
9,1015,AL,Calhoun County,3.0,3.0,2.0,2.0,4.0,118572,118586,...,Q39706733,Q33987987,Q39703204,Q39700768,Q39697262,Q39694778,Q39692919,Q39690677,Q39689625,[441]


In [28]:
# Display the 'augmentData' resource of the augmented result.
aug_res['augmentData']

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,Civilian_labor_force_2007,Civilian_labor_force_2008,...,Unemployment_rate_2009,Unemployment_rate_2010,Unemployment_rate_2011,Unemployment_rate_2012,Unemployment_rate_2013,Unemployment_rate_2014,Unemployment_rate_2015,Unemployment_rate_2016,Unemployment_rate_2017,Urban_influence_code_2013
0,1,13297,GA,Walton County,1,11385,Q19625215,Q1428,41084,42256,...,10.4,10.6,10.4,9.1,7.8,6.4,5.0,4.9,4.3,1.0
1,2,13137,GA,Habersham County,6,6500,Q4541198,Q1428,20218,20854,...,10.0,10.7,10.5,9.5,8.0,6.9,6.0,5.3,4.6,5.0
2,3,54017,WV,Doddridge County,9,1460,Q15166045,Q1371,2823,2816,...,7.8,7.1,7.1,6.3,5.3,5.0,6.0,5.2,4.2,8.0
3,4,55055,WI,Jefferson County,4,7618,Q15167186,Q1537,42244,42410,...,9.2,8.7,7.6,6.7,6.3,5.0,4.0,3.9,3.2,3.0
4,5,38065,ND,Oliver County,8,191,Q7471453,Q1207,1131,1148,...,4.0,5.7,6.2,6.7,6.1,6.3,6.0,5.4,4.2,2.0
5,6,13059,GA,Clarke County,3,31950,Q1084177,Q1428,64398,66162,...,7.6,9.3,9.3,8.7,7.8,6.9,6.0,5.5,4.8,2.0
6,8,21109,KY,Jackson County,9,4213,Q147626,Q1603,4340,4103,...,16.4,14.1,13.6,11.8,11.6,11.5,9.0,7.7,8.3,10.0
7,9,54003,WV,Berkeley County,3,14461,Q15166029,Q1371,45993,46459,...,8.5,8.6,7.7,6.7,5.6,5.5,5.0,4.0,3.7,2.0
8,10,22079,LA,Rapides Parish,3,25575,Q2613686,Q1588,59862,62250,...,6.4,7.6,7.7,7.0,6.9,6.7,7.0,6.1,5.6,2.0
9,11,20037,KS,Crawford County,4,8318,Q582597,Q1558,20216,19905,...,8.5,8.7,7.9,6.6,5.9,5.4,5.0,4.8,4.2,5.0
