In [1]:
# initialize
from datamart_isi.entries import Datamart, DatamartSearchResult
from d3m.container.dataset import Dataset, D3MDatasetLoader
from common_primitives.denormalize import Hyperparams as hyper_denormalize, DenormalizePrimitive
from d3m.base import utils as d3m_utils
import os
import pandas as pd

# Optional logging setup: this part is only for logging, so you can turn it on or off.
# getLogger() with no name returns the root logger, which is set to INFO here.
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# Connect to the ISI datamart. The default endpoint below is the current one and
# may change in the future; set the ISI_DATAMART_URL environment variable to use
# a different endpoint without editing this cell.
isi_datamart_url = os.environ.get(
    "ISI_DATAMART_URL",
    "http://dsbox02.isi.edu:9001/blazegraph/namespace/datamart3/sparql",
)
a = Datamart(connection_url=isi_datamart_url)

# Load the D3M dataset; "DA_poverty_estimation" is used as the example here.
# Set the D3M_DATASET_DOC environment variable to the datasetDoc.json of your own
# dataset. When it is unset, the notebook author's local path is used as a fallback.
loader = D3MDatasetLoader()
path = os.environ.get(
    "D3M_DATASET_DOC",
    "/Users/claire/Documents/ISI/datamart/datamart-userend/examples/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json",
)
json_file = os.path.abspath(path)
all_dataset_uri = 'file://{}'.format(json_file)
all_dataset = loader.load(dataset_uri=all_dataset_uri)

# Run the denormalize primitive with its default hyperparameters and keep the
# `.value` of the produced result as the dataset used by the rest of the notebook.
denormalize_hyperparams = hyper_denormalize.defaults()
denormalize_primitive = DenormalizePrimitive(hyperparams=denormalize_hyperparams)
all_dataset = denormalize_primitive.produce(inputs=all_dataset).value

In [3]:
# Preview the first rows of the denormalized learning data.
all_dataset["learningData"].head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016
0,1,13297,GA,Walton County,1,11385
1,2,13137,GA,Habersham County,6,6500
2,3,54017,WV,Doddridge County,9,1460
3,4,55055,WI,Jefferson County,4,7618
4,5,38065,ND,Oliver County,8,191


In [4]:
# Start the search by running the search-with-data function on the supplied dataset.
# Because the dataset does not have any columns with the "Text" semantic type,
# the system will report that no columns can be augmented.
search_res = a.search_with_data(
    query=None,
    supplied_data=all_dataset,
)

No columns can be augment with datamart!


In [5]:
# Run the wikifier first, so that the dataset gets the corresponding Q-node columns.
search_result_wikifier = DatamartSearchResult(
    search_result={},
    supplied_data=None,
    query_json={},
    search_type="wikifier",
)
wikifiered_result = search_result_wikifier.augment(
    supplied_data=all_dataset,
)

No metadata can provide for wikifier augment


Current column: FIPS
The best matching P node is P882
Current column: State
The best matching P node is P5086
Current column: Area
[ERROR] No candidate P nodes found for input column : [Area]
Current column: RUCCode
A columns with all numerical values and useless detected, skipped


In [6]:
# Preview the first rows of the learning data after the wikifier step.
wikifiered_result["learningData"].head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata
0,1,13297,GA,Walton County,1,11385,Q498312,Q1428
1,2,13137,GA,Habersham County,6,6500,Q501096,Q1428
2,3,54017,WV,Doddridge County,9,1460,Q494081,Q1371
3,4,55055,WI,Jefferson County,4,7618,Q500958,Q1537
4,5,38065,ND,Oliver County,8,191,Q48933,Q1207


In [7]:
# Search again, this time with the wikified data, to see what changes.
# Calling get_next_page() returns the real search results; the page will
# include 2 wikidata search results.
# Searching wikidata with a large number of Q nodes is slow: expect it to take
# about 3-5 minutes or more to finish.
search_res = a.search_with_data(
    query=None,
    supplied_data=wikifiered_result,
)
s1 = search_res.get_next_page()

In [8]:
# Show the details of every search result on the first page in a single table.
# Note: wikidata results have no dynamic score.
#
# The per-result frames are collected first and concatenated once.
# DataFrame.append re-copied the whole table on every iteration and was removed
# in pandas 2.0, so the original loop fails on current pandas.
result_frames = [each.display() for each in s1]
# pd.concat raises on an empty list, so keep the original "empty table" result
# when the page has no entries.
output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
output_df

Unnamed: 0,title,columns,join columns,score
0,wikidata search result for FIPS_wikidata,"population, area, inception",FIPS_wikidata,1.0
0,wikidata search result for State_wikidata,"population, nickname, motto text, demonym, nat...",State_wikidata,1.0
0,Unemployment.csv,"FIPStxt, State, Area_name, Rural_urban_continu...",FIPStxt_wikidata,0.705378
0,educate.csv,"FIPS Code, State, Area name, 2003 Rural-urban ...",FIPS Code_wikidata,0.705378
0,population.csv,"FIPS, State, Area_Name, Rural-urban_Continuum ...",FIPS_wikidata,0.705378
0,poverty.csv,"FIPStxt, State, Area_Name, Rural-urban_Continu...",FIPStxt_wikidata,0.705378
0,Most-Recent-Cohorts-Scorecard-Elements.csv,"UNITID, OPEID, OPEID6, INSTNM, CITY, STABBR, I...",STABBR_wikidata,0.883883
0,Unemployment.csv,"FIPStxt, State, Area_name, Rural_urban_continu...",State_wikidata,0.883883
0,educate.csv,"FIPS Code, State, Area name, 2003 Rural-urban ...",State_wikidata,0.883883
0,population.csv,"FIPS, State, Area_Name, Rural-urban_Continuum ...",State_wikidata,0.883883


In [9]:
# Optionally fetch the next page of search results.
s2 = search_res.get_next_page()

# Show the details of the search results (wikidata results have no dynamic score).
# The page is treated as absent when it is None, so guard with an identity check
# (`is not None`) rather than `!= None`.
result_frames = [each.display() for each in s2] if s2 is not None else []
# Concatenate once instead of calling DataFrame.append in a loop (removed in
# pandas 2.0); fall back to an empty table when there is nothing to show.
output_df = pd.concat(result_frames) if result_frames else pd.DataFrame()
# Kept at the top level on purpose: only a cell's last top-level expression is
# displayed, so the table was never shown while this line sat inside the `if`.
output_df

In [10]:
# Augment with the 2 wikidata search results, one after the other:
# the output of the first augmentation is the input of the second.
aug1 = s1[0].augment(
    supplied_data=search_res.supplied_data,
)
aug2 = s1[1].augment(
    supplied_data=aug1,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [11]:
# The augmentation added a number of new columns
# (21 columns, compared with the original 8 columns).
aug2["learningData"].head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,area_for_FIPS_wikidata,inception_for_FIPS_wikidata,...,area_for_State_wikidata,elevation above sea level_for_State_wikidata,inception_for_State_wikidata,maximum temperature record_for_State_wikidata,motto text_for_State_wikidata,native label_for_State_wikidata,nickname_for_State_wikidata,population_for_State_wikidata,short name_for_State_wikidata,water as percent of area_for_State_wikidata
0,1570,13309,GA,Wheeler County,9,2111,Q498332,Q1428,,1912-08-14T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
1,143,13165,GA,Jenkins County,6,2606,Q389551,Q1428,913.0,1905-08-17T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
2,3012,13245,GA,Richmond County,2,48929,Q498319,Q1428,851.0,1777-02-05T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
3,1045,13119,GA,Franklin County,8,4614,Q385931,Q1428,690.0,1784-02-25T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22
4,1349,13269,GA,Taylor County,8,2004,Q505299,Q1428,983.0,1852-01-15T00:00:00Z,...,153909,180,1788-01-02T00:00:00Z,112,"Wisdom, Justice, Moderation",State of Georgia,,10214860,GA,3.22


In [12]:
# Run the download function on a general datamart search result.
# It returns the materialized dataset with one extra "join hints" column
# (it appears as "joining_pairs" in the recorded output of this cell).
# This column suggests whether each row can be used to augment the original
# supplied data:
#   - if the row cannot be joined, the cell is an empty list;
#   - otherwise it is a list of numbers giving the row numbers of the supplied data.
download_res = s1[3].download(
    supplied_data=aug2,
)
download_res["learningData"].head()

d3m.primitives.data_preprocessing.ensemble_voting.DSBOX: Primitive's Python path does not adhere to d3m.primitives namespace specification. Reason: primitive family segment must match primitive's primitive family.
d3m.primitives.data_preprocessing.ensemble_voting.DSBOX: Primitive's Python path does not adhere to d3m.primitives namespace specification. Reason: must have a known primitive name segment.


Current column: FIPS Code
The best matching P node is P882
Current column: State
The best matching P node is P5086
Current column: Area name
The best matching P node is P2390
Current column: 2003 Rural-urban Continuum Code
A columns with all numerical values and useless detected, skipped
Current column: 2003 Urban Influence Code
A columns with all numerical values and useless detected, skipped
Current column: 2013 Rural-urban Continuum Code
A columns with all numerical values and useless detected, skipped
Current column: 2013 Urban Influence Code
A columns with all numerical values and useless detected, skipped
Current column: Less than a high school diploma, 1970
The best matching P node is P3060
Current column: High school diploma only, 1970
The best matching P node is P3060
Current column: Some college (1-3 years), 1970
The best matching P node is P3060
Current column: Four years of college or higher, 1970
The best matching P node is P3060
Current column: Percent of adults with less

Unnamed: 0,FIPS Code,State,Area name,2003 Rural-urban Continuum Code,2003 Urban Influence Code,2013 Rural-urban Continuum Code,2013 Urban Influence Code,"Less than a high school diploma, 1970","High school diploma only, 1970","Some college (1-3 years), 1970",...,"High school diploma only, 2013-17","Some college or associate's degree, 2013-17","Bachelor's degree or higher, 2013-17","Percent of adults with less than a high school diploma, 2013-17","Percent of adults with a high school diploma only, 2013-17","Percent of adults completing some college or associate's degree, 2013-17","Percent of adults with a bachelor's degree or higher, 2013-17",FIPS Code_wikidata,State_wikidata,joining_pairs
0,0,US,United States,,,,,52373312.0,34158051.0,11650730.0,...,59093612.0,62853315.0,66887603.0,12.7,27.3,29.1,30.9,,,[]
1,1000,AL,Alabama,,,,,1062306.0,468269.0,136287.0,...,1012551.0,979449.0,803578.0,14.7,30.9,29.9,24.5,,Q173,[]
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,6611.0,3757.0,933.0,...,12363.0,10697.0,9176.0,12.3,33.6,29.1,25.0,Q156168,Q173,[]
3,1003,AL,Baldwin County,4.0,5.0,3.0,2.0,18726.0,8426.0,2334.0,...,39771.0,45286.0,43968.0,9.8,27.8,31.7,30.7,Q156163,Q173,[921]
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,8120.0,2242.0,581.0,...,6549.0,4707.0,2218.0,26.9,35.5,25.5,12.0,Q109437,Q173,[947]


In [13]:
# Running augment works the same way as before; the final result is a
# dataframe with 55 columns.
aug3 = s1[3].augment(
    supplied_data=aug2,
)
aug3["learningData"].head()

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,State_wikidata,area_for_FIPS_wikidata,inception_for_FIPS_wikidata,...,"Percent of adults with less than a high school diploma, 1970","Percent of adults with less than a high school diploma, 1980","Percent of adults with less than a high school diploma, 1990","Percent of adults with less than a high school diploma, 2000","Percent of adults with less than a high school diploma, 2013-17","Some college (1-3 years), 1970","Some college (1-3 years), 1980","Some college or associate's degree, 1990","Some college or associate's degree, 2000","Some college or associate's degree, 2013-17"
1,1570,13309,GA,Wheeler County,9,2111,Q498332,Q1428,,1912-08-14T00:00:00Z,...,83.2,63.2,43.3,32.1,23.1,121.0,240.0,444.0,783.0,1033.0
2,143,13165,GA,Jenkins County,6,2606,Q389551,Q1428,913.0,1905-08-17T00:00:00Z,...,81.3,64.8,50.1,38.0,21.3,185.0,434.0,658.0,979.0,1635.0
3,3012,13245,GA,Richmond County,2,48929,Q498319,Q1428,851.0,1777-02-05T00:00:00Z,...,51.9,38.6,29.1,22.0,17.0,7753.0,14141.0,27079.0,36349.0,40700.0
4,1045,13119,GA,Franklin County,8,4614,Q385931,Q1428,690.0,1784-02-25T00:00:00Z,...,74.9,62.1,45.9,33.0,22.7,445.0,821.0,1754.0,2585.0,3871.0
5,1349,13269,GA,Taylor County,8,2004,Q505299,Q1428,983.0,1852-01-15T00:00:00Z,...,80.2,64.0,48.8,36.4,23.2,200.0,317.0,603.0,901.0,1690.0
