In [1]:
import os
import datamart
import json

from datamart_isi import rest
from d3m.container.dataset import Dataset, D3MDatasetLoader
from common_primitives.datamart_augment import Hyperparams as hyper_augment, DataMartAugmentPrimitive
from common_primitives.datamart_download import Hyperparams as hyper_download, DataMartDownloadPrimitive

import logging
# Send DEBUG-and-above records from the root logger to a local file, in append mode.
LOG_FILENAME = 'mylog.log'
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Re-running this cell must not stack another FileHandler on the root logger:
# each duplicate would write every record once more and keep an extra file handle
# open. Reuse the handler already attached for this file when there is one.
_log_path = os.path.abspath(LOG_FILENAME)
fhandler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path
    ),
    None,
)
if fhandler is None:
    fhandler = logging.FileHandler(filename=LOG_FILENAME, mode='a')
    logger.addHandler(fhandler)
fhandler.setFormatter(formatter)

In [2]:
# Load the D3M dataset that will be augmented.
# The datasetDoc.json location can be overridden with the DATAMART_DEMO_DATASET_DOC
# environment variable so the notebook also runs outside the original author's
# machine; when the variable is unset or empty, the original path is used unchanged.
loader = D3MDatasetLoader()
path = os.environ.get("DATAMART_DEMO_DATASET_DOC") or "/Users/minazuki/Desktop/studies/master/2018Summer/data/datasets/seed_datasets_data_augmentation/DA_poverty_estimation/TRAIN/dataset_TRAIN/datasetDoc.json"
json_file = os.path.abspath(path)
# Fail early with an actionable message instead of an error from inside the loader.
if not os.path.isfile(json_file):
    raise FileNotFoundError(
        "datasetDoc.json not found at {!r}; set DATAMART_DEMO_DATASET_DOC to its location".format(json_file)
    )
all_dataset_uri = 'file://{}'.format(json_file)
input_dataset = loader.load(dataset_uri=all_dataset_uri)


In [3]:
# Preview the 'learningData' resource of the original dataset, before any augmentation.
input_dataset['learningData']

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016
0,1,13297,GA,Walton County,1,11385
1,2,13137,GA,Habersham County,6,6500
2,3,54017,WV,Doddridge County,9,1460
3,4,55055,WI,Jefferson County,4,7618
4,5,38065,ND,Oliver County,8,191
...,...,...,...,...,...,...
2033,3128,13171,GA,Lamar County,1,3142
2034,3129,40089,OK,McCurtain County,7,8299
2035,3130,30093,MT,Silver Bow County,5,5193
2036,3133,22047,LA,Iberville Parish,2,6615


In [4]:
# Wikifier hint: under a reserved marker key, map column names to the codes the
# wikifier should use for them (presumably Wikidata ids -- TODO confirm).
# In most TA2 settings this does not need to be specified by hand; it is meant for TA3 use.
query_example = {
    '%^$#@wikifier@%^$#': {
        'FIPS': 'P882',
        'State': 'Q35657',
    },
}
# The hint travels as one extra, JSON-encoded keyword.
meta_to_str = json.dumps(query_example)

# User-specified keywords; for some seed augmentation problems they can also be
# adapted from problemDoc.json.
keywords = [
    "USDA", "economic research service", "ERS",
    "county-level", "socioeconomic indicators", "poverty rate",
    "education", "population", "unemployment",
]

# Only build the query object here; the search itself is not started yet.
search_terms = [*keywords, meta_to_str]
query_search = datamart.DatamartQuery(keywords=search_terms, variables=None)

In [5]:
# Datamart REST endpoint. If no URL is given, the default datamart URL
# ("https://dsbox02.isi.edu:9000", the same value) is used.
datamart_url = "https://dsbox02.isi.edu:9000"
datamart_unit = rest.RESTDatamart(connection_url=datamart_url)

# The REST search currently supports four control parameters:
#
# 1. run_wikifier (bool, default True)
#    When true, the system looks for columns that can be wikified to obtain the
#    corresponding Q nodes in Wikidata and adds a new column for them. This Q node
#    column can then be used for further augmentation. Setting it to false makes
#    the search faster.
# 2. consider_wikifier_columns_only (bool, default False)
#    When true, only the Q node columns found by the wikifier are considered as
#    join columns.
# 3. augment_with_time (bool, default False)
#    When true, join pairs are generated automatically from two columns, as
#    (time_column, content_column), and candidates are returned only when both the
#    time and the content match. If the supplied data contains no time column, the
#    result is empty. Helpful when augmenting the LL1_PHEM dataset.
# 4. consider_time (bool, default True)
#    Similar to augment_with_time, but matches on the time ONLY, without requiring
#    an extra content column match. Has no effect when augment_with_time is true.
#    Helpful when augmenting the NY_TAXI dataset, which only has a time column.
search_controls = {
    "run_wikifier": True,
    "consider_wikifier_columns_only": True,
    "augment_with_time": False,
    "consider_time": False,
}

# Create the search unit from the query and the supplied dataset.
search_unit = datamart_unit.search_with_data(
    query=query_search,
    supplied_data=input_dataset,
    **search_controls,
)


In [6]:
# Start the search and fetch the next (here: first) page of results; this takes some time.
all_results1 = search_unit.get_next_page()

In [7]:
# Print brief information for each search result, or report that none came back.
if all_results1 is not None:
    rest.pretty_print_search_results(all_results1, to_std=True)
else:
    print("No search result returned!")

------------ Search result No.0 ------------
{'type': 'join', 'left_columns': [[[6]]], 'right_columns': [[[4]]]}
{'title': 'wikidata search result for FIPS_wikidata', 'Datamart ID': 'wikidata_search_on___P1082___P2046___P571___with_column_FIPS_wikidata', 'Score': '1', 'URL': 'None', 'Recommend Join Columns': 'FIPS_wikidata'}
----------------------------------------------------------------------------------------------------
------------ Search result No.1 ------------
{'type': 'join', 'left_columns': [[[6]]], 'right_columns': [[[4]]]}
{'title': 'vector search result for FIPS_wikidata', 'Datamart ID': 'vector_search_on_Q_nodes_with_column_FIPS_wikidata', 'Score': '1', 'URL': 'None', 'Recommend Join Columns': 'FIPS_wikidata', 'Number of Vectors': '2038'}
----------------------------------------------------------------------------------------------------
------------ Search result No.2 ------------
{'type': 'join', 'left_columns': [[[6]]], 'right_columns': [[[52]]]}
{'title': 'unemploymen

In [8]:
# Augmentation example 1: augment with Wikidata.
# Start from the primitive's default hyperparameters and plug in the serialized
# first search result (No.0 in the listing), then run the augment primitive on
# the original dataset.
augment_hyperparams = hyper_augment.defaults().replace(
    {
        "search_result": all_results1[0].serialize(),
        "system_identifier": "ISI",
    }
)
augment_primitive = DataMartAugmentPrimitive(hyperparams=augment_hyperparams)
augment_call = augment_primitive.produce(inputs=input_dataset)
augment_result = augment_call.value

In [9]:
# Inspect the 'learningData' table produced by augmentation example 1.
augment_result['learningData']

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,area_for_FIPS_wikidata,inception_for_FIPS_wikidata,population_for_FIPS_wikidata
0,1,13297,GA,Walton County,1,11385,Q498312,,1818-01-01T00:00:00Z,85754
1,2,13137,GA,Habersham County,6,6500,Q501096,723,1818-12-15T00:00:00Z,43300
2,13,5057,AR,Hempstead County,6,4483,Q61355,1920,1818-12-15T00:00:00Z,22474
3,146,34005,NJ,Burlington County,1,27083,Q138141,2122,1694-01-01T00:00:00Z,450838
4,1559,8109,CO,Saguache County,9,1757,Q312563,8206,1866-01-01T00:00:00Z,6208
...,...,...,...,...,...,...,...,...,...,...
2033,1550,37109,NC,Lincoln County,4,10179,Q507757,795,1778-01-01T00:00:00Z,79740
2034,1554,47149,TN,Rutherford County,1,31097,Q502348,1616,1803-10-25T00:00:00Z,281029
2035,1555,55081,WI,Monroe County,6,5384,Q932966,2352,1854-01-01T00:00:00Z,45298
2036,1556,53063,WA,Spokane County,2,64514,Q485276,4612,1858-01-01T00:00:00Z,479398


In [10]:
# Augmentation example 2: apply search result No.3 (index 3 of the result page).
# The input here is the output of example 1 rather than the original dataset,
# so this augmentation is applied on top of the previous one.
augment_hyperparams = hyper_augment.defaults().replace(
    {
        "search_result": all_results1[3].serialize(),
        "system_identifier": "ISI",
    }
)
augment_primitive = DataMartAugmentPrimitive(hyperparams=augment_hyperparams)
augment_call2 = augment_primitive.produce(inputs=augment_result)
augment_result2 = augment_call2.value

In [11]:
# Inspect the 'learningData' table after the second, chained augmentation.
augment_result2['learningData']

Unnamed: 0,d3mIndex,FIPS,State,Area,RUCCode,POVALL_2016,FIPS_wikidata,area_for_FIPS_wikidata,inception_for_FIPS_wikidata,population_for_FIPS_wikidata,...,POV04_2017,POV04_2017_wikidata,POV517_2017,POV517_2017_wikidata,POVALL_2017,Rural-urban_Continuum_Code_2003,Rural-urban_Continuum_Code_2013,State_wikidata,Urban_Influence_Code_2003,Urban_Influence_Code_2013
0,1,13297,GA,Walton County,1,11385,Q498312,,1818-01-01T00:00:00Z,85754,...,,Q814448,2616,Q33960590,11157,1.0,1.0,Q1428,1.0,1.0
1,2,13137,GA,Habersham County,6,6500,Q501096,723,1818-12-15T00:00:00Z,43300,...,,Q814448,1254,Q18971896,5585,6.0,6.0,Q1428,5.0,5.0
2,13,5057,AR,Hempstead County,6,4483,Q61355,1920,1818-12-15T00:00:00Z,22474,...,,Q814448,1309,Q18972773,5208,6.0,6.0,Q1612,5.0,6.0
3,146,34005,NJ,Burlington County,1,27083,Q138141,2122,1694-01-01T00:00:00Z,450838,...,,Q814448,5104,Q35696709,28202,1.0,1.0,Q1408,1.0,1.0
4,1559,8109,CO,Saguache County,9,1757,Q312563,8206,1866-01-01T00:00:00Z,6208,...,,Q814448,349,Q62253994,1608,9.0,9.0,Q1261,12.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2033,1550,37109,NC,Lincoln County,4,10179,Q507757,795,1778-01-01T00:00:00Z,79740,...,,Q814448,1887,Q62694558,10218,4.0,1.0,Q1454,3.0,1.0
2034,1554,47149,TN,Rutherford County,1,31097,Q502348,1616,1803-10-25T00:00:00Z,281029,...,,Q814448,6129,Q37236502,31372,1.0,1.0,Q1509,1.0,1.0
2035,1555,55081,WI,Monroe County,6,5384,Q932966,2352,1854-01-01T00:00:00Z,45298,...,,Q814448,1414,Q38736568,5093,6.0,6.0,Q1537,6.0,6.0
2036,1556,53063,WA,Spokane County,2,64514,Q485276,4612,1858-01-01T00:00:00Z,479398,...,,Q814448,12286,Q38752758,69334,2.0,2.0,Q1223,2.0,2.0
