In [1]:
import sys, json, time, os
# Make the parent of the current working directory importable so that the
# `datamart` imports below can resolve (presumably the package lives one level
# above the notebook's folder -- confirm against the repository layout).
# The previous nested call appended the inner append()'s return value (None)
# to sys.path as well; a single guarded append avoids that and keeps sys.path
# free of duplicates when the cell is re-run.
_parent_dir = os.path.join(os.getcwd(), '..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from datamart.index_builder import IndexBuilder
from datamart.query_manager import QueryManager

Create the Elasticsearch (ES) index

In [2]:
# Name of the scratch index that the rest of this notebook writes to and searches.
es_index = "datamart_tmp"

# Query manager used for index management, document creation and search below.
qm = QueryManager()

# Start from a clean slate: if the index is already there (e.g. left over from
# a previous run of this notebook), delete it before creating it again.
# Note that delete_index is given a list of index names.
if qm.check_exists(index=es_index):
    qm.delete_index(index=[es_index])
qm.create_index(index=es_index)

In [3]:
# Relative paths used by this notebook:
#   tmp_description -- the JSON dataset description that gets indexed below;
#   tmp_out         -- presumably where generated metadata may be written
#                      (not used in the cells shown here -- confirm).
tmp_description, tmp_out = (
    "tmp/tmp.json",
    "tmp/tmp_metadata.out",
)

A sample dataset schema file from the NOAA data provider

In [4]:
# Pretty-print the dataset description file.
# The context manager closes the file handle as soon as the JSON is parsed,
# instead of leaving an open handle for the garbage collector to clean up.
with open(tmp_description) as description_file:
    print(json.dumps(json.load(description_file), indent=2))

{
  "title": "TAVG",
  "description": "Average temperature (tenths of degrees C)[Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]",
  "url": "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt",
  "keywords": [
    "Average Temperature."
  ],
  "provenance": "noaa.org",
  "materialization": {
    "python_path": "noaa_materializer",
    "arguments": {
      "type": "TAVG"
    }
  },
  "variables": [
    {
      "name": "date",
      "description": "the date of data",
      "semantic_type": [
        "https://metadata.datadrivendiscovery.org/types/Time"
      ],
      "temporal_coverage": {
        "start": "1994-03-19T00:00:00",
        "end": "1996-05-28T00:00:00"
      }
    },
    {
      "name": "stationId",
      "description": "the id of station which has this data",
      "semantic_type": [
        "https://metadata.datadrivendiscovery.org/types/CategoricalData"
      ]
    },
    {
      "name": "city",

Create metadata

The metadata is created by reading the JSON description file and profiling the original dataset.

The original dataset is fetched by a query so that it can be profiled. The current profiler only records named entities, and only for columns that contain them.

In [5]:
index_builder = IndexBuilder()

# Arguments for the indexing run: take the description from the JSON file,
# supply no local data file, and ask the builder to query the data for
# indexing instead.
indexing_options = {
    "description_path": tmp_description,
    "data_path": None,
    "query_data_for_indexing": True,
}
this_metadata = index_builder.indexing(**indexing_options)

A sample metadata for NOAA

In [6]:
# Render the generated metadata as indented JSON and show it.
metadata_json = json.dumps(this_metadata, indent=2)
print(metadata_json)

{
  "datamart_id": 0,
  "title": "TAVG",
  "description": "Average temperature (tenths of degrees C)[Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]",
  "url": "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt",
  "keywords": [
    "Average Temperature."
  ],
  "date_published": null,
  "date_updated": null,
  "provenance": "noaa.org",
  "original_identifier": null,
  "materialization": {
    "python_path": "noaa_materializer",
    "arguments": {
      "type": "TAVG"
    }
  },
  "variables": [
    {
      "datamart_id": 1,
      "name": "date",
      "description": "the date of data",
      "semantic_type": [
        "https://metadata.datadrivendiscovery.org/types/Time"
      ],
      "temporal_coverage": {
        "start": "1994-03-19T00:00:00",
        "end": "1996-05-28T00:00:00"
      }
    },
    {
      "datamart_id": 2,
      "name": "stationId",
      "description": "the id of station which has thi

In [7]:
# Store the generated metadata as a document in the notebook's index, using
# its datamart_id as the document id. The index name comes from the shared
# `es_index` variable (instead of a repeated string literal) so this cell
# always targets the same index that was created earlier and is searched below.
qm.create_doc(index=es_index, doc_type='document', body=this_metadata, id=this_metadata['datamart_id'])
# NOTE(review): the one-second pause presumably gives the index time to make
# the new document searchable before the query cells run -- confirm.
time.sleep(1)

In [8]:
# Build the search body from the inside out: a single mandatory `match`
# clause on the `variables.named_entity` field for the text "los angeles",
# wrapped in a bool/must query and serialised to a JSON string.
named_entity_clause = {"match": {"variables.named_entity": "los angeles"}}
bool_clause = {"must": [named_entity_clause]}
query_1 = json.dumps({"query": {"bool": bool_clause}})

In [9]:
# Run the named-entity query against the notebook's index and report how many
# results came back.
hitted_metadatas = qm.search(index=es_index, body=query_1)

hit_count = len(hitted_metadatas)
print(hit_count)

1


In [10]:
# For every search hit: dump its metadata as indented JSON, then fetch the
# dataset it describes and show the first ten rows.
for metadata in hitted_metadatas:
    print("====== HIT a metadata ======")
    # sep="\n" reproduces the spacing of two consecutive print() calls.
    print(json.dumps(metadata, indent=2), "\n\n", sep="\n")
    print("====== GET the dataset ======")
    df = qm.get_dataset(metadata=metadata)
    print(df.iloc[:10, :], "\n\n", sep="\n")

{
  "datamart_id": 0,
  "title": "TAVG",
  "description": "Average temperature (tenths of degrees C)[Note that TAVG from source 'S' corresponds to an average for the period ending at 2400 UTC rather than local midnight]",
  "url": "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt",
  "keywords": [
    "Average Temperature."
  ],
  "date_published": null,
  "date_updated": null,
  "provenance": "noaa.org",
  "original_identifier": null,
  "materialization": {
    "python_path": "noaa_materializer",
    "arguments": {
      "type": "TAVG"
    }
  },
  "variables": [
    {
      "datamart_id": 1,
      "name": "date",
      "description": "the date of data",
      "semantic_type": [
        "https://metadata.datadrivendiscovery.org/types/Time"
      ],
      "temporal_coverage": {
        "start": "1994-03-19T00:00:00",
        "end": "1996-05-28T00:00:00"
      }
    },
    {
      "datamart_id": 2,
      "name": "stationId",
      "description": "the id of station which has thi

                  date          stationid         city TAVG
0  2018-09-24T00:00:00  GHCND:USR0000CACT  los angeles  206
1  2018-09-24T00:00:00  GHCND:USR0000CBEV  los angeles  186
2  2018-09-24T00:00:00  GHCND:USR0000CCHB  los angeles  185
3  2018-09-24T00:00:00  GHCND:USR0000CCHI  los angeles  208
4  2018-09-24T00:00:00  GHCND:USR0000CCLE  los angeles  217
5  2018-09-24T00:00:00  GHCND:USR0000CCP9  los angeles  207
6  2018-09-24T00:00:00  GHCND:USR0000CLTU  los angeles  181
7  2018-09-24T00:00:00  GHCND:USR0000CMAL  los angeles  163
8  2018-09-24T00:00:00  GHCND:USR0000CMIL  los angeles  204
9  2018-09-24T00:00:00  GHCND:USR0000CSFD  los angeles  204



