In [1]:
import sys, os, json
sys.path.append(os.path.join(os.getcwd(), '..'))
from datamart.augment import Augment
from datamart.utilities.utils import Utils
import pandas as pd

In [2]:
# Index name handed to Augment through its `es_index` argument
# (presumably an Elasticsearch index -- TODO confirm against the Augment class).
es_index = "datamart_all"

# Shared Augment instance; the later cells call its `query` and `join` methods.
augment = Augment(es_index=es_index)

### Read a subset of the taxi dataset

In [3]:
# Load the taxi sample that the later cells try to augment, then dump it as plain text.
taxi_csv_path = "./example/taxi_example/taxi.csv"
old_df = pd.read_csv(taxi_csv_path)
print(old_df)

    d3mIndex tpep_pickup_datetime  num_pickups
0          0  2018-01-01 00:00:00           67
1          1  2018-01-01 01:00:00            8
2          2  2018-01-01 02:00:00            0
3          3  2018-01-01 03:00:00            0
4          4  2018-01-01 04:00:00            7
5          5  2018-01-01 05:00:00           10
6          6  2018-01-01 06:00:00            9
7          7  2018-01-01 07:00:00           28
8          8  2018-01-01 08:00:00          157
9          9  2018-01-01 09:00:00          259
10        10  2018-01-01 10:00:00          301
11        11  2018-01-01 11:00:00          436
12        12  2018-01-01 12:00:00          369
13        13  2018-01-01 13:00:00          347
14        14  2018-01-01 14:00:00          494
15        15  2018-01-01 15:00:00          544
16        16  2018-01-01 16:00:00          467
17        17  2018-01-01 17:00:00          690
18        18  2018-01-01 18:00:00          461
19        19  2018-01-01 19:00:00          465
20        20 

### Query
### Is there any dataset in Datamart that has the named_entity `new york` and is related to wind?
### In addition, it should cover the dates from 2018-01-01 to 2018-01-03.

In [4]:
# Key/value filters for the search: named entity "new york" and "wind" in the description.
wind_query_pairs = [
    ("variables.named_entity", "new york"),
    ("description", "wind"),
]

# Ask Augment for the metadata that matches the filters within the given
# temporal coverage window.
metadatas = augment.query(
    key_value_pairs=wind_query_pairs,
    temporal_coverage_start="2018-01-01",
    temporal_coverage_end="2018-01-03",
)

# Report how many hits came back.
hit_count = len(metadatas)
print(hit_count)


9


In [5]:
# Gather the datamart_id of every hit, in result order, and print them as one list.
datamart_ids = []
for hit in metadatas:
    datamart_ids.append(hit["_source"]["datamart_id"])
print(datamart_ids)

[125530000, 124480000, 125480000, 125450000, 125150000, 124530000, 124620000, 124600000, 125030000]


### Materialize them with their metadata. The constraints should come from the UI; there will be a UI for the user to form such constraints.

In [6]:
# For every query hit: materialize the dataset restricted to "new york" and the
# date window below, print it, and -- when rows came back -- try joining it onto old_df.
new_dfs = []
for hitted in metadatas:
    hit_source = hitted["_source"]

    # Index of the first variable with a truthy "named_entity" entry
    # (stays None when no variable has one).
    named_entity_column = next(
        (
            position
            for position, column_meta in enumerate(hit_source["variables"])
            if column_meta.get("named_entity", None)
        ),
        None,
    )

    # Constraints handed to Utils.get_dataset: entity filter keyed by column index,
    # plus the start/end of the date range.
    materialize_constrains = {
        "named_entity": {named_entity_column: ["new york"]},
        "date_range": {
            "start": "2018-01-01T00:00:00",
            "end": "2018-01-02T23:00:00",
        },
    }
    df = Utils.get_dataset(metadata=hit_source, constrains=materialize_constrains)
    new_dfs.append(df)

    print("========{}========".format(hit_source["description"]))
    print(df)

    # Only attempt the join when the materialized frame has at least one row.
    if len(df) > 0:
        print("\n - try to join with the queried one:")
        joined_df = augment.join(
            left_df=old_df,
            right_df=df,
            left_columns=[[1]],  # date column index of old_df
            right_columns=[[0]],  # date column index of new_df
            left_metadata=None,
            right_metadata=hit_source,
            joiner="rltk",
        )
        print(joined_df)
    print("\n\n")


Empty DataFrame
Columns: [date, stationid, city, WDMV]
Index: []





Empty DataFrame
Columns: [date, stationid, city, WSFG]
Index: []





                  date          stationid      city  WDF2
0  2018-01-01T00:00:00  GHCND:USW00014732  new york   320
1  2018-01-02T00:00:00  GHCND:USW00014732  new york   270

 - try to join with the queried one:


    d3mIndex tpep_pickup_datetime  num_pickups          stationid      city  \
0          0  2018-01-01 00:00:00           67  GHCND:USW00014732  new york   
1          1  2018-01-01 01:00:00            8  GHCND:USW00014732  new york   
2          2  2018-01-01 02:00:00            0  GHCND:USW00014732  new york   
3          3  2018-01-01 03:00:00            0  GHCND:USW00014732  new york   
4          4  2018-01-01 04:00:00            7  GHCND:USW00014732  new york   
5          5  2018-01-01 05:00:00           10  GHCND:USW00014732  new york   
6          6  2018-01-01 06:00:00            9  GHCND:USW00014732  new york   
7          7  2018-01-01 07:00:00           28  GHCND:USW00014732  new york   
8          8  2018-01-01 08:00:00          157  GHCND:USW00014732  new york   
9          9  2018-01-01 09:00:00          259  GHCND:USW00014732  new york   
10        10  2018-01-01 10:00:00          301  GHCND:USW00014732  new york   
11        11  2018-01-01 11:00:00          436  GHCN

                  date          stationid      city  WSF2
0  2018-01-01T00:00:00  GHCND:USW00014732  new york   112
1  2018-01-02T00:00:00  GHCND:USW00014732  new york    98

 - try to join with the queried one:
    d3mIndex tpep_pickup_datetime  num_pickups          stationid      city  \
0          0  2018-01-01 00:00:00           67  GHCND:USW00014732  new york   
1          1  2018-01-01 01:00:00            8  GHCND:USW00014732  new york   
2          2  2018-01-01 02:00:00            0  GHCND:USW00014732  new york   
3          3  2018-01-01 03:00:00            0  GHCND:USW00014732  new york   
4          4  2018-01-01 04:00:00            7  GHCND:USW00014732  new york   
5          5  2018-01-01 05:00:00           10  GHCND:USW00014732  new york   
6          6  2018-01-01 06:00:00            9  GHCND:USW00014732  new york   
7          7  2018-01-01 07:00:00           28  GHCND:USW00014732  new york   
8          8  2018-01-01 08:00:00          157  GHCND:USW00014732  new york  

                  date          stationid      city  AWND
0  2018-01-01T00:00:00  GHCND:USW00014732  new york    76
1  2018-01-02T00:00:00  GHCND:USW00014732  new york    62

 - try to join with the queried one:
    d3mIndex tpep_pickup_datetime  num_pickups          stationid      city  \
0          0  2018-01-01 00:00:00           67  GHCND:USW00014732  new york   
1          1  2018-01-01 01:00:00            8  GHCND:USW00014732  new york   
2          2  2018-01-01 02:00:00            0  GHCND:USW00014732  new york   
3          3  2018-01-01 03:00:00            0  GHCND:USW00014732  new york   
4          4  2018-01-01 04:00:00            7  GHCND:USW00014732  new york   
5          5  2018-01-01 05:00:00           10  GHCND:USW00014732  new york   
6          6  2018-01-01 06:00:00            9  GHCND:USW00014732  new york   
7          7  2018-01-01 07:00:00           28  GHCND:USW00014732  new york   
8          8  2018-01-01 08:00:00          157  GHCND:USW00014732  new york  

Empty DataFrame
Columns: [date, stationid, city, WDFG]
Index: []





                  date          stationid      city  WDF5
0  2018-01-01T00:00:00  GHCND:USW00014732  new york   300
1  2018-01-02T00:00:00  GHCND:USW00014732  new york   270

 - try to join with the queried one:
    d3mIndex tpep_pickup_datetime  num_pickups          stationid      city  \
0          0  2018-01-01 00:00:00           67  GHCND:USW00014732  new york   
1          1  2018-01-01 01:00:00            8  GHCND:USW00014732  new york   
2          2  2018-01-01 02:00:00            0  GHCND:USW00014732  new york   
3          3  2018-01-01 03:00:00            0  GHCND:USW00014732  new york   
4          4  2018-01-01 04:00:00            7  GHCND:USW00014732  new york   
5          5  2018-01-01 05:00:00           10  GHCND:USW00014732  new york   
6          6  2018-01-01 06:00:00            9  GHCND:USW00014732  new york   
7          7  2018-01-01 07:00:00           28  GHCND:USW00014732  new york   
8          8  2018-01-01 08:00:00          157  GHCND:USW00014732  new york  

                  date          stationid      city  WSF5
0  2018-01-01T00:00:00  GHCND:USW00014732  new york   143
1  2018-01-02T00:00:00  GHCND:USW00014732  new york   139

 - try to join with the queried one:
    d3mIndex tpep_pickup_datetime  num_pickups          stationid      city  \
0          0  2018-01-01 00:00:00           67  GHCND:USW00014732  new york   
1          1  2018-01-01 01:00:00            8  GHCND:USW00014732  new york   
2          2  2018-01-01 02:00:00            0  GHCND:USW00014732  new york   
3          3  2018-01-01 03:00:00            0  GHCND:USW00014732  new york   
4          4  2018-01-01 04:00:00            7  GHCND:USW00014732  new york   
5          5  2018-01-01 05:00:00           10  GHCND:USW00014732  new york   
6          6  2018-01-01 06:00:00            9  GHCND:USW00014732  new york   
7          7  2018-01-01 07:00:00           28  GHCND:USW00014732  new york   
8          8  2018-01-01 08:00:00          157  GHCND:USW00014732  new york  

Empty DataFrame
Columns: [date, stationid, city, WSFI]
Index: []





### We get some datasets related to `wind` from NOAA. How do we join them with the old dataframe?
### ISI is working on several join methods.
### The first version of the rltk joiner works as shown above.