In [1]:
from aips import get_engine, set_engine
from aips.spark.dataframe import from_sql
from aips.spark import create_view_from_collection
import tqdm

# Select the OpenSearch engine and keep a module-level handle to it;
# the cells below reach collections through this `engine` global.
set_engine("opensearch")
engine = get_engine()

In [2]:
#%run chapters/ch04/1.setting-up-the-retrotech-dataset.ipynb

In [3]:
def execute_search(collection, signal, log=False):
    """Search `collection` for the signal's target text, tagging the request with the signal.

    Args:
        collection: Object exposing a `search(**kwargs)` method.
        signal: Mapping with at least a "target" key. Its "target" value is
            used as the query text and the whole mapping is passed along
            under `ext["ubi"]`.
        log: Forwarded unchanged to `collection.search`.

    Returns:
        Whatever `collection.search` returns.

    Raises:
        KeyError: If `signal` has no "target" entry.
    """
    searched_fields = ["name", "manufacturer",
                       "long_description", "short_description"]
    return collection.search(
        query=signal["target"],
        query_fields=searched_fields,
        return_fields=["*"],
        limit=10,
        ext={"ubi": signal},
        log=log)

In [4]:
def get_events_dataframe():
    """Return every row of the "signals" collection whose type is not 'query'.

    The collection is first exposed under the name "signals" through
    `create_view_from_collection` so that it can be selected from with SQL.

    Returns:
        The result of `from_sql` for the non-query signals.
    """
    create_view_from_collection(engine.get_collection("signals"), "signals")
    return from_sql("SELECT * FROM signals WHERE type != 'query'")

In [5]:
# Rebuild the "ubi_events" collection from the non-query signals.
# Per the cell output, create_collection wipes an existing "ubi_events"
# collection before recreating it, so re-running this cell starts clean.
events_collection = engine.create_collection("ubi_events")
ubi_events_dataframe = get_events_dataframe()
# Rows are written exactly as selected from "signals" (no column renaming here).
events_collection.write(ubi_events_dataframe)

Wiping "ubi_events" collection
Creating "ubi_events" collection
Successfully written 1447146 documents


In [7]:
def get_queries_dataframe():
    """Return the 'query' signals with their columns renamed for UBI.

    Each row of "signals" with type 'query' is reduced to four values,
    renamed as follows:
        signal_time -> timestamp
        query_id    -> query_id
        user        -> client_id
        target      -> target

    Returns:
        A dataframe with columns timestamp, query_id, client_id, target.
    """
    def to_ubi_tuple(row):
        # Positional order must line up with `ubi_columns` below.
        return (row["signal_time"], row["query_id"], row["user"], row["target"])

    create_view_from_collection(engine.get_collection("signals"), "signals")
    query_signals = from_sql("SELECT * FROM signals WHERE type = 'query'")
    ubi_columns = ["timestamp", "query_id", "client_id", "target"]
    return query_signals.rdd.map(to_ubi_tuple).toDF(ubi_columns)

# Replay every historical query against "products", passing the query row
# along as UBI metadata (see execute_search).
products_collection = engine.get_collection("products")
ubi_queries_dataframe = get_queries_dataframe()

# Materialize the rows once. Calling collect() and count() separately runs two
# Spark actions over the same RDD-mapped dataframe; tqdm takes its total from
# the length of the collected list instead.
query_rows = ubi_queries_dataframe.collect()

# NOTE: log=True prints the full search request for every query replayed.
for q in tqdm.tqdm(query_rows):
    execute_search(products_collection, q.asDict(), log=True)

  0%|          | 0/725459 [00:00<?, ?it/s]

Search Request:
{
  "query": {
    "query_string": {
      "query": "picture frame",
      "boost": 0.454545454,
      "fields": [
        "name",
        "manufacturer",
        "long_description",
        "short_description"
      ]
    }
  },
  "size": 10,
  "fields": [
    "*"
  ],
  "ext": {
    "ubi": {
      "timestamp": 1588829699599,
      "query_id": "u363983_2_4",
      "client_id": "u363983",
      "target": "picture frame"
    }
  }
}





ValueError: {'error': {'root_cause': [{'type': 'named_object_not_found_exception', 'reason': '[1:200] unknown field [ubi]'}], 'type': 'named_object_not_found_exception', 'reason': '[1:200] unknown field [ubi]'}, 'status': 400}

In [None]:
def write_events_to_collection(collection):
    """Copy the rows of "ubi_events" into `collection` using signal column names.

    For every row, timestamp, query_id, client_id and target are read and
    written out as signal_time, query_id, user and target, together with a
    constant type of "query".

    NOTE(review): every row read from the *events* collection is labelled
    "query" here, while the rows written to "ubi_events" earlier in this
    notebook were selected with type != 'query'. Confirm whether the
    original event type should be carried over instead.

    Args:
        collection: Destination exposing a `write(dataframe)` method.
    """
    def to_signal_tuple(row):
        # Positional order must line up with `signal_columns` below.
        return (row["timestamp"], row["query_id"], row["client_id"],
                row["target"], "query")

    create_view_from_collection(engine.get_collection("ubi_events"), "ubi_events")
    ubi_events = from_sql("SELECT * FROM ubi_events")
    signal_columns = ["signal_time", "query_id", "user", "target", "type"]
    collection.write(ubi_events.rdd.map(to_signal_tuple).toDF(signal_columns))

def write_queries_to_collection(collection):
    """Copy the rows of "ubi_queries" into `collection` using signal column names.

    Column mapping applied to every row:
        timestamp    -> signal_time
        query_id     -> query_id
        client_id    -> user
        message      -> target
        message_type -> type

    Assumes "ubi_queries" rows expose all five source fields; this notebook
    does not show that collection's schema -- TODO confirm.

    Args:
        collection: Destination exposing a `write(dataframe)` method.
    """
    def to_signal_tuple(row):
        # Positional order must line up with `signal_columns` below.
        return (row["timestamp"], row["query_id"], row["client_id"],
                row["message"], row["message_type"])

    create_view_from_collection(engine.get_collection("ubi_queries"), "ubi_queries")
    ubi_queries = from_sql("SELECT * FROM ubi_queries")
    signal_columns = ["signal_time", "query_id", "user", "target", "type"]
    collection.write(ubi_queries.rdd.map(to_signal_tuple).toDF(signal_columns))

# Rebuild "signals" from the UBI collections.
# NOTE(review): create_collection printed 'Wiping "ubi_events" collection'
# when used earlier in this notebook, so this presumably replaces any existing
# "signals" collection -- the same collection the earlier cells read from.
# Make sure "ubi_events" and "ubi_queries" are fully populated before running;
# the query replay cell above failed in the captured output.
signals_collection = engine.create_collection("signals")
write_events_to_collection(signals_collection)
write_queries_to_collection(signals_collection)