In [1]:
from aips import get_engine, set_engine
from aips.spark.dataframe import from_sql
from aips.spark import create_view_from_collection
import tqdm

# Select the "opensearch" engine, then fetch the engine handle that the
# cells below use (as `engine`) to look up and create collections.
set_engine("opensearch")
engine = get_engine()

In [None]:
#%run chapters/ch04/1.setting-up-the-retrotech-dataset.ipynb

### Step 1 - Install and configure the OpenSearch UBI plugin

bin/opensearch-plugin install https://github.com/o19s/opensearch-ubi/releases/download/release-v0.0.12.1-os2.14.0/opensearch-ubi-plugin-v0.0.12.1-os2.14.0.zip --batch

### Step 2 - Ingest query data by adding the `ext` object to search requests

In [1]:
import requests

# Issue a search against the products index whose JSON body carries an
# `ext.ubi.query_id` value. The explicit timeout keeps the cell from hanging
# indefinitely when the OpenSearch host cannot be reached (requests applies no
# timeout by default).
response = requests.get("http://opensearch-node1:9200/products/_search",
                        json={"ext": {"ubi": {"query_id": "1234"}}},
                        timeout=10)
display(response.json())

ConnectionError: HTTPConnectionPool(host='opensearch-node1', port=9200): Max retries exceeded with url: /products/_search (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f3345fc8460>: Failed to establish a new connection: [Errno -2] Name or service not known'))

### Step 3 - Bulk ingest events

In [5]:
def get_events_dataframe():
    """Select every non-query signal, renamed to UBI event field names.

    Registers the "signals" collection as a view called "signals", then runs a
    SQL query over it that keeps rows whose `type` is not 'query' and aliases
    the columns: `type` -> `action_name` and `message_type`,
    `user` -> `client_id`, `signal_time` -> `timestamp`,
    `target` -> `message` and `target`.

    Returns:
        The result of `from_sql` for that query.
    """
    create_view_from_collection(engine.get_collection("signals"), "signals")
    query = """SELECT type AS action_name, query_id, user AS client_id,
                      signal_time AS timestamp, type AS message_type,
                      target AS message, target AS target
               FROM signals WHERE type != 'query'"""
    return from_sql(query)

In [6]:
# Look up the "ubi_aips_events" collection and write the non-query signals
# (already renamed to UBI event fields by get_events_dataframe) into it.
events_collection = engine.get_collection("ubi_aips_events")
ubi_events_dataframe = get_events_dataframe()
events_collection.write(ubi_events_dataframe)

Successfully written 1447146 documents


In [3]:
def get_queries_dataframe():
    """Select the query signals and reshape them into UBI query records.

    Registers the "signals" collection as a view called "signals", reads the
    rows whose `type` is 'query', and maps each row to a 4-tuple that becomes
    the columns `timestamp`, `query_id`, `client_id` and `user_query`.

    Returns:
        A dataframe with exactly those four columns.
    """
    def to_ubi_query(row):
        # Tuple order must line up with `column_names` below.
        return (row["signal_time"], row["query_id"], row["user"], row["target"])

    create_view_from_collection(engine.get_collection("signals"), "signals")
    query_signals = from_sql("SELECT * FROM signals WHERE type = 'query'")
    column_names = ["timestamp", "query_id", "client_id", "user_query"]
    return query_signals.rdd.map(to_ubi_query).toDF(column_names)

In [11]:
def execute_search(collection, signal, log=False):
    """Run one product search for a query signal, forwarding its UBI metadata.

    Args:
        collection: Object exposing a `search(**request)` method.
        signal: Mapping that contains at least a "user_query" key. The whole
            mapping, plus `"store_name": "aips_store"`, is passed as the `ubi`
            argument of the search.
        log: When True, print a short message if the search fails. When False
            (the default) a failed search is skipped silently.

    Returns:
        Whatever `collection.search` returns, or None if the search raised.
    """
    request = {"query": signal["user_query"],
               "query_fields": ["name", "manufacturer",
                                "long_description", "short_description"],
               "return_fields": ["*"],
               "limit": 10,
               "ubi": signal | {"store_name": "aips_store"}}
    try:
        return collection.search(**request)
    except Exception as error:
        # Deliberately best-effort: one failing query must not abort a long
        # replay loop. Catching Exception (not a bare except) still lets
        # KeyboardInterrupt and SystemExit through, so the loop can be stopped.
        if log:
            print(f"Search failed for query {signal['user_query']!r}: {error}")
        return None

In [12]:
# Replay every historical query against the "products" collection so each
# search is issued together with its UBI metadata (see execute_search).
products_collection = engine.get_collection("products")
ubi_queries_dataframe = get_queries_dataframe()
# collect() returns a list, so tqdm takes the total from its length; a separate
# count() would run a second pass over the (uncached) dataframe for no benefit.
for q in tqdm.tqdm(ubi_queries_dataframe.collect()):
    execute_search(products_collection, q.asDict())

In [None]:
def batch_ingest_queries():
    """Unimplemented stub: does nothing and returns None.

    TODO: implement (presumably a bulk alternative to replaying queries one
    by one - confirm intent).
    """
    return None

In [8]:
# Create the "ubi_queries" collection and write the query signals (reshaped to
# timestamp / query_id / client_id / user_query by get_queries_dataframe).
# NOTE(review): this recomputes the queries dataframe rather than reusing the
# one built for the replay loop - confirm that is intended.
queries_collection = engine.create_collection("ubi_queries")
ubi_queries_dataframe = get_queries_dataframe()
queries_collection.write(ubi_queries_dataframe)

In [9]:
def create_events_dataframe():
    """Read the UBI events back and reshape them into signal records.

    Registers the "ubi_events" collection as a view called "ubi_events", reads
    all of its rows, and maps each one to a 5-tuple that becomes the columns
    `signal_time`, `query_id`, `user`, `target` and `type`.

    Returns:
        A dataframe with exactly those five columns.
    """
    def to_signal(row):
        # Tuple order must line up with `signal_columns` below.
        return (row["timestamp"], row["query_id"], row["client_id"],
                row["message"], row["message_type"])

    # NOTE(review): events are written to "ubi_aips_events" elsewhere in this
    # notebook, while this reads "ubi_events" - confirm both names resolve to
    # the intended collection.
    create_view_from_collection(engine.get_collection("ubi_events"), "ubi_events")
    ubi_events = from_sql("SELECT * FROM ubi_events")
    signal_columns = ["signal_time", "query_id", "user", "target", "type"]
    return ubi_events.rdd.map(to_signal).toDF(signal_columns)

def create_queries_dataframe():
    """Read the UBI queries back and reshape them into signal records.

    Registers the "ubi_queries" collection as a view called "ubi_queries",
    reads all of its rows, and maps each one to a 5-tuple that becomes the
    columns `signal_time`, `query_id`, `user`, `target` and `type`. The `type`
    value is the constant "query" for every row.

    Returns:
        A dataframe with exactly those five columns.
    """
    def to_signal(row):
        # Tuple order must line up with `signal_columns` below.
        return (row["timestamp"], row["query_id"], row["client_id"],
                row["user_query"], "query")

    create_view_from_collection(engine.get_collection("ubi_queries"), "ubi_queries")
    ubi_queries = from_sql("SELECT * FROM ubi_queries")
    signal_columns = ["signal_time", "query_id", "user", "target", "type"]
    return ubi_queries.rdd.map(to_signal).toDF(signal_columns)

# Create the "signals" collection and fill it from the UBI data. Both helper
# dataframes use the same five columns (signal_time, query_id, user, target,
# type), so they can be unioned and written in a single call.
# NOTE(review): whether create_collection resets an existing "signals"
# collection is not visible here - confirm before re-running this cell.
signals_collection = engine.create_collection("signals")
queries = create_queries_dataframe()
events = create_events_dataframe()
signals_collection.write(queries.union(events))