# [ Chapter 8 - Signals Boosting Models ]
# Signals Boosting

NOTE: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook.

In [1]:
from datetime import datetime
from pyspark.sql import SparkSession

from aips import display_product_search, get_engine
from aips.spark import create_view_from_collection
from aips.spark.dataframe import from_sql

spark = SparkSession.builder.appName("AIPS").getOrCreate()
engine = get_engine()
signals_collection = engine.get_collection("signals")

## Keyword Search with No Signals Boosting

### Figure 8.1

In [2]:
#%run chapters/ch04/1.setting-up-the-retrotech-dataset.ipynb

In [3]:
# %load -s product_search_request aips/search_requests
def product_search_request(query, param_overrides={}):
    request = {"query": query,
               "query_fields": ["name", "manufacturer", "long_description"],
               "return_fields": ["upc", "name", "manufacturer",
                                 "short_description", "score"],
               "limit": 5,
               "order_by": [("score", "desc"), ("upc", "asc")]}
    return request | param_overrides

In [4]:
products_collection = engine.get_collection("products")
query = "ipad"
request = product_search_request(query)
response = products_collection.search(**request)
display_product_search(query, response["docs"])

## Create Signals Boosts (Signals Aggregation)

### Basic Signals Boosting Model

In [5]:
def aggregate_signals(signals_collection, collection_name, signals_agg_query):
    print("Aggregating Signals to Create Signals Boosts...")
    create_view_from_collection(signals_collection, "signals")
    collection = engine.create_collection(collection_name)
    collection.write(from_sql(signals_agg_query))
    collection.commit()
    print("Signals Aggregation Completed!")
    return collection

## Search with Signals Boosts Applied

### Signals Boosting Query

## Raw Signals Boosts (Case-sensitive)

### Listing 8.1

In [6]:
# %load -s search_for_boosts aips/search_requests
def search_for_boosts(query, collection, query_field="query"):
    boosts_request = {"query": query,
                      "query_fields": [query_field],
                      "return_fields": ["query", "doc", "boost"],
                      "limit": 20,
                      "order_by": [("boost", "desc")]}
    response = collection.search(**boosts_request)
    return response["docs"]

In [7]:
def create_boosting_collection(collection_name):
    basic_signals_aggregation_query = """
    SELECT q.target AS query, c.target AS doc,
    COUNT(c.target) AS boost
    FROM signals c LEFT JOIN signals q
    ON c.query_id = q.query_id
    WHERE c.type = 'click' AND q.type = 'query'
    GROUP BY q.target, doc
    ORDER BY boost DESC
    """
    collection = engine.get_collection(collection_name)
    return aggregate_signals(collection, "basic_signals_boosts", basic_signals_aggregation_query)

In [8]:
def show_raw_boosted_queries(boost_documents):
    boosted_queries = "\n".join([f'"{doc["query"]}" : {doc["boost"]}'
                                 for doc in boost_documents])
    print("Raw Boosted Queries")
    print(boosted_queries)

In [9]:
signals_boosting_collection = create_boosting_collection("signals")

Aggregating Signals to Create Signals Boosts...
Wiping "basic_signals_boosts" collection
Creating "basic_signals_boosts" collection
Status: Success
Successfully written 197169 documents
Signals Aggregation Completed!


In [10]:
query = "885909457588" #most popular iPad model
signals_docs = search_for_boosts(query, signals_boosting_collection, "doc")
show_raw_boosted_queries(signals_docs)    

Raw Boosted Queries
"iPad" : 1050
"ipad" : 966
"Ipad" : 829
"iPad 2" : 509
"ipad 2" : 347
"Ipad2" : 261
"ipad2" : 238
"Ipad 2" : 213
"I pad" : 203
"i pad" : 133
"IPad" : 77
"Apple" : 76
"I pad 2" : 60
"apple ipad" : 55
"Apple iPad" : 53
"ipads" : 43
"tablets" : 42
"apple" : 41
"iPads" : 38
"i pad 2" : 38


## Normalized Signals Boosting (Case-insensitive)

### Listing 8.2

In [11]:
normalized_signals_aggregation_query = """
SELECT LOWER(q.target) AS query,
c.target AS doc, COUNT(c.target) AS boost
FROM signals c LEFT JOIN signals q ON c.query_id = q.query_id
WHERE c.type = 'click' AND q.type = 'query'
GROUP BY LOWER(q.target), doc
ORDER BY boost DESC"""

normalized_collection = \
  aggregate_signals(signals_collection, "normalized_signals_boosts",
                    normalized_signals_aggregation_query)

Aggregating Signals to Create Signals Boosts...
Wiping "normalized_signals_boosts" collection
Creating "normalized_signals_boosts" collection
Status: Success
Successfully written 152252 documents
Signals Aggregation Completed!


In [12]:
query = "885909457588" #most popular iPad model
signals_docs = search_for_boosts(query, normalized_collection, "doc")
show_raw_boosted_queries(signals_docs)

Raw Boosted Queries
"ipad" : 2939
"ipad 2" : 1104
"ipad2" : 540
"i pad" : 341
"apple ipad" : 152
"ipads" : 123
"apple" : 118
"i pad 2" : 99
"tablets" : 67
"tablet" : 61
"ipad 1" : 52
"apple ipad 2" : 27
"hp touchpad" : 26
"ipaq" : 20
"i pad2" : 19
"wi" : 19
"apple computers" : 18
"apple i pad" : 15
"ipad 2 16gb" : 15
"samsung galaxy" : 14


## Star Wars Search before User Manipulation

### Figure 8.2

In [13]:
# %load -s create_boosts_query,boosted_product_search_request aips/search_requests
def create_boosts_query(boost_documents):
    print("Boost Documents:")
    display(boost_documents)
    boosts = " ".join([f'"{b["doc"]}"^{b["boost"]}' 
                       for b in boost_documents])
    print(f"\nBoost Query: \n{boosts}\n")
    return boosts

In [14]:
def boosted_product_search_request(query, collection, boost_field=None):
    signals_documents = search_for_boosts(query, collection)
    signals_boosts = create_boosts_query(signals_documents)
    boosted_request = product_search_request(query)
    if boost_field:
        signals_boosts = (boost_field, signals_boosts)
    boosted_request["query_boosts"] = signals_boosts
    return boosted_request

In [15]:
query = '"star wars"'
boosted_request = boosted_product_search_request(query, normalized_collection, "upc")
boosted_request["filter"] = [("-upc", "45626176")]

response = products_collection.search(**boosted_request)
display_product_search("star wars", response["docs"])

Boost Documents:


[{'query': 'star wars', 'doc': '45626176', 'boost': 5000},
 {'query': 'star wars', 'doc': '024543742180', 'boost': 1490},
 {'query': 'star wars', 'doc': '400032015667', 'boost': 186},
 {'query': 'star wars', 'doc': '024543742074', 'boost': 127},
 {'query': 'star wars', 'doc': '024543559856', 'boost': 117},
 {'query': 'star wars', 'doc': '014633169546', 'boost': 107},
 {'query': 'star wars', 'doc': '024543742098', 'boost': 81},
 {'query': 'star wars', 'doc': '024543781875', 'boost': 67},
 {'query': 'star wars', 'doc': '023272342654', 'boost': 39},
 {'query': 'star wars', 'doc': '014633169522', 'boost': 32},
 {'query': 'star wars', 'doc': '024543560067', 'boost': 30},
 {'query': 'star wars', 'doc': '883929154012', 'boost': 24},
 {'query': 'star wars', 'doc': '023272342630', 'boost': 20},
 {'query': 'star wars', 'doc': '708431390614', 'boost': 18},
 {'query': 'star wars', 'doc': '024543263739', 'boost': 18},
 {'query': 'star wars', 'doc': '886973561621', 'boost': 17},
 {'query': 'star war


Boost Query: 
"45626176"^5000 "024543742180"^1490 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32 "024543560067"^30 "883929154012"^24 "023272342630"^20 "708431390614"^18 "024543263739"^18 "886973561621"^17 "023272341381"^16 "030206742121"^14 "885370332889"^12 "883929106172"^8



## Simulating a malicious user

### Listing 8.3

In [16]:
signals_collection = engine.get_collection("signals")
spam_user = "u8675309"
spam_query = "star wars"
spam_signal_boost_doc_upc = "45626176" #doc for a "trash compactor" from someone who wants star wars is rubbish!

signal_docs = []
for num in range(5000): #generate 5,000 query and click signals
    query_id = f"u8675309_0_{num}"
    query_signal = { 
        "query_id": query_id,
        "user": spam_user,
        "type": "query",
        "target": spam_query,
        "signal_time": datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
    }
    click_signal = { 
        "query_id": query_id,
        "user": spam_user,
        "type": "click",
        "target": spam_signal_boost_doc_upc,
        "signal_time": datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
    }
    signal_docs.extend([click_signal, query_signal])

signals_collection.add_documents(signal_docs)
#re-run the basic signals aggregation to process the malicious clicks

spam_signals_collection = \
    aggregate_signals(signals_collection, "signals_boosts_with_spam",
                      normalized_signals_aggregation_query)


Adding Documents to 'signals' collection
Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_with_spam" collection
Creating "signals_boosts_with_spam" collection
Status: Success
Successfully written 152252 documents
Signals Aggregation Completed!


## Impact of Spam on Search Results

### Listing 8.4

In [17]:
# function boosted_product_search_request defined above before figure 8.2 code

query = '"star wars"'
boosted_request = boosted_product_search_request(query,
                      spam_signals_collection, "upc")
response = products_collection.search(**boosted_request)
display_product_search(query, response["docs"])

Boost Documents:


[{'query': 'star wars', 'doc': '45626176', 'boost': 20000},
 {'query': 'star wars', 'doc': '024543742180', 'boost': 1490},
 {'query': 'star wars', 'doc': '400032015667', 'boost': 186},
 {'query': 'star wars', 'doc': '024543742074', 'boost': 127},
 {'query': 'star wars', 'doc': '024543559856', 'boost': 117},
 {'query': 'star wars', 'doc': '014633169546', 'boost': 107},
 {'query': 'star wars', 'doc': '024543742098', 'boost': 81},
 {'query': 'star wars', 'doc': '024543781875', 'boost': 67},
 {'query': 'star wars', 'doc': '023272342654', 'boost': 39},
 {'query': 'star wars', 'doc': '014633169522', 'boost': 32},
 {'query': 'star wars', 'doc': '024543560067', 'boost': 30},
 {'query': 'star wars', 'doc': '883929154012', 'boost': 24},
 {'query': 'star wars', 'doc': '023272342630', 'boost': 20},
 {'query': 'star wars', 'doc': '708431390614', 'boost': 18},
 {'query': 'star wars', 'doc': '024543263739', 'boost': 18},
 {'query': 'star wars', 'doc': '886973561621', 'boost': 17},
 {'query': 'star wa


Boost Query: 
"45626176"^20000 "024543742180"^1490 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32 "024543560067"^30 "883929154012"^24 "023272342630"^20 "708431390614"^18 "024543263739"^18 "886973561621"^17 "023272341381"^16 "030206742121"^14 "885370332889"^12 "886111287055"^8



## Fighting Signal Spam through User-level Deduplication

### Listing 8.5

In [18]:
#One Signal per User - Anti-Spam
anti_spam_aggregation_query = """
SELECT query, doc, COUNT(doc) AS boost FROM (
  SELECT c.user unique_user, LOWER(q.target) AS query, c.target AS doc,
  MAX(c.signal_time) AS boost
  FROM signals c LEFT JOIN signals q ON c.query_id = q.query_id
  WHERE c.type = 'click' AND q.type = 'query'
  GROUP BY unique_user, LOWER(q.target), doc)
GROUP BY query, doc
ORDER BY boost DESC"""

anti_spam_collection = \
    aggregate_signals(signals_collection, "signals_boosts_anti_spam",
                      anti_spam_aggregation_query)

Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_anti_spam" collection
Creating "signals_boosts_anti_spam" collection
Status: Success
Successfully written 152252 documents
Signals Aggregation Completed!


In [19]:
query = '"star wars"'

boosted_request = boosted_product_search_request(query, anti_spam_collection)
response = products_collection.search(**boosted_request)

print(products_collection.transform_request(**boosted_request))
display_product_search(query, response["docs"])

Boost Documents:


[{'query': 'star wars', 'doc': '024543742180', 'boost': 1489},
 {'query': 'star wars', 'doc': '400032015667', 'boost': 186},
 {'query': 'star wars', 'doc': '024543742074', 'boost': 127},
 {'query': 'star wars', 'doc': '024543559856', 'boost': 117},
 {'query': 'star wars', 'doc': '014633169546', 'boost': 107},
 {'query': 'star wars', 'doc': '024543742098', 'boost': 81},
 {'query': 'star wars', 'doc': '024543781875', 'boost': 67},
 {'query': 'star wars', 'doc': '023272342654', 'boost': 39},
 {'query': 'star wars', 'doc': '014633169522', 'boost': 32},
 {'query': 'star wars', 'doc': '024543560067', 'boost': 30},
 {'query': 'star wars', 'doc': '883929154012', 'boost': 24},
 {'query': 'star wars', 'doc': '023272342630', 'boost': 20},
 {'query': 'star wars', 'doc': '708431390614', 'boost': 18},
 {'query': 'star wars', 'doc': '024543263739', 'boost': 18},
 {'query': 'star wars', 'doc': '886973561621', 'boost': 17},
 {'query': 'star wars', 'doc': '023272341381', 'boost': 16},
 {'query': 'star w


Boost Query: 
"024543742180"^1489 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32 "024543560067"^30 "883929154012"^24 "023272342630"^20 "708431390614"^18 "024543263739"^18 "886973561621"^17 "023272341381"^16 "030206742121"^14 "885370332889"^12 "883929106172"^8 "886111287055"^8

{'query': '"star wars"', 'limit': 5, 'params': {'qf': ['name', 'manufacturer', 'long_description'], 'boost': 'sum(1,query($boost_query))', 'boost_query': '"024543742180"^1489 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32 "024543560067"^30 "883929154012"^24 "023272342630"^20 "708431390614"^18 "024543263739"^18 "886973561621"^17 "023272341381"^16 "030206742121"^14 "885370332889"^12 "883929106172"^8 "886111287055"^8'}, 'fields': ['upc', 'name', 'manufacturer', 'short_description', 'score'], 'sort': 'score desc, upc 

## Mixing multiple signal types

### Listing 8.6

In [20]:
#One Signal per User - Anti-Spam
#Sometimes needs rerunning
mixed_signal_types_aggregation_query = """
SELECT query, doc, ((1 * click_boost) + (10 * add_to_cart_boost) +
                    (25 * purchase_boost)) AS boost FROM (
  SELECT query, doc, 
    SUM(click) AS click_boost,
    SUM(add_to_cart) AS add_to_cart_boost,
    SUM(purchase) AS purchase_boost FROM (  
      SELECT lower(q.target) AS query, cap.target AS doc, 
        IF(cap.type = 'click', 1, 0) AS click, 
        IF(cap.type = 'add-to-cart', 1, 0) AS  add_to_cart, 
        IF(cap.type = 'purchase', 1, 0) AS purchase
      FROM signals cap LEFT JOIN signals q on cap.query_id = q.query_id
      WHERE (cap.type != 'query' AND q.type = 'query')
    ) AS raw_signals
  GROUP BY query, doc) AS per_type_boosts"""

type_weighted_collection = \
  aggregate_signals(signals_collection, "signals_boosts_weighted_types",
                    mixed_signal_types_aggregation_query)

Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_weighted_types" collection
Creating "signals_boosts_weighted_types" collection
Status: Success
Successfully written 152252 documents
Signals Aggregation Completed!


## Time Decay

### Listing 8.7

In [27]:
half_life_days = 30
target_date = "2020-06-01" #date of latest signal in our dataset. In live system use now().
signal_weight = 1 #can make this a function to differentiate weights for different signal types

time_decay_aggregation = f"""
SELECT query, doc, SUM(time_weighted_boost) AS boost FROM (
  SELECT user, query, doc,
  {signal_weight} * POW(0.5, (DATEDIFF('{target_date}', raw_signals.t) / {half_life_days}))
  AS time_weighted_boost FROM (
    SELECT c.user AS user, lower(q.target) AS query, c.target AS doc,
    MAX(c.signal_time) as t
    FROM signals c LEFT JOIN signals q ON c.query_id = q.query_id
    WHERE c.type = 'click' AND q.type = 'query'
          AND c.signal_time <= '{target_date}'
    GROUP BY c.user, q.target, c.target
  ) AS raw_signals 
) AS time_weighted_signals
GROUP BY query, doc
ORDER BY boost DESC"""
time_weighted_collection = \
    aggregate_signals(signals_collection, "signals_boosts_time_weighted", 
                      time_decay_aggregation)

Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_time_weighted" collection
Creating "signals_boosts_time_weighted" collection
Status: Success
Successfully written 152251 documents
Signals Aggregation Completed!


In [28]:
query = '"star wars"'

boosted_request = boosted_product_search_request(query, time_weighted_collection)
print(products_collection.transform_request(**boosted_request))

response = products_collection.search(**boosted_request)
display_product_search(query, response["docs"])

Boost Documents:


[{'query': 'star wars', 'doc': '024543742180', 'boost': 50.275649588684246},
 {'query': 'star wars', 'doc': '023272342654', 'boost': 14.386081175228192},
 {'query': 'star wars', 'doc': '024543742074', 'boost': 10.978708113139781},
 {'query': 'star wars', 'doc': '883929154012', 'boost': 10.22196625832087},
 {'query': 'star wars', 'doc': '023272342630', 'boost': 8.08610476717647},
 {'query': 'star wars', 'doc': '886973561621', 'boost': 7.196654524422255},
 {'query': 'star wars', 'doc': '024543742098', 'boost': 6.411867877358415},
 {'query': 'star wars', 'doc': '030206742121', 'boost': 6.0339109822265105},
 {'query': 'star wars', 'doc': '014633169546', 'boost': 5.7322561332539275},
 {'query': 'star wars', 'doc': '708431390614', 'boost': 5.200090923395828},
 {'query': 'star wars', 'doc': '024543559856', 'boost': 4.795950895622491},
 {'query': 'star wars', 'doc': '023272341381', 'boost': 2.9199459597275252},
 {'query': 'star wars', 'doc': '014633169522', 'boost': 2.2727988663619274},
 {'que


Boost Query: 
"024543742180"^50.275649588684246 "023272342654"^14.386081175228192 "024543742074"^10.978708113139781 "883929154012"^10.22196625832087 "023272342630"^8.08610476717647 "886973561621"^7.196654524422255 "024543742098"^6.411867877358415 "030206742121"^6.0339109822265105 "014633169546"^5.7322561332539275 "708431390614"^5.200090923395828 "024543559856"^4.795950895622491 "023272341381"^2.9199459597275252 "014633169522"^2.2727988663619274 "886971404722"^2.244924096618746 "024543560067"^1.6538835179903377 "400032015667"^1.448048337976181 "883929153992"^0.9339199409434634 "023272341633"^0.8908987181403393 "024543781875"^0.7423932331629598 "024543212768"^0.7405487761432821

{'query': '"star wars"', 'limit': 5, 'params': {'qf': ['name', 'manufacturer', 'long_description'], 'boost': 'sum(1,query($boost_query))', 'boost_query': '"024543742180"^50.275649588684246 "023272342654"^14.386081175228192 "024543742074"^10.978708113139781 "883929154012"^10.22196625832087 "023272342630"^8.086104

In [23]:
from aips.data_loaders.products import load_dataframe
boosted_products_collection = engine.create_collection("products_with_signals_boosts")
boosted_products_collection.write(load_dataframe("data/retrotech/products.csv"))

Wiping "products_with_signals_boosts" collection
Creating "products_with_signals_boosts" collection
Adding "boosts" field type to collection
Status: Success
Loading Products
Schema: 
root
 |-- upc: string (nullable = true)
 |-- name: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- short_description: string (nullable = true)
 |-- long_description: string (nullable = true)

Successfully written 48194 documents


### Listing 8.8

In [24]:
from aips.data_loaders import index_time_boosts

boosts_collection = engine.get_collection("normalized_signals_boosts")
boosted_products_collection = \
    engine.get_collection("products_with_signals_boosts")

boosted_products = index_time_boosts.load_dataframe(
    products_collection,
    boosts_collection)

boosted_products_collection.write(boosted_products)

Successfully written 18721 documents


## Index-time Signals Boosting Query

### Listing 8.9

In [25]:
def get_boosted_search_request(query, boost_field):
    request = product_search_request(query)
    request["index_time_boost"] = (boost_field, query)
    return request

In [26]:
query = "ipad"
boosted_query = get_boosted_search_request(query, "signals_boosts")
response = boosted_products_collection.search(**boosted_query)
display_product_search(query, response["docs"])

Up next: Chapter 9 - [Personalized Search](../ch09/1.personalization.ipynb)