# [ Chapter 8 - Signals Boosting Models ]
# Signals Boosting

NOTE: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook.

In [1]:
import sys
from datetime import datetime
sys.path.append('..')
from aips import get_engine, display_product_search
from aips.spark import create_view_from_collection
from aips.spark.dataframe import from_sql
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("AIPS").getOrCreate()
engine = get_engine()

## Keyword Search with No Signals Boosting

### Figure 8.1

In [2]:
# %load -s product_search_request aips/search_requests
def product_search_request(query):
    return {"query": query,
            "query_fields": ["name", "manufacturer", "long_description"],
            "return_fields": ["upc", "name", "manufacturer", "score"],
            "limit": 5,
            "order_by": [("score", "desc"), ("upc", "asc")]}

In [3]:
products_collection = engine.get_collection("products")
query = "ipad"
request = product_search_request(query)
response = products_collection.search(**request)
display_product_search(query, response["docs"])

## Create Signals Boosts (Signals Aggregation)

### Basic Signals Boosting Model

In [4]:
def aggregate_signals(signals_collection, collection_name, signals_agg_query):
    print("Aggregating Signals to Create Signals Boosts...")
    create_view_from_collection(signals_collection, "signals")
    collection = engine.create_collection(collection_name)
    collection.write(from_sql(signals_agg_query))
    collection.commit()
    print("Signals Aggregation Completed!")
    return collection
    
basic_signals_aggregation_query = """
SELECT q.target AS query, c.target AS doc, COUNT(c.target) AS boost
FROM signals c LEFT JOIN signals q ON c.query_id = q.query_id
WHERE c.type = 'click' AND q.type = 'query'
GROUP BY query, doc
ORDER BY boost DESC
"""

signals_collection = engine.get_collection("signals")
signals_boosting_collection = aggregate_signals(signals_collection, "basic_signals_boosts", basic_signals_aggregation_query)
signals_collection.commit()

Aggregating Signals to Create Signals Boosts...


Wiping "basic_signals_boosts" collection
Creating "basic_signals_boosts" collection
Status: Success
Signals Aggregation Completed!


## Search with Signals Boosts Applied

### Signals Boosting Query

In [5]:
# %load -s search_for_boosts,create_boosts_query aips/search_requests
def search_for_boosts(query, collection, query_field="query"):
    boosts_request = {"query": f'{query}',
                      "query_fields": [query_field],
                      "return_fields": ["query", "doc", "boost"],
                      "limit": 20,
                      "order_by": [("boost", "desc")]}
    response = collection.search(**boosts_request)
    return response["docs"]

def create_boosts_query(boost_documents):
    print("Boost Documents:")
    print(boost_documents)
    boosts = " ".join([f'"{b["doc"]}"^{b["boost"]}' 
                       for b in boost_documents])
    print(f"\nBoost Query: \n{boosts}\n")
    return boosts

## Raw Signals Boosts (Case-sensitive)

### Listing 8.1

In [6]:
def show_raw_boosted_queries(signals_documents):
    boosted_queries = "\n".join([f'"{doc["query"]}" : {doc["boost"]}'
                                 for doc in signals_documents])
    print("Raw Boosted Queries")
    print(boosted_queries) 

signals_boosting_collection = engine.get_collection("basic_signals_boosts")
query = "885909457588" #most popular iPad model
signals_documents = search_for_boosts(query, signals_boosting_collection, "doc")
show_raw_boosted_queries(signals_documents)    

Raw Boosted Queries
"iPad" : 1050
"ipad" : 966
"Ipad" : 829
"iPad 2" : 509
"ipad 2" : 347
"Ipad2" : 261
"ipad2" : 238
"Ipad 2" : 213
"I pad" : 203
"i pad" : 133
"IPad" : 77
"Apple" : 76
"I pad 2" : 60
"apple ipad" : 55
"Apple iPad" : 53
"ipads" : 43
"tablets" : 42
"apple" : 41
"iPads" : 38
"i pad 2" : 38


## Normalized Signals Boosting (Case-insensitive)

### Listing 8.2

In [7]:
normalized_signals_aggregation_query = """
SELECT LOWER(q.target) AS query, c.target AS doc, COUNT(c.target) AS boost
FROM signals c LEFT JOIN signals q ON c.query_id = q.query_id
WHERE c.type = 'click' AND q.type = 'query'
GROUP BY query, doc
ORDER BY boost DESC"""

normalized_collection = \
  aggregate_signals(signals_collection, "normalized_signals_boosts",
                    normalized_signals_aggregation_query)

Aggregating Signals to Create Signals Boosts...
Wiping "normalized_signals_boosts" collection
Creating "normalized_signals_boosts" collection
Status: Success
Signals Aggregation Completed!


In [8]:
query = "885909457588" #most popular iPad model
signals_documents = search_for_boosts(query, normalized_collection, "doc")
show_raw_boosted_queries(signals_documents)

Raw Boosted Queries
"ipad" : 2939
"ipad 2" : 1104
"ipad2" : 540
"i pad" : 341
"apple ipad" : 152
"ipads" : 123
"apple" : 118
"i pad 2" : 99
"tablets" : 67
"tablet" : 61
"ipad 1" : 52
"apple ipad 2" : 27
"hp touchpad" : 26
"ipaq" : 20
"i pad2" : 19
"wi" : 19
"apple computers" : 18
"apple i pad" : 15
"ipad 2 16gb" : 15
"samsung galaxy" : 14


## Start Wars Search before User Manipulation

### Figure 8.2

In [9]:
# %load -s create_boosts_query,boosted_product_search_request aips/search_requests
def create_boosts_query(boost_documents):
    print("Boost Documents:")
    print(boost_documents)
    boosts = " ".join([f'"{b["doc"]}"^{b["boost"]}' 
                       for b in boost_documents])
    print(f"\nBoost Query: \n{boosts}\n")
    return boosts

def boosted_product_search_request(query, collection, boost_field=None):
    signals_documents = search_for_boosts(query, collection)
    signals_boosts = create_boosts_query(signals_documents)
    boosted_request = product_search_request(query)
    if boost_field:
        signals_boosts = (boost_field, signals_boosts)
    boosted_request["query_boosts"] = signals_boosts
    return boosted_request

In [10]:
query = '"star wars"'
boosted_request = boosted_product_search_request(query, normalized_collection, "upc")
boosted_request["filter"] = [("-upc", "45626176")]

response = products_collection.search(**boosted_request)
display_product_search(query, response["docs"])

Boost Documents:
[{'query': 'star wars', 'doc': '024543742180', 'boost': 1490}, {'query': 'star wars', 'doc': '400032015667', 'boost': 186}, {'query': 'star wars', 'doc': '024543742074', 'boost': 127}, {'query': 'star wars', 'doc': '024543559856', 'boost': 117}, {'query': 'star wars', 'doc': '014633169546', 'boost': 107}, {'query': 'star wars', 'doc': '024543742098', 'boost': 81}, {'query': 'star wars', 'doc': '024543781875', 'boost': 67}, {'query': 'star wars', 'doc': '023272342654', 'boost': 39}, {'query': 'star wars', 'doc': '014633169522', 'boost': 32}, {'query': 'star wars', 'doc': '024543560067', 'boost': 30}, {'query': 'star wars', 'doc': '883929154012', 'boost': 24}, {'query': 'star wars', 'doc': '023272342630', 'boost': 20}, {'query': 'star wars', 'doc': '708431390614', 'boost': 18}, {'query': 'star wars', 'doc': '024543263739', 'boost': 18}, {'query': 'star wars', 'doc': '886973561621', 'boost': 17}, {'query': 'star wars', 'doc': '023272341381', 'boost': 16}, {'query': 'star 

## Simulating a malicious user

### Listing 8.3

In [11]:
signals_collection = engine.get_collection("signals")
spam_user = "u8675309"
spam_query = "star wars"
#doc for a "trash compactor" from someone who wants star wars is rubbish!
spam_signal_boost_doc_upc = "45626176" 

signal_docs = []
for num in range(5000): #generate 5,000 query and click signals
    query_id = f"u8675309_0_{num}"
    query_signal = { 
        "query_id": query_id,
        "user": spam_user,
        "type": "query",
        "target": spam_query,
        "signal_time": datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"),
        "id": f"spam_signal_query_{num}"
    }
    click_signal = { 
        "query_id": query_id,
        "user": spam_user,
        "type": "click",
        "target": spam_signal_boost_doc_upc,
        "signal_time": datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"),
        "id": f"spam_signal_click_{num}"
    }
    signal_docs.extend([click_signal, query_signal])

signals_collection.add_documents(signal_docs)
#re-run the basic signals aggregation to process the malicious clicks

spam_signals_collection = \
    aggregate_signals(signals_collection, "signals_boosts_with_spam",
                      normalized_signals_aggregation_query)


Adding Documents to 'signals' collection
Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_with_spam" collection
Creating "signals_boosts_with_spam" collection
Status: Success
Signals Aggregation Completed!


## Impact of Spam on Search Results

### Listing 8.4

In [12]:
query = '"star wars"'
boosted_request = boosted_product_search_request(query, spam_signals_collection, "upc")
response = products_collection.search(**boosted_request)
display_product_search(query, response["docs"])

Boost Documents:
[{'query': 'star wars', 'doc': '45626176', 'boost': 5000}, {'query': 'star wars', 'doc': '024543742180', 'boost': 1490}, {'query': 'star wars', 'doc': '400032015667', 'boost': 186}, {'query': 'star wars', 'doc': '024543742074', 'boost': 127}, {'query': 'star wars', 'doc': '024543559856', 'boost': 117}, {'query': 'star wars', 'doc': '014633169546', 'boost': 107}, {'query': 'star wars', 'doc': '024543742098', 'boost': 81}, {'query': 'star wars', 'doc': '024543781875', 'boost': 67}, {'query': 'star wars', 'doc': '023272342654', 'boost': 39}, {'query': 'star wars', 'doc': '014633169522', 'boost': 32}, {'query': 'star wars', 'doc': '024543560067', 'boost': 30}, {'query': 'star wars', 'doc': '883929154012', 'boost': 24}, {'query': 'star wars', 'doc': '023272342630', 'boost': 20}, {'query': 'star wars', 'doc': '708431390614', 'boost': 18}, {'query': 'star wars', 'doc': '024543263739', 'boost': 18}, {'query': 'star wars', 'doc': '886973561621', 'boost': 17}, {'query': 'star wa

## Fighting Signal Spam through User-level Deduplication

### Listing 8.5

In [13]:
#One Signal per User - Anti-Spam
anti_spam_aggregation_query = """
SELECT query, doc, COUNT(doc) AS boost FROM (
  SELECT c.user unique_user, LOWER(q.target) AS query, c.target AS doc,
  MAX(c.signal_time) AS boost
  FROM signals c LEFT JOIN signals q ON c.query_id = q.query_id
  WHERE c.type = 'click' AND q.type = 'query'
  GROUP BY unique_user, query, doc
) AS x
GROUP BY query, doc
ORDER BY boost DESC
"""

anti_spam_collection = \
    aggregate_signals(signals_collection, "signals_boosts_anti_spam",
                      anti_spam_aggregation_query)

Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_anti_spam" collection
Creating "signals_boosts_anti_spam" collection
Status: Success
Signals Aggregation Completed!


In [14]:
query = '"star wars"'

boosted_request = boosted_product_search_request(query, anti_spam_collection)
response = products_collection.search(**boosted_request)

print(products_collection.transform_request(**boosted_request))
display_product_search(query, response["docs"])

Boost Documents:
[{'query': 'star wars', 'doc': '024543742180', 'boost': 1489}, {'query': 'star wars', 'doc': '400032015667', 'boost': 186}, {'query': 'star wars', 'doc': '024543742074', 'boost': 127}, {'query': 'star wars', 'doc': '024543559856', 'boost': 117}, {'query': 'star wars', 'doc': '014633169546', 'boost': 107}, {'query': 'star wars', 'doc': '024543742098', 'boost': 81}, {'query': 'star wars', 'doc': '024543781875', 'boost': 67}, {'query': 'star wars', 'doc': '023272342654', 'boost': 39}, {'query': 'star wars', 'doc': '014633169522', 'boost': 32}, {'query': 'star wars', 'doc': '024543560067', 'boost': 30}, {'query': 'star wars', 'doc': '883929154012', 'boost': 24}, {'query': 'star wars', 'doc': '023272342630', 'boost': 20}, {'query': 'star wars', 'doc': '708431390614', 'boost': 18}, {'query': 'star wars', 'doc': '024543263739', 'boost': 18}, {'query': 'star wars', 'doc': '886973561621', 'boost': 17}, {'query': 'star wars', 'doc': '023272341381', 'boost': 16}, {'query': 'star 

## Mixing multiple signal types

### Listing 8.6

In [18]:
#One Signal per User - Anti-Spam
mixed_signal_types_aggregation_query = """
SELECT query, doc, ((1 * click_boost) + (10 * add_to_cart_boost) +
                    (25 * purchase_boost)) AS boost FROM (
  SELECT query, doc, 
    SUM(click) AS click_boost,
    SUM(add_to_cart) AS add_to_cart_boost,
    SUM(purchase) AS purchase_boost FROM (  
      SELECT lower(q.target) AS query, cap.target AS doc, 
        IF(cap.type = 'click', 1, 0) AS click, 
        IF(cap.type = 'add-to-cart', 1, 0) AS  add_to_cart, 
        IF(cap.type = 'purchase', 1, 0) AS purchase
      FROM signals cap LEFT JOIN signals q on cap.query_id = q.query_id
      WHERE (cap.type != 'query' AND q.type = 'query')
    ) raw_signals
  GROUP BY query, doc
) AS per_type_boosts
"""

type_weighted_collection = \
  aggregate_signals(signals_collection, "signals_boosts_weighted_types",
                    mixed_signal_types_aggregation_query)

Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_weighted_types" collection
Creating "signals_boosts_weighted_types" collection
Status: Success
Signals Aggregation Completed!


## Time Decay

### Listing 8.7

In [19]:
half_life_days = "30"
target_date = "2020-06-01" #date of latest signal in our dataset. In live system use now().
signal_weight = "1" #can make this a function to differentiate weights for different signal types

time_decay_aggregation = f"""
SELECT query, doc, sum(time_weighted_boost) AS boost FROM (
    SELECT user, query, doc, {signal_weight} * POW(0.5, (DATEDIFF('{target_date}', signal_time) / {half_life_days})) AS time_weighted_boost FROM (
        SELECT c.user AS user, lower(q.target) AS query, c.target AS doc, MAX(c.signal_time) as signal_time
        FROM signals c LEFT JOIN signals q ON c.query_id = q.query_id
        WHERE c.type = 'click' AND q.type = 'query' AND c.signal_time <= '{target_date}'
        GROUP BY c.user, q.target, c.target
    ) AS raw_signals 
) AS time_weighted_signals
GROUP BY query, doc
ORDER BY boost DESC
"""

time_weighted_collection = \
    aggregate_signals(signals_collection, "signals_boosts_time_weighted", 
                      time_decay_aggregation)

Aggregating Signals to Create Signals Boosts...
Wiping "signals_boosts_time_weighted" collection
Creating "signals_boosts_time_weighted" collection
Status: Success
Signals Aggregation Completed!


In [20]:
query = '"star wars"'

boosted_request = boosted_product_search_request(query, time_weighted_collection)
print(products_collection.transform_request(**boosted_request))

response = products_collection.search(**boosted_request)
display_product_search(query, response["docs"])

Boost Documents:
[{'query': 'star wars', 'doc': '024543742180', 'boost': 50.27564958868424}, {'query': 'star wars', 'doc': '023272342654', 'boost': 14.38608117522819}, {'query': 'star wars', 'doc': '024543742074', 'boost': 10.97870811313978}, {'query': 'star wars', 'doc': '883929154012', 'boost': 10.22196625832087}, {'query': 'star wars', 'doc': '023272342630', 'boost': 8.086104767176469}, {'query': 'star wars', 'doc': '886973561621', 'boost': 7.196654524422255}, {'query': 'star wars', 'doc': '024543742098', 'boost': 6.411867877358414}, {'query': 'star wars', 'doc': '030206742121', 'boost': 6.0339109822265105}, {'query': 'star wars', 'doc': '014633169546', 'boost': 5.732256133253928}, {'query': 'star wars', 'doc': '708431390614', 'boost': 5.200090923395828}, {'query': 'star wars', 'doc': '024543559856', 'boost': 4.79595089562249}, {'query': 'star wars', 'doc': '023272341381', 'boost': 2.9199459597275252}, {'query': 'star wars', 'doc': '014633169522', 'boost': 2.2727988663619274}, {'que

## Index-time Signals Boosting

In [None]:
from aips.data_loaders.products import load_dataframe
#Create collection with index-time signals boosts field
boosted_products_collection = engine.create_collection("products_with_signals_boosts")
boosted_products_collection.write(load_dataframe("../data/retrotech/products.csv"))

Wiping "products_with_signals_boosts" collection
Creating "products_with_signals_boosts" collection
Adding "boosts" field type to collection
Status: Success
Loading data/retrotech/products.csv
Schema: 
root
 |-- upc: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- longDescription: string (nullable = true)



### Listing 8.8

In [None]:
#load the aggregates signals from one of our previous signals aggregations
normalized_collection = engine.get_collection("normalized_signals_boosts")
create_view_from_collection(normalized_collection, normalized_collection.name)

#register the product table so we can load from it and save back to it with boosts added
create_view_from_collection(boosted_products_collection, boosted_products_collection.name)

#insert all keywords with signals boosts for this document into a new "signals_boosts" field
boosts_query = f"""
SELECT p.*, b.signals_boosts FROM (
    SELECT doc, CONCAT_WS(',',COLLECT_LIST(CONCAT(query, '|', boost))) AS signals_boosts 
    FROM {normalized_collection.name} GROUP BY doc
) b INNER JOIN {boosted_products_collection.name} p ON p.upc = b.doc
"""

#save the products back to the products collection, with the updated signals boosts added
boosted_products_collection.write(from_sql(boosts_query))
print("Completed!")

Completed!


## Index-time Signals Boosting Query

### Listing 8.9

In [None]:
def get_query(query, boost_field):
    request = product_search_request(query)
    request["index_time_boost"] = (boost_field, query)
    return request    

In [None]:
query = "ipad"
boosted_query = get_query(query, "signals_boosts")
response = boosted_products_collection.search(**boosted_query)
display_product_search(query, response["docs"])

Up next: Chapter 9 - [Personalized Search](../ch09/1.personalization.ipynb)