# [ Chapter 8 - Signals Boosting Models ]
# Signals Boosting

NOTE: This notebook depends upon the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook.

In [5]:
import sys
from datetime import datetime, timezone
sys.path.append('..')
from aips import *
import os
from IPython.display import display,HTML
from pyspark.sql import SparkSession
# Spark session (app name "aips-ch8") shared by every aggregation cell in this notebook.
spark = SparkSession.builder.appName("aips-ch8").getOrCreate()
# Search-engine client used for all collection and search calls below.
# NOTE(review): get_engine is presumably provided by the `from aips import *` star import — confirm.
engine = get_engine()

## Keyword Search with No Signals Boosting

### Figure 8.1

In [19]:
# Baseline for Figure 8.1: a plain keyword search with no signals boosting applied.
query = "ipad"
collection = "products"

# Top 3 matches over name/manufacturer/longDescription using the edismax parser;
# results are ordered by score, with upc as the tie-breaker.
request = dict(
    query=query,
    fields=["upc", "name", "manufacturer", "score"],
    limit=3,
    params=dict(
        qf="name manufacturer longDescription",
        defType="edismax",
        indent="true",
        sort="score desc, upc asc",
    ),
)

results = engine.search(collection, request)
display_product_search(query, engine.docs(results))

## Create Signals Boosts (Signals Aggregation)

### Basic Signals Boosting Model

In [7]:
def aggregate_signals(signals_collection, signals_aggregation_collection, signals_aggregation_query):
    """Aggregate raw signals with Spark SQL and save the result as a boosts collection.

    Args:
        signals_collection: name of the collection holding the raw signals. It is
            loaded through the "solr" Spark data source and exposed to SQL as the
            temporary view ``signals``.
        signals_aggregation_collection: name of the destination collection. It is
            (re)created via ``engine.create_collection`` before anything is written;
            the notebook output shows that call wiping and recreating the collection.
        signals_aggregation_query: Spark SQL text to run against the ``signals`` view;
            its result rows are what gets written to the destination collection.

    Returns:
        None. Progress messages are printed.

    NOTE(review): rows are written with ``commit_within=5000``; presumably they may not
    be searchable immediately after this function returns — confirm against the
    Spark/Solr connector documentation.
    """
    engine.create_collection(signals_aggregation_collection)

    print("Aggregating Signals to Create Signals Boosts...")
    read_options = dict(zkhost="aips-zk", collection=signals_collection)
    write_options = dict(
        zkhost="aips-zk",
        collection=signals_aggregation_collection,
        gen_uniq_key="true",
        commit_within="5000",
    )

    # Expose the raw signals to Spark SQL under the fixed view name the queries expect.
    raw_signals = spark.read.format("solr").options(**read_options).load()
    raw_signals.createOrReplaceTempView("signals")

    # Run the caller's aggregation and write its rows to the destination collection.
    aggregated = spark.sql(signals_aggregation_query)
    writer = aggregated.write.format("solr").options(**write_options)
    writer.mode("overwrite").save()
    print("Signals Aggregation Completed!")

In [8]:
# Source collection of raw signals and destination collection for the aggregated boosts.
signals_collection = "signals"
signals_aggregation_collection = "basic_signals_boosts"

# Self-join of the `signals` view: `c` rows are clicks, `q` rows are the query signals that
# share the click's query_id. Clicks are counted per (query text, clicked doc) pair and the
# count becomes that pair's boost. The query text is used as-is, so queries that differ only
# in letter case are aggregated separately.
basic_signals_aggregation_query = """
  select q.target as query, c.target as doc, count(c.target) as boost
    from signals c left join signals q on c.query_id = q.query_id
    where c.type = 'click' AND q.type = 'query'
    group by query, doc
    order by boost desc
  """

aggregate_signals(signals_collection, signals_aggregation_collection, basic_signals_aggregation_query)

Wiping 'basic_signals_boosts' collection
Status: Success
Creating 'basic_signals_boosts' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


## Search with Signals Boosts Applied

### Signals Boosting Query

In [20]:
query = "ipad"

def get_query_time_boosts(query, collection):
    """Look up the strongest signals boosts for a query and format them as a boost query.

    Args:
        query: the user's query text. It is sent as a quoted phrase against the
            ``query`` field of the boosts collection.
        collection: name of the signals-boosts collection to search.

    Returns:
        A space-separated string of ``"<doc>"^<boost>`` clauses for at most 10
        documents, ordered by boost descending (for example
        ``"885909457588"^966 "885909457595"^205``). An empty string is returned
        when no boost documents are found.
    """
    # Escape backslashes first, then double quotes, so that user text such as
    # `27" monitor` cannot terminate the surrounding phrase early.
    escaped_query = query.replace("\\", "\\\\").replace('"', '\\"')
    boosts_query = {
        "query": f'"{escaped_query}"',
        "fields": ["doc", "boost"],
        "limit": 10,
        "params": {
          "defType": "edismax",
          "qf": "query",
          "sort": "boost desc"
        }
    }

    signals_boosts = engine.docs(engine.search(collection, boosts_query))
    print("Boost Documents:")
    print(signals_boosts)

    # One `"doc"^boost` clause per boost document, separated by single spaces.
    product_boosts = " ".join(
        f'"{entry["doc"]}"^{entry["boost"]}' for entry in signals_boosts
    )

    print(f"\nBoost Query: \n{product_boosts}\n")

    return product_boosts


def get_main_query(query, signals_boosts):
    """Build the product search request with query-time signals boosting applied.

    Args:
        query: the user's query text.
        signals_boosts: boost query string (``"<upc>"^<boost>`` clauses) passed to
            the engine as the ``signals_boosting`` parameter, which the ``boost``
            function references as ``$signals_boosting``.

    Returns:
        A request dict asking for the top 3 products (upc, name, manufacturer,
        score), sorted by score and then upc.
    """
    search_params = dict(
        qf="name manufacturer longDescription",
        defType="edismax",
        indent="true",
        sort="score desc, upc asc",
        # sum(1, ...) adds 1 to the value of the nested boosts query over the upc field.
        boost="sum(1,query({! df=upc v=$signals_boosting}))",
        signals_boosting=signals_boosts,
    )
    return dict(
        query=query,
        fields=["upc", "name", "manufacturer", "score"],
        limit=3,
        params=search_params,
    )

# Fetch the boosts recorded for `query` from the basic boosts collection, fold them into
# the product search request, print that request, then run it and display the results.
collection = "products"
signals_boosts = get_query_time_boosts(query, "basic_signals_boosts")
boosted_query = get_main_query(query, signals_boosts)
print("Main Query:")
print(boosted_query)

response = engine.search(collection, boosted_query)
display_product_search(query, engine.docs(response))

Boost Documents:
[{'doc': '885909457588', 'boost': 966}, {'doc': '885909457595', 'boost': 205}, {'doc': '885909471812', 'boost': 202}, {'doc': '886111287055', 'boost': 109}, {'doc': '843404073153', 'boost': 73}, {'doc': '885909457601', 'boost': 62}, {'doc': '635753493559', 'boost': 62}, {'doc': '885909472376', 'boost': 61}, {'doc': '610839379408', 'boost': 29}, {'doc': '884962753071', 'boost': 28}]

Boost Query: 
"885909457588"^966 "885909457595"^205 "885909471812"^202 "886111287055"^109 "843404073153"^73 "885909457601"^62 "635753493559"^62 "885909472376"^61 "610839379408"^29 "884962753071"^28

Main Query:
{'query': 'ipad', 'fields': ['upc', 'name', 'manufacturer', 'score'], 'limit': 3, 'params': {'qf': 'name manufacturer longDescription', 'defType': 'edismax', 'indent': 'true', 'sort': 'score desc, upc asc', 'boost': 'sum(1,query({! df=upc v=$signals_boosting}))', 'signals_boosting': '"885909457588"^966 "885909457595"^205 "885909471812"^202 "886111287055"^109 "843404073153"^73 "885909

## Raw Signals Boosts (Case-sensitive)

### Listing 8.1

In [21]:
query = "885909457588" #most popular iPad model

def show_raw_boosted_queries(collection, query=None):
    """Print the queries, and their boosts, recorded for one document.

    Args:
        collection: name of the signals-boosts collection to search.
        query: value matched (as a quoted phrase) against the ``doc`` field, e.g. a
            product UPC. When omitted, the notebook-level ``query`` variable is used,
            which keeps existing single-argument calls working.

    Raises:
        NameError: if ``query`` is omitted and no notebook-level ``query`` exists.

    Returns:
        None. Up to 20 ``"<query>" : <boost>`` lines are printed, highest boost first.
    """
    if query is None:
        # Backward-compatible fallback: earlier cells call this with only the
        # collection and rely on the global `query` set in the notebook.
        try:
            query = globals()["query"]
        except KeyError:
            raise NameError("name 'query' is not defined") from None

    signals_boosts_query = {
        "query": "\"" + query + "\"",
        "fields": ["query", "boost"],
        "limit": 20,
        "params": {
          "defType": "edismax",
          "qf": "doc",
          "sort": "boost desc"
        }
    }
    response = engine.search(collection, signals_boosts_query)
    # One line per boost document, each terminated by a newline.
    boosted_queries = "".join(
        f'"{boost["query"]}" : {boost["boost"]}\n' for boost in engine.docs(response)
    )
    print("Raw Boosted Queries")
    print(boosted_queries)
    
# Show the un-normalized boosts; the lookup value is the notebook-level `query` (a UPC here).
signals_boosting_collection = "basic_signals_boosts"    
show_raw_boosted_queries(signals_boosting_collection)
    

Raw Boosted Queries
"iPad" : 1050
"ipad" : 966
"Ipad" : 829
"iPad 2" : 509
"ipad 2" : 347
"Ipad2" : 261
"ipad2" : 238
"Ipad 2" : 213
"I pad" : 203
"i pad" : 133
"IPad" : 77
"Apple" : 76
"I pad 2" : 60
"apple ipad" : 55
"Apple iPad" : 53
"ipads" : 43
"tablets" : 42
"apple" : 41
"iPads" : 38
"i pad 2" : 38



## Normalized Signals Boosting (Case-insensitive)

### Listing 8.2

In [22]:
# Source collection of raw signals and destination collection for the normalized boosts.
signals_collection = "signals"
signals_boosting_collection = "normalized_signals_boosts"

# Same click/query self-join as the basic aggregation, but the query text is lower-cased
# before grouping, so case variants of one query share a single boost per document.
normalized_signals_aggregation_query = """
      select lower(q.target) as query, c.target as doc, count(c.target) as boost
        from signals c left join signals q on c.query_id = q.query_id
        where c.type = 'click' AND q.type = 'query'
        group by query, doc
        order by boost desc
        """

aggregate_signals(signals_collection, signals_boosting_collection,
                  normalized_signals_aggregation_query)

Wiping 'normalized_signals_boosts' collection
Status: Success
Creating 'normalized_signals_boosts' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


In [23]:
# Repeat the lookup against the normalized collection. The lookup value is the notebook-level
# `query`, which is still the UPC assigned in the Listing 8.1 cell.
# NOTE(review): the saved output of this cell is empty — possibly the freshly written boosts
# were not yet committed (the writer uses commit_within=5000); re-run the cell to confirm.
show_raw_boosted_queries(signals_boosting_collection)

Raw Boosted Queries



## Star Wars Search before User Manipulation

### Figure 8.2

In [25]:
# Boosted "star wars" search using the normalized boosts collection.
query = "star wars"
collection = "products"
signals_boosts = get_query_time_boosts(query, "normalized_signals_boosts")
boosted_query = get_main_query(query, signals_boosts)
# Filter upc 45626176 (the document targeted by the spam simulation in Listing 8.3) out of
# the results.
# NOTE(review): the saved boost output for this cell already lists that doc with boost 5000,
# which suggests the signals collection still contained spam signals from an earlier run —
# confirm, or reset the signals before running this cell.
boosted_query["params"]["fq"] = "-upc:45626176"

search_results = engine.search(collection, boosted_query)
display_product_search(query, engine.docs(search_results))

Boost Documents:
[{'doc': '45626176', 'boost': 5000}, {'doc': '024543742180', 'boost': 1490}, {'doc': '400032015667', 'boost': 186}, {'doc': '024543742074', 'boost': 127}, {'doc': '024543559856', 'boost': 117}, {'doc': '014633169546', 'boost': 107}, {'doc': '024543742098', 'boost': 81}, {'doc': '024543781875', 'boost': 67}, {'doc': '023272342654', 'boost': 39}, {'doc': '014633169522', 'boost': 32}]

Boost Query: 
"45626176"^5000 "024543742180"^1490 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32



## Simulating a malicious user

### Listing 8.3

In [26]:
collection = "signals"
spam_user = "u8675309"
spam_query = "star wars"
#doc for a "trash compactor" from someone who wants star wars is rubbish!
spam_signal_boost_doc_upc = "45626176"

signal_docs = []
for num in range(5000): #generate 5,000 query and click signals
    # Derive the query id from spam_user so the two cannot drift apart.
    query_id = f"{spam_user}_0_{num}"
    # The trailing "Z" declares UTC, so the clock must be read in UTC as well
    # (a naive local datetime.now() would be mislabeled on non-UTC machines).
    signal_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Each fake search is a query signal plus a click signal sharing one query_id.
    signal_docs.append({
        "query_id": query_id,
        "user": spam_user,
        "type":"query",
        "target": spam_query,
        "signal_time": signal_time,
        "id": f"spam_signal_query_{num}"
    })
    signal_docs.append({
        "query_id": query_id,
        "user": spam_user,
        "type":"click",
        "target": spam_signal_boost_doc_upc,
        "signal_time": signal_time,
        "id": f"spam_signal_click_{num}"
    })
engine.add_documents(collection, signal_docs)

#re-run the normalized signals aggregation to process the malicious clicks
signals_aggregation_collection = "signals_boosts_with_spam"
aggregate_signals(collection, signals_aggregation_collection,
                  normalized_signals_aggregation_query)


Adding Documents to 'signals' collection
Wiping 'signals_boosts_with_spam' collection
Status: Success
Creating 'signals_boosts_with_spam' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


## Impact of Spam on Search Results

### Listing 8.4

In [27]:
# Same boosted search as before, but using the boosts aggregated after the spam signals
# were added, to show their effect on the "star wars" results.
query = "star wars"
collection = "products"

signals_boosts = get_query_time_boosts(query, "signals_boosts_with_spam")
boosted_query = get_main_query(query, signals_boosts)

response = engine.search(collection, boosted_query)
display_product_search(query, engine.docs(response))

Boost Documents:
[{'doc': '45626176', 'boost': 5000}, {'doc': '024543742180', 'boost': 1490}, {'doc': '400032015667', 'boost': 186}, {'doc': '024543742074', 'boost': 127}, {'doc': '024543559856', 'boost': 117}, {'doc': '014633169546', 'boost': 107}, {'doc': '024543742098', 'boost': 81}, {'doc': '024543781875', 'boost': 67}, {'doc': '023272342654', 'boost': 39}, {'doc': '014633169522', 'boost': 32}]

Boost Query: 
"45626176"^5000 "024543742180"^1490 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32



## Fighting Signal Spam through User-level Deduplication

### Listing 8.5

In [28]:
#One Signal per User - Anti-Spam
signals_collection = "signals"
signals_aggregation_collection = "signals_boosts_anti_spam"

# The inner query collapses all clicks by one user for the same (lower-cased query, doc)
# into a single row; the outer query then counts those rows, so each user contributes at
# most 1 to a given query/doc boost. The inner column aliased `boost` holds the latest
# signal_time and is not read by the outer query (the outer `boost` is the count).
anti_spam_aggregation_query = """
  select query, doc, count(doc) as boost from (
    select c.user unique_user, lower(q.target) as query, c.target as doc, max(c.signal_time) as boost
    from signals c left join signals q on c.query_id = q.query_id
    where c.type = 'click' AND q.type = 'query'
    group by unique_user, query, doc
  ) as x
  group by query, doc
  order by boost desc
"""

aggregate_signals(signals_collection, signals_aggregation_collection,
                  anti_spam_aggregation_query)

Wiping 'signals_boosts_anti_spam' collection
Status: Success
Creating 'signals_boosts_anti_spam' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


In [30]:
# Boosted "star wars" search using the per-user de-duplicated (anti-spam) boosts.
query = "star wars"
collection = "products"

signals_boosts = get_query_time_boosts(query, "signals_boosts_anti_spam")
boosted_query = get_main_query(query, signals_boosts)
print(boosted_query)

response = engine.search(collection, boosted_query)
display_product_search(query, engine.docs(response))

Boost Documents:
[{'doc': '024543742180', 'boost': 1489}, {'doc': '400032015667', 'boost': 186}, {'doc': '024543742074', 'boost': 127}, {'doc': '024543559856', 'boost': 117}, {'doc': '014633169546', 'boost': 107}, {'doc': '024543742098', 'boost': 81}, {'doc': '024543781875', 'boost': 67}, {'doc': '023272342654', 'boost': 39}, {'doc': '014633169522', 'boost': 32}, {'doc': '024543560067', 'boost': 30}]

Boost Query: 
"024543742180"^1489 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32 "024543560067"^30

{'query': 'star wars', 'fields': ['upc', 'name', 'manufacturer', 'score'], 'limit': 3, 'params': {'qf': 'name manufacturer longDescription', 'defType': 'edismax', 'indent': 'true', 'sort': 'score desc, upc asc', 'boost': 'sum(1,query({! df=upc v=$signals_boosting}))', 'signals_boosting': '"024543742180"^1489 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "0245437

## Mixing multiple signal types

### Listing 8.6

In [31]:
#Weighted Signal Types - clicks, add-to-carts and purchases contribute differently to a boost

signals_collection="signals"
signals_aggregation_collection="signals_boosts_weighted_types"

# Innermost query: every non-query signal is joined to its query signal and turned into three
# 0/1 indicator columns (click, add_to_cart, purchase). Middle query: the indicators are summed
# per (lower-cased query, doc). Outer query: the sums are combined with weights 1 (click),
# 10 (add-to-cart) and 25 (purchase). No per-user de-duplication is applied in this query.
mixed_signal_types_aggregation = """
select query, doc, ((1 * click_boost) + (10 * add_to_cart_boost) +
                    (25 * purchase_boost)) as boost
from (
  select query, doc, 
    sum(click) as click_boost,
    sum(add_to_cart) as add_to_cart_boost,
    sum(purchase) as purchase_boost
  from (  
      select lower(q.target) as query, cap.target as doc, 
        if(cap.type = 'click', 1, 0) as click, 
        if(cap.type = 'add-to-cart', 1, 0) as  add_to_cart, 
        if(cap.type = 'purchase', 1, 0) as purchase
      from signals cap left join signals q on cap.query_id = q.query_id
      where (cap.type != 'query' AND q.type = 'query')
    ) raw_signals
  group by query, doc
) as per_type_boosts
"""

aggregate_signals(signals_collection, signals_aggregation_collection,
                  mixed_signal_types_aggregation)

Wiping 'signals_boosts_weighted_types' collection
Status: Success
Creating 'signals_boosts_weighted_types' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


## Time Decay

### Listing 8.7

In [32]:
signals_collection="signals"
signals_boosting_collection="signals_boosts_time_weighted"

half_life_days = "30"
target_date = "2020-06-01" #date of latest signal in our dataset. In live system use now().
signal_weight = "1" #can make this a function to differentiate weights for different signal types

# Innermost query: keep only the latest click per (user, lower-cased query, doc) up to
# target_date. Grouping uses lower(q.target) — the same value that is selected as `query` —
# so a user who issued case variants of one query ("Star Wars", "star wars") is still
# counted once per document, matching the per-user de-duplication of the anti-spam query.
# Middle query: weight each remaining signal by signal_weight * 0.5 ** (age_in_days / half_life_days).
# Outer query: sum the weighted signals per (query, doc) to produce the boost.
time_decay_aggregation = f"""
select query, doc, sum(time_weighted_boost) as boost from (
    select user, query, doc, {signal_weight} * pow(0.5, (datediff('{target_date}', signal_time) / {half_life_days})) as time_weighted_boost from (
        select c.user as user, lower(q.target) as query, c.target as doc, max(c.signal_time) as signal_time
        from signals c left join signals q on c.query_id = q.query_id
        where c.type = 'click' AND q.type = 'query' AND c.signal_time <= '{target_date}'
        group by c.user, lower(q.target), c.target
    ) as raw_signals 
) as time_weighted_signals
group by query, doc
order by boost desc
"""

aggregate_signals(signals_collection, signals_boosting_collection,
                  time_decay_aggregation)

Wiping 'signals_boosts_time_weighted' collection
Status: Success
Creating 'signals_boosts_time_weighted' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


In [33]:
# Boosted "star wars" search using the time-decayed boosts.
query = "star wars"
collection = "products"

signals_boosts = get_query_time_boosts(query, "signals_boosts_time_weighted")
boosted_query = get_main_query(query, signals_boosts)
print(boosted_query)

response = engine.search(collection, boosted_query)
display_product_search(query, engine.docs(response))

Boost Documents:
[{'doc': '024543742180', 'boost': 50.27564958868424}, {'doc': '023272342654', 'boost': 14.386081175228192}, {'doc': '024543742074', 'boost': 10.978708113139781}, {'doc': '883929154012', 'boost': 10.22196625832087}, {'doc': '023272342630', 'boost': 8.08610476717647}, {'doc': '886973561621', 'boost': 7.196654524422255}, {'doc': '024543742098', 'boost': 6.411867877358416}, {'doc': '030206742121', 'boost': 6.0339109822265105}, {'doc': '014633169546', 'boost': 5.732256133253928}, {'doc': '708431390614', 'boost': 5.200090923395828}]

Boost Query: 
"024543742180"^50.27564958868424 "023272342654"^14.386081175228192 "024543742074"^10.978708113139781 "883929154012"^10.22196625832087 "023272342630"^8.08610476717647 "886973561621"^7.196654524422255 "024543742098"^6.411867877358416 "030206742121"^6.0339109822265105 "014633169546"^5.732256133253928 "708431390614"^5.200090923395828

{'query': 'star wars', 'fields': ['upc', 'name', 'manufacturer', 'score'], 'limit': 3, 'params': {'qf'

## Index-time Signals Boosting

In [35]:
#Create collection with index-time signals boosts field
products_collection = "products_with_signals_boosts"
engine.create_collection(products_collection)
# Adds the "signals_boosts" field; the cell output shows a 'boosts' field type being added
# to the collection first.
engine.upsert_boosts_field(products_collection, "signals_boosts")

#Load products
engine.populate_collection_from_csv(products_collection, "../data/retrotech/products.csv")

Wiping 'products_with_signals_boosts' collection
Status: Success
Creating 'products_with_signals_boosts' collection
Status: Success
Adding 'boosts' field type to collection
Status: Success
Adding 'signals_boosts' field to collection
Status: Success
Loading products_with_signals_boosts
products_with_signals_boosts Schema: 
root
 |-- upc: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- longDescription: string (nullable = true)

Status: Success


### Listing 8.8

In [36]:
#load the aggregated signals from one of our previous signals aggregations
# and expose them to Spark SQL under a view named after the collection
signals_boosts_collection = "normalized_signals_boosts"
signals_boosts_opts = {"zkhost": "aips-zk", "collection": signals_boosts_collection}
df = spark.read.format("solr").options(**signals_boosts_opts).load()
df.createOrReplaceTempView(signals_boosts_collection)

#register the product table so we can load from it and save back to it with boosts added
products_collection = "products_with_signals_boosts" #redundant: same value as assigned when the collection was created; marked for later removal
products_read_boosts_opts = {"zkhost": "aips-zk", "collection": products_collection}
df = spark.read.format("solr").options(**products_read_boosts_opts).load()
df.createOrReplaceTempView(products_collection)

#insert all keywords with signals boosts for this document into a new "signals_boosts" field
# Each product's value is a comma-separated list of "query|boost" entries. The inner join
# means only products that have at least one boost row are selected (and written back).
boosts_query = f"""
SELECT p.*, b.signals_boosts from (
  SELECT doc, concat_ws(',',collect_list(concat(query, '|', boost))) as signals_boosts 
    FROM {signals_boosts_collection} GROUP BY doc
) b inner join {products_collection} p on p.upc = b.doc
"""

#save the products back to the products collection, with the updated signals boosts added
product_options = {"zkhost": "aips-zk", "collection": products_collection,
                   "gen_uniq_key": "true", "commit_within": "5000"}
spark.sql(boosts_query).write.format("solr").options(
  **product_options).mode("overwrite").save()
print("Completed!")

Completed!


## Index-time Signals Boosting Query

### Listing 8.9

In [37]:
def get_query(query, signals_boosts_field):
    """Build the product search request that applies index-time signals boosts.

    Args:
        query: the user's query text; it is used both as the search query and as the
            quoted term passed to the ``payload`` boost function.
        signals_boosts_field: name of the document field holding the per-query
            boosts (e.g. ``"signals_boosts"``).

    Returns:
        A request dict asking for the top 3 products (upc, name, manufacturer,
        score), sorted by score and then upc.
    """
    return {
        "query": query,
        "fields": ["upc", "name", "manufacturer", "score"],
        "limit": 3,
        "params": {
          # "qf" is listed once; a dict literal silently keeps only the last duplicate key.
          "qf": "name manufacturer longDescription",
          "defType": "edismax",
          "indent": "true",
          "sort": "score desc, upc asc",
          # NOTE(review): presumably reads the payload stored for the query term in the
          # boosts field, with 1 as the default — confirm against the Solr payload() docs.
          "boost": f"payload({signals_boosts_field}, \"{query}\", 1, first)"
        }
    }
    
# Run the index-time boosted search for "ipad" against the products collection whose
# documents carry the "signals_boosts" field, and display the top results.
query = "ipad"
collection = "products_with_signals_boosts"
boosted_query = get_query(query, "signals_boosts")

response = engine.search(collection, boosted_query)
display_product_search(query, engine.docs(response))

Up next: Chapter 9 - [Personalized Search](../ch09/1.personalization.ipynb)