# Signals Boosting

NOTE: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch4/1.ch4-setting-up-the-retrotech-dataset.ipynb) notebook.

In [13]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("aips-ch4-signals-boosting").getOrCreate()

## Keyword Search with No Signals Boosting

### Figure 8.1

In [14]:
query = "ipad"

collection = "products"
request = {
    "query": query,
    "fields": ["upc", "name", "manufacturer", "score"],
    "limit": 3,
    "params": {
      "qf": "name manufacturer longDescription",
      "defType": "edismax",
      "indent": "true",
      "sort": "score desc, upc asc"
    }
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()["response"]["docs"]
display(HTML(render_search_results(query, search_results)))

## Create Signals Boosts (Signals Aggregation)

### Basic Signals Boosting Model

In [15]:
def aggregate_signals(signals_collection, signals_aggregation_collection, signals_aggregation_query):

    create_collection(signals_aggregation_collection)

    print("Aggregating Signals to Create Signals Boosts...")
    signals_opts={"zkhost": "aips-zk", "collection": signals_collection}
    signals_boosting_opts={"zkhost": "aips-zk", "collection": signals_aggregation_collection, "gen_uniq_key": "true", "commit_within": "5000"}
    df = spark.read.format("solr").options(**signals_opts).load()
    df.registerTempTable("signals")


    spark.sql(signals_aggregation_query).write.format("solr").options(**signals_boosting_opts).mode("overwrite").save()
    print("Signals Aggregation Completed!")

In [16]:
signals_collection = "signals"
signals_aggregation_collection = "basic_signals_boosts"

basic_signals_aggregation_query = """
  select q.target as query, c.target as doc, count(c.target) as boost
    from signals c left join signals q on c.query_id = q.query_id
    where c.type = 'click' AND q.type = 'query'
    group by query, doc
    order by boost desc
  """

aggregate_signals(signals_collection, signals_aggregation_collection, basic_signals_aggregation_query)

Wiping 'basic_signals_boosts' collection
[('action', 'CREATE'), ('name', 'basic_signals_boosts'), ('numShards', 1), ('replicationFactor', 1)]
Creating basic_signals_boosts' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


## Search with Signals Boosts Applied

### Signals Boosting Query

In [17]:
query = "ipad"

def get_query_time_boosts(query, signals_boosting_collection):
    signals_boosts_query = {
        "query": "\"" + query + "\"",
        "fields": ["doc", "boost"],
        "limit": 10,
        "params": {
          "defType": "edismax",
          "qf": "query",
          "sort": "boost desc"
        }
    }

    signals_boosts = requests.post(solr_url + signals_boosting_collection + "/select", json=signals_boosts_query).json()["response"]["docs"]
    print("Boost Documents: \n")
    print(signals_boosts)

    product_boosts = ""
    for entry in signals_boosts:
        if len(product_boosts) > 0:  product_boosts += " "
        product_boosts += '"' + entry['doc'] + '"^' + str(entry['boost'])

    print("\nBoost Query: \n" + product_boosts + "\n")
    
    return product_boosts


def get_main_query(query, signals_boosts):    
    request = {
        "query": query,
        "fields": ["upc", "name", "manufacturer", "score"],
        "limit": 3,
        "params": {
          "qf": "name manufacturer longDescription",
          "defType": "edismax",
          "indent": "true",
          "sort": "score desc, upc asc",
          "qf": "name manufacturer longDescription",
          "boost": "sum(1,query({! df=upc v=$signals_boosting}))",
          "signals_boosting": signals_boosts
        }
    }
    
    return request

collection = "products"
signals_boosts = get_query_time_boosts(query, "basic_signals_boosts")
boosted_query = get_main_query(query, signals_boosts)
print("Main Query:")
print(boosted_query)

search_results = requests.post(solr_url + collection + "/select", json=boosted_query).json()["response"]["docs"]
print("\nSearch Results (Basic Signals Boosting): ")
print(search_results)
display(HTML(render_search_results(query, search_results)))

Boost Documents: 

[{'doc': '885909457588', 'boost': 966}, {'doc': '885909457595', 'boost': 205}, {'doc': '885909471812', 'boost': 202}, {'doc': '886111287055', 'boost': 109}, {'doc': '843404073153', 'boost': 73}, {'doc': '885909457601', 'boost': 62}, {'doc': '635753493559', 'boost': 62}, {'doc': '885909472376', 'boost': 61}, {'doc': '610839379408', 'boost': 29}, {'doc': '884962753071', 'boost': 28}]

Boost Query: 
"885909457588"^966 "885909457595"^205 "885909471812"^202 "886111287055"^109 "843404073153"^73 "885909457601"^62 "635753493559"^62 "885909472376"^61 "610839379408"^29 "884962753071"^28

Main Query:
{'query': 'ipad', 'fields': ['upc', 'name', 'manufacturer', 'score'], 'limit': 3, 'params': {'qf': 'name manufacturer longDescription', 'defType': 'edismax', 'indent': 'true', 'sort': 'score desc, upc asc', 'boost': 'sum(1,query({! df=upc v=$signals_boosting}))', 'signals_boosting': '"885909457588"^966 "885909457595"^205 "885909471812"^202 "886111287055"^109 "843404073153"^73 "8859

## Raw Signals Boosts (Case-sensitive)

### Listing 8.1

In [18]:
query = "885909457588" #most popular iPad model

def show_raw_boosted_queries(signals_boosting_collection):
    signals_boosts_query = {
        "query": "\"" + query + "\"",
        "fields": ["query", "boost"],
        "limit": 20,
        "params": {
          "defType": "edismax",
          "qf": "doc",
          "sort": "boost desc"
        }
    }

    signals_boosts = requests.post(solr_url + signals_boosting_collection 
                     + "/select", json=signals_boosts_query).json()["response"]["docs"]

    boosted_queries = ""
    for entry in signals_boosts:
        boosted_queries += '"' + entry['query'] + '" : ' + str(entry['boost']) + "\n"

    print("Raw Boosted Queries")
    print(boosted_queries)
    
signals_boosting_collection = "basic_signals_boosts"    
show_raw_boosted_queries(signals_boosting_collection)
    

Raw Boosted Queries
"iPad" : 1050
"ipad" : 966
"Ipad" : 829
"iPad 2" : 509
"ipad 2" : 347
"Ipad2" : 261
"ipad2" : 238
"Ipad 2" : 213
"I pad" : 203
"i pad" : 133
"IPad" : 77
"Apple" : 76
"I pad 2" : 60
"apple ipad" : 55
"Apple iPad" : 53
"ipads" : 43
"tablets" : 42
"apple" : 41
"iPads" : 38
"i pad 2" : 38



## Normalized Signals Boosting (Case-insensitive)

### Listing 8.2

In [19]:
signals_collection = "signals"
signals_boosting_collection = "normalized_signals_boosts"

normalized_signals_aggregation_query = """
      select lower(q.target) as query, c.target as doc, count(c.target) as boost
        from signals c left join signals q on c.query_id = q.query_id
        where c.type = 'click' AND q.type = 'query'
        group by query, doc
        order by boost desc
        """

aggregate_signals(signals_collection, signals_boosting_collection, normalized_signals_aggregation_query)

Wiping 'normalized_signals_boosts' collection
[('action', 'CREATE'), ('name', 'normalized_signals_boosts'), ('numShards', 1), ('replicationFactor', 1)]
Creating normalized_signals_boosts' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


In [20]:
show_raw_boosted_queries(signals_boosting_collection)

Raw Boosted Queries
"ipad" : 2939
"ipad 2" : 1104
"ipad2" : 540
"i pad" : 341
"apple ipad" : 152
"ipads" : 123
"apple" : 118
"i pad 2" : 99
"tablets" : 67
"tablet" : 61
"ipad 1" : 52
"apple ipad 2" : 27
"hp touchpad" : 26
"ipaq" : 20
"i pad2" : 19
"wi" : 19
"apple computers" : 18
"apple i pad" : 15
"ipad 2 16gb" : 15
"samsung galaxy" : 14



## Start Wars Search before User Manipulation

### Figure 8.2

In [21]:
query = "star wars"
collection = "products"

signals_boosts = get_query_time_boosts(query, "normalized_signals_boosts")
boosted_query = get_main_query(query, signals_boosts)
#print(boosted_query)

search_results = requests.post(solr_url + collection + "/select?fq=-upc:45626176", json=boosted_query).json()["response"]["docs"]
print(search_results)
display(HTML(render_search_results(query, search_results)))

Boost Documents: 

[{'doc': '024543742180', 'boost': 1490}, {'doc': '400032015667', 'boost': 186}, {'doc': '024543742074', 'boost': 127}, {'doc': '024543559856', 'boost': 117}, {'doc': '014633169546', 'boost': 107}, {'doc': '024543742098', 'boost': 81}, {'doc': '024543781875', 'boost': 67}, {'doc': '023272342654', 'boost': 39}, {'doc': '014633169522', 'boost': 32}, {'doc': '024543560067', 'boost': 30}]

Boost Query: 
"024543742180"^1490 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32 "024543560067"^30

[{'upc': '400032015667', 'name': 'Star Wars - The Complete Saga - Blu-ray Disc', 'manufacturer': 'LucasFilm', 'score': 4542.926}, {'upc': '696055169191', 'name': 'Star Wars: Battlefront II - PSP', 'manufacturer': 'LucasArts', 'score': 6.6077175}, {'upc': '23272335397', 'name': 'Star Wars Battlefront: Elite Squadron - PSP', 'manufacturer': 'LucasArts', 'score': 6.327446}]


## Simulating a malicious user

### Listing 8.3

In [22]:
import datetime

spam_user = "u8675309"
spam_query = "star wars"

#doc for a "trash compactor" from someone who wants star wars is rubbish!
spam_signal_boost_doc_upc = "45626176" 


#generate 5,000 query and click signals
num = 0
while (num < 5000): #generate 5,000 query and click signals
    query_id = "u8675309_0_" + str(num)
    
    next_query_signal = { 
        "query_id": query_id,
        "user": spam_user,
        "type":"query",
        "target": spam_query,
        "signal_time": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"),
        "id":"spam_signal_query_" + str(num)
    }
    
    next_click_signal = { 
        "query_id": query_id,
        "user": spam_user,
        "type":"click",
        "target": spam_signal_boost_doc_upc,
        "signal_time": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"),
        "id":"spam_signal_click_" + str(num)
    }
    
    collection = "signals"
    requests.post(solr_url + collection + "/update/json/docs", json=next_query_signal)
    requests.post(solr_url + collection + "/update/json/docs", json=next_click_signal)
    num+=1

#commit
requests.post(solr_url + collection + "/update/json/docs?commit=true")

#re-run the basic signals aggregation to process the malicious clicks
signals_collection = "signals"
signals_aggregation_collection = "signals_boosts_with_spam"
aggregate_signals(signals_collection, signals_aggregation_collection, normalized_signals_aggregation_query)

Wiping 'signals_boosts_with_spam' collection
[('action', 'CREATE'), ('name', 'signals_boosts_with_spam'), ('numShards', 1), ('replicationFactor', 1)]
Creating signals_boosts_with_spam' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


## Impact of Spam on Search Results

### Listing 8.4

In [23]:
query = "star wars"
collection = "products"

signals_boosts = get_query_time_boosts(query, "signals_boosts_with_spam")
boosted_query = get_main_query(query, signals_boosts)

search_results = requests.post(solr_url + collection + "/select", json=boosted_query).json()["response"]["docs"]
print(search_results)
display(HTML(render_search_results(query, search_results)))

Boost Documents: 

[{'doc': '45626176', 'boost': 5000}, {'doc': '024543742180', 'boost': 1490}, {'doc': '400032015667', 'boost': 186}, {'doc': '024543742074', 'boost': 127}, {'doc': '024543559856', 'boost': 117}, {'doc': '014633169546', 'boost': 107}, {'doc': '024543742098', 'boost': 81}, {'doc': '024543781875', 'boost': 67}, {'doc': '023272342654', 'boost': 39}, {'doc': '014633169522', 'boost': 32}]

Boost Query: 
"45626176"^5000 "024543742180"^1490 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32

[{'upc': '45626176', 'name': 'Trash Can (Star Wars Themed)', 'manufacturer': 'Jay Franco & Sons', 'score': 144994.0}, {'upc': '400032015667', 'name': 'Star Wars - The Complete Saga - Blu-ray Disc', 'manufacturer': 'LucasFilm', 'score': 4542.926}, {'upc': '696055169191', 'name': 'Star Wars: Battlefront II - PSP', 'manufacturer': 'LucasArts', 'score': 6.6077175}]


## Fighting Signal Spam through User-level Deduplication

### Listing 8.5

In [24]:
#One Signal per User - Anti-Spam
signals_collection = "signals"
signals_aggregation_collection = "signals_boosts_anti_spam"

anti_spam_aggregation_query = """
  select query, doc, count(doc) as boost from (
    select c.user unique_user, lower(q.target) as query, c.target as doc, max(c.signal_time) as boost
    from signals c left join signals q on c.query_id = q.query_id
    where c.type = 'click' AND q.type = 'query'
    group by unique_user, query, doc
  ) as x
  group by query, doc
  order by boost desc
"""

aggregate_signals(signals_collection, signals_aggregation_collection, anti_spam_aggregation_query)

Wiping 'signals_boosts_anti_spam' collection
[('action', 'CREATE'), ('name', 'signals_boosts_anti_spam'), ('numShards', 1), ('replicationFactor', 1)]
Creating signals_boosts_anti_spam' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


In [25]:
query = "star wars"
collection = "products"

signals_boosts = get_query_time_boosts(query, "signals_boosts_anti_spam")
boosted_query = get_main_query(query, signals_boosts)

print(boosted_query)

search_results = requests.post(solr_url + collection + "/select", json=boosted_query).json()["response"]["docs"]
print(search_results)
display(HTML(render_search_results(query, search_results)))

Boost Documents: 

[{'doc': '024543742180', 'boost': 1489}, {'doc': '400032015667', 'boost': 186}, {'doc': '024543742074', 'boost': 127}, {'doc': '024543559856', 'boost': 117}, {'doc': '014633169546', 'boost': 107}, {'doc': '024543742098', 'boost': 81}, {'doc': '024543781875', 'boost': 67}, {'doc': '023272342654', 'boost': 39}, {'doc': '014633169522', 'boost': 32}, {'doc': '024543560067', 'boost': 30}]

Boost Query: 
"024543742180"^1489 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "024543742098"^81 "024543781875"^67 "023272342654"^39 "014633169522"^32 "024543560067"^30

{'query': 'star wars', 'fields': ['upc', 'name', 'manufacturer', 'score'], 'limit': 3, 'params': {'qf': 'name manufacturer longDescription', 'defType': 'edismax', 'indent': 'true', 'sort': 'score desc, upc asc', 'boost': 'sum(1,query({! df=upc v=$signals_boosting}))', 'signals_boosting': '"024543742180"^1489 "400032015667"^186 "024543742074"^127 "024543559856"^117 "014633169546"^107 "02454

## Mixing multiple signal types

### Listing 8.6

In [26]:
#One Signal per User - Anti-Spam

signals_collection="signals"
signals_aggregation_collection="signals_boosts_weighted_types"

mixed_signal_types_aggregation = """
select query, doc, 
  ( (1 * click_boost) + (10 * add_to_cart_boost) + (25 * purchase_boost) ) as boost
from (
  select query, doc, 
    sum(click) as click_boost,
    sum(add_to_cart) as add_to_cart_boost,
    sum(purchase) as purchase_boost
  from (  
      select lower(q.target) as query, cap.target as doc, 
        if(cap.type = 'click', 1, 0) as click, 
        if(cap.type = 'add-to-cart', 1, 0) as  add_to_cart, 
        if(cap.type = 'purchase', 1, 0) as purchase
      from signals cap left join signals q on cap.query_id = q.query_id
      where (cap.type != 'query' AND q.type = 'query')
    ) raw_signals
  group by query, doc
) as per_type_boosts
"""

aggregate_signals(signals_collection, signals_aggregation_collection, mixed_signal_types_aggregation)

Wiping 'signals_boosts_weighted_types' collection
[('action', 'CREATE'), ('name', 'signals_boosts_weighted_types'), ('numShards', 1), ('replicationFactor', 1)]
Creating signals_boosts_weighted_types' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


## Time Decay

### Listing 8.7

In [27]:
signals_collection="signals"
signals_boosting_collection="signals_boosts_time_weighted"

half_life_days = "30"
target_date = "2020-06-01 00:00:00.0000" #date of latest signal in our dataset. In live system use now().
signal_weight = "1" #can make this a function to differentiate weights for different signal types

time_decay_aggregation = """
select query, doc, sum(time_weighted_boost) as boost from (
    select user, query, doc, """ + signal_weight + """ * pow(0.5, (datediff('""" + target_date + "', signal_time) / " + half_life_days + """)) as time_weighted_boost from (
        select c.user as user, lower(q.target) as query, c.target as doc, max(c.signal_time) as signal_time
        from signals c left join signals q on c.query_id = q.query_id
        where c.type = 'click' AND q.type = 'query' AND c.signal_time <= '""" + target_date + """'
        group by c.user, q.target, c.target
    ) as raw_signals 
) as time_weighted_signals
group by query, doc
order by boost desc
"""

aggregate_signals(signals_collection, signals_boosting_collection, time_decay_aggregation)

## Mixing multiple signal types

Wiping 'signals_boosts_time_weighted' collection
[('action', 'CREATE'), ('name', 'signals_boosts_time_weighted'), ('numShards', 1), ('replicationFactor', 1)]
Creating signals_boosts_time_weighted' collection
Status: Success
Aggregating Signals to Create Signals Boosts...
Signals Aggregation Completed!


In [28]:
query = "star wars"
collection = "products"

signals_boosts = get_query_time_boosts(query, "signals_boosts_time_weighted")
boosted_query = get_main_query(query, signals_boosts)

print(boosted_query)

search_results = requests.post(solr_url + collection + "/select", json=boosted_query).json()["response"]["docs"]
print(search_results)
display(HTML(render_search_results(query, search_results)))

Boost Documents: 

[{'doc': '024543742180', 'boost': 50.27564958868423}, {'doc': '023272342654', 'boost': 14.386081175228192}, {'doc': '024543742074', 'boost': 10.978708113139781}, {'doc': '883929154012', 'boost': 10.221966258320872}, {'doc': '023272342630', 'boost': 8.08610476717647}, {'doc': '886973561621', 'boost': 7.1966545244222555}, {'doc': '024543742098', 'boost': 6.411867877358414}, {'doc': '030206742121', 'boost': 6.0339109822265105}, {'doc': '014633169546', 'boost': 5.732256133253928}, {'doc': '708431390614', 'boost': 5.200090923395828}]

Boost Query: 
"024543742180"^50.27564958868423 "023272342654"^14.386081175228192 "024543742074"^10.978708113139781 "883929154012"^10.221966258320872 "023272342630"^8.08610476717647 "886973561621"^7.1966545244222555 "024543742098"^6.411867877358414 "030206742121"^6.0339109822265105 "014633169546"^5.732256133253928 "708431390614"^5.200090923395828

{'query': 'star wars', 'fields': ['upc', 'name', 'manufacturer', 'score'], 'limit': 3, 'params':

## Index-time Signals Boosting

In [29]:
#Create collection with index-time signals boosts field
products_collection="products_with_signals_boosts"
create_collection(products_collection)

#Modify Schema to make some fields explicitly searchable by keyword
upsert_text_field(products_collection, "upc")
upsert_text_field(products_collection, "name")
upsert_text_field(products_collection, "longDescription")
upsert_text_field(products_collection, "manufacturer")

#add signals boosting field
signals_boosts_field = "signals_boosts"
upsert_boosts_field(products_collection, signals_boosts_field)

#Load products
print("Loading Products with Signals Boosts...")
csvFile = "../data/retrotech/products.csv"
csvDF = spark.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load(csvFile)

product_update_opts={"zkhost": "aips-zk", "collection": products_collection, "gen_uniq_key": "true", "commit_within": "5000"}
csvDF.write.format("solr").options(**product_update_opts).mode("overwrite").save()
print("Products Schema: ")
csvDF.printSchema()
print("Status: Success")

Wiping 'products_with_signals_boosts' collection
[('action', 'CREATE'), ('name', 'products_with_signals_boosts'), ('numShards', 1), ('replicationFactor', 1)]
Creating products_with_signals_boosts' collection
Status: Success
Adding 'upc' field to collection
Status: Success
Adding 'name' field to collection
Status: Success
Adding 'longDescription' field to collection
Status: Success
Adding 'manufacturer' field to collection
Status: Success
Adding 'boosts' field type to collection
Status: Success
Adding 'signals_boosts' field to collection
Status: Success
Loading Products with Signals Boosts...
Products Schema: 
root
 |-- upc: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- longDescription: string (nullable = true)

Status: Success


### Listing 8.8

In [225]:
#load the aggregates signals from one of our previous signals aggregations
signals_boosts_collection="normalized_signals_boosts"
signals_boosts_opts={"zkhost": "aips-zk", "collection": signals_boosts_collection}
df = spark.read.format("solr").options(**signals_boosts_opts).load()
df.registerTempTable(signals_boosts_collection)

#register the product table so we can load from it and save back to it with boosts added
products_collection="products_with_signals_boosts" #delete this line later
products_read_boosts_opts={"zkhost": "aips-zk", "collection": products_collection}
df = spark.read.format("solr").options(**products_read_boosts_opts).load()
df.registerTempTable(products_collection)

#insert all keywords with signals boosts for this document into a new "signals_boosts" field
boosts_query = """
SELECT p.*, b.signals_boosts from (
  SELECT doc, concat_ws(',',collect_list(concat(query, '|', boost))) as signals_boosts FROM """ + signals_boosts_collection + """ GROUP BY doc
) b inner join """ + products_collection + """ p on p.upc = b.doc
"""

#save the products back to the products collection, with the updated signals boosts added
products_write_boosts_opts={"zkhost": "aips-zk", "collection": products_collection, "gen_uniq_key": "true", "commit_within": "5000"}
spark.sql(boosts_query).write.format("solr").options(**products_write_boosts_opts).mode("overwrite").save()
print("Completed!")

Completed!


## Index-time Signals Boosting Query

### Listing 8.9

In [5]:
query = "ipad"

def get_query(query, signals_boosts_field):    
    request = {
        "query": query,
        "fields": ["upc", "name", "manufacturer", "score"],
        "limit": 3,
        "params": {
          "qf": "name manufacturer longDescription",
          "defType": "edismax",
          "indent": "true",
          "sort": "score desc, upc asc",
          "qf": "name manufacturer longDescription",
          "boost": "payload(" + signals_boosts_field + ", \"" + query + "\", 1, first)"
        }
    }
    
    return request

collection = "products_with_signals_boosts"
boosted_query = get_query(query, signals_boosts_field)
print("Main Query:")
print(boosted_query)

search_results = requests.post(solr_url + collection + "/select", json=boosted_query).json()["response"]["docs"]
print("\nSearch Results (Basic Signals Boosting): ")
print(search_results)
display(HTML(render_search_results(query, search_results)))

Main Query:
{'query': 'ipad', 'fields': ['upc', 'name', 'manufacturer', 'score'], 'limit': 3, 'params': {'qf': 'name manufacturer longDescription', 'defType': 'edismax', 'indent': 'true', 'sort': 'score desc, upc asc', 'boost': 'payload(signals_boosts, "ipad", 1, first)'}}

Search Results (Basic Signals Boosting): 
[{'manufacturer': 'Apple&#xAE;', 'name': 'Apple&#xAE; - iPad&#xAE; 2 with Wi-Fi - 16GB - Black', 'upc': '885909457588', 'score': 6851.6655}, {'manufacturer': 'Apple&#xAE;', 'name': 'Apple&#xAE; - iPad&#xAE; 2 with Wi-Fi - 32GB - Black', 'upc': '885909457595', 'score': 1503.683}, {'manufacturer': 'Apple&#xAE;', 'name': 'Apple&#xAE; - iPad&#xAE; 2 with Wi-Fi - 16GB - White', 'upc': '885909471812', 'score': 1480.3701}]


In [10]:
def upsert_tf_boosts_field(collection_name, field_name, field_type_name="tf_boosts"):
    
    #clear out old field to ensure this function is idempotent
    delete_field = {"delete-field":{ "name":field_name }}
    response = requests.post(solr_url + collection_name + "/schema", json=delete_field).json()

    upsert_tf_boosts_field_type(collection_name, field_type_name);
    
    print("Adding '" + field_name + "' field to collection")
    add_field = {"add-field":{ "name":field_name, "type":"boosts", "stored":"true", "indexed":"true", "multiValued":"true" }}
    response = requests.post(solr_url + collection_name + "/schema", json=add_field).json()
    print_status(response)
    
upsert_tf_boosts_field("products_with_signals_boosts", "tf_boosts")

Adding 'tf_boosts' field type to collection
Adding 'tf_boosts' field to collection
Status: Success


In [9]:
def upsert_tf_boosts_field_type(collection_name, field_type_name):
    delete_field_type = {"delete-field-type":{ "name":field_type_name }}
    response = requests.post(solr_url + collection_name + "/schema", json=delete_field_type).json()

    print("Adding '" + field_type_name + "' field type to collection")
    add_field_type = { 
        "add-field-type" : {
            "name": field_type_name,
            "class":"solr.TextField",
            "positionIncrementGap":"100",
            "analyzer" : {
                "tokenizer": {
                    "class":"solr.PatternTokenizerFactory",
                    "pattern": "," },
                 "filters":[
                    { "class":"solr.LowerCaseFilterFactory" },
                    { "class":"solr.DelimitedTermFrequencyFilterFactory", "delimiter": "|", "encoder": "float" }]}}}
    

In [24]:
upsert_boosts_field("")

Adding 'boosts' field type to collection
Status: Success
Adding 'signals_boosts' field to collection
Status: Success


In [17]:
#signals_boosts_collection="signals_boosts"
products_collection="products"
create_collection(products_collection)

products_opts={"zkhost": "aips-zk", "collection": products_collection}
df = spark.read.format("solr").options(**products_opts).load()
df.registerTempTable(products_collection)
df.registerTempTable("products")

signals_collection="signals"
signals_opts={"zkhost": "aips-zk", "collection": signals_collection}
df = spark.read.format("solr").options(**signals_opts).load()
df.registerTempTable("signals")

products_query = "select p.*, s. from products"

signals_boosting_opts={"zkhost": "aips-zk", "collection": products_with_boosts_collection, "gen_uniq_key": "true", "commit_within": "5000"}
spark.sql(products_query).write.format("solr").options(**signals_boosting_opts).mode("overwrite").save()
print("Signals Aggregation Completed!")

Wiping 'products_with_signals_boosts_test' collection
[('action', 'CREATE'), ('name', 'products_with_signals_boosts_test'), ('numShards', 1), ('replicationFactor', 1)]
Creating products_with_signals_boosts_test' collection
Status: Success
Signals Aggregation Completed!
