# [ Chapter 9 - Personalized Search ]
# Personalized Search

**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook or execute the next cell uncommented.

In [1]:
import random
from datetime import datetime

from pyspark.conf import SparkConf
from pyspark.ml.feature import IndexToString, StringIndexer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

from aips import display_product_search, get_engine
from aips.spark import create_view_from_collection
from aips.spark.dataframe import from_sql

engine = get_engine()

In [2]:
#Recommended for making ALS run faster, if you have enough memory / cores allocated to docker
conf = SparkConf()
conf.set("spark.driver.memory", "8g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.dynamicAllocation.enabled", "true")
conf.set("spark.dynamicAllocation.executorMemoryOverhead", "8g")
spark = SparkSession.builder.appName("AIPS-ch9").config(conf=conf).getOrCreate()

In [3]:
#%run chapters/ch04/1.setting-up-the-retrotech-dataset.ipynb

In [4]:
### Load product data
products_collection = engine.get_collection("products")
create_view_from_collection(products_collection, "products", spark)

# Collaborative Filtering with Implicit Preferences

In [5]:
def aggregate_signals(signals_collection, signals_agg_collection_name, query):
    aggregated_collection = engine.create_collection(signals_agg_collection_name)
    print("Aggregating Signals to Create Signals Boosts...")
    create_view_from_collection(signals_collection, "signals", spark)
    aggregated_collection.write(from_sql(query, spark))
    print("Signals Aggregation Completed!")
    return aggregated_collection

## Listing 9.1

In [6]:
click_weight = 1
add_to_cart_weight = 0 ##increase to consider add-to-cart signals
purchase_weight = 0 ##increase to consider purchase signals

signals_collection = engine.get_collection("signals")

mixed_signal_types_aggregation = f"""
SELECT user, product,
  (click_boost + add_to_cart_boost + purchase_boost) AS rating
FROM (
  SELECT user, product, 
    SUM(click) AS click_boost,
    SUM(add_to_cart) AS add_to_cart_boost,
    SUM(purchase) AS purchase_boost
  FROM (  
    SELECT s.user, s.target AS product, 
      IF(s.type = 'click', {click_weight}, 0) AS click, 
      IF(s.type = 'add-to-cart', {add_to_cart_weight}, 0) AS add_to_cart,
      IF(s.type = 'purchase', {purchase_weight}, 0) AS purchase
    FROM signals s 
    WHERE (s.type != 'query')) AS raw_signals
  GROUP BY user, product) AS per_type_boosts"""

signals_agg_collection = \
  aggregate_signals(signals_collection, "user_product_implicit_preferences",
                    mixed_signal_types_aggregation)

Wiping "user_product_implicit_preferences" collection
Creating "user_product_implicit_preferences" collection
Aggregating Signals to Create Signals Boosts...
Successfully written 647441 documents
Signals Aggregation Completed!


## Listing 9.2

In [7]:
create_view_from_collection(signals_agg_collection, "user_product_implicit_preferences", spark)

##50K = all products
#This will take long time. Recommend setting to 1,000 if trying to just run through code
#without considering all products

top_product_count_for_recs = 50000 #1000 for older computers 
user_preference_query = f"""
SELECT user, product, rating
FROM user_product_implicit_preferences
WHERE product IN (
  SELECT product FROM (
    SELECT product, COUNT(user) user_count
    FROM user_product_implicit_preferences
    GROUP BY product
    ORDER BY user_count DESC
    LIMIT {top_product_count_for_recs}
  ) AS top_products)   
ORDER BY rating DESC"""

user_prefs = spark.sql(user_preference_query)

## Listing 9.3 

In [8]:
#Sometimes fails on first execution
#Fits a model to the input dataset with optional parameters.
def order_preferences(prefs):
    return prefs.orderBy(col("userIndex").asc(),
                         col("rating").desc(),
                         col("product").asc())

def strings_to_indexes(ratings, user_indexer,
                       product_indexer):
    transformed = product_indexer.transform(user_indexer.transform(ratings))
    return order_preferences(transformed)

def indexes_to_strings(ratings, user_indexer,
                       product_indexer):
    user_converter = IndexToString(inputCol="userIndex",
                                       outputCol="user",
                             labels=user_indexer.labels)
    product_converter = IndexToString(inputCol="productIndex",
                                          outputCol="product",
                                labels=product_indexer.labels)
    converted = user_converter.transform(
        product_converter.transform(ratings))
    return order_preferences(converted)

user_indexer = StringIndexer(inputCol="user", 
       outputCol="userIndex").fit(user_prefs)
product_indexer = StringIndexer(inputCol="product",
                          outputCol="productIndex").fit(user_prefs)

indexed_prefs = strings_to_indexes(user_prefs, user_indexer, product_indexer)
indexed_prefs.show(10)

+-------+------------+------+---------+------------+
|   user|     product|rating|userIndex|productIndex|
+-------+------------+------+---------+------------+
|u159789|008888345435|     1|      0.0|      5073.0|
|u159789|014633196870|     1|      0.0|      4525.0|
|u159789|018713571687|     1|      0.0|     10355.0|
|u159789|024543718710|     1|      0.0|       263.0|
|u159789|025192979620|     1|      0.0|     12289.0|
|u159789|025193102324|     1|      0.0|      9650.0|
|u159789|085391163121|     1|      0.0|      9196.0|
|u159789|720616236029|     1|      0.0|      2781.0|
|u159789|801213001996|     1|      0.0|     28736.0|
|u159789|813985010007|     1|      0.0|      5819.0|
+-------+------------+------+---------+------------+
only showing top 10 rows



## Listing 9.4

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

random.seed(0)

als = ALS(maxIter=3, rank=10, regParam=0.15, implicitPrefs=True,
          userCol="userIndex", itemCol="productIndex", ratingCol="rating",
          coldStartStrategy="drop", seed=0)

(training_data, test_data) = user_prefs.randomSplit([0.95, 0.05], 0)
training_data = strings_to_indexes(training_data, user_indexer, product_indexer)
test_data = strings_to_indexes(test_data, user_indexer, product_indexer)

print("Beginning model training")
model = als.fit(training_data)

print("Beginning predictions")
predictions = model.transform(test_data)

print("Beginning evaluation")
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Beginning model training
Beginning predictions
Beginning evaluation
Root-mean-square error = 1.0007877733299877


## Listing 9.5

In [10]:
# Generate top 10 product recommendations for each user
indexed_user_recs = model.recommendForAllUsers(10) \
                         .orderBy(col("userIndex").asc())
indexed_user_recs.show(5, truncate=64)

+---------+----------------------------------------------------------------+
|userIndex|                                                 recommendations|
+---------+----------------------------------------------------------------+
|        0|[{6, 0.022541389}, {13, 0.015104328}, {36, 0.010634022}, {20,...|
|        1|[{13, 0.009001873}, {3, 0.007981183}, {23, 0.0050935573}, {31...|
|        2|[{9, 0.06319133}, {17, 0.04681776}, {3, 0.041046627}, {14, 0....|
|        3|[{17, 0.0145240165}, {14, 0.01413305}, {12, 0.012459144}, {39...|
|        4|[{14, 0.006752351}, {4, 0.004651022}, {10, 0.004487163}, {17,...|
+---------+----------------------------------------------------------------+
only showing top 5 rows



## Listing 9.6

In [11]:
column_exploder = explode("recommendations").alias("productIndex_rating")
user_item_recs = indexed_user_recs.select("userIndex", column_exploder) \
                      .select("userIndex", col("productIndex_rating.*"))
user_item_recs = indexes_to_strings(user_item_recs, user_indexer,
                                    product_indexer)
user_item_recs = user_item_recs.select("user", "product",
                                       col("rating").alias("boost"))

# Listing 9.7

In [12]:
print("Start Time: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

recs_collection = engine.create_collection("user_item_recommendations")
recs_collection.write(user_item_recs)

print("End Time: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

Start Time: 2024-11-18 19:33:11
Wiping "user_item_recommendations" collection
Creating "user_item_recommendations" collection
Successfully written 5212070 documents
End Time: 2024-11-18 19:35:46


# Search with Recommendations Boosts
Whereas signals boosting boosts the most popular documents for a particular query (ch8), you can also boost the most personalized items for a particular user. In order to serve up the pre-generated collaborative recommendations we just generated, we can just need to run a search and boost the recommended items for each user.

## Listing 9.8

In [43]:
from pyspark.sql.functions import date_format, lit
from pyspark.sql.types import StructType, StructField, StringType, DateType

def scrub(name): 
    return name.replace("&#xAE;","®").replace("&#x2122;", "™")

def print_interaction_history(user_id, signals):
    products_collection = engine.get_collection("products")    
    interacted_products = [s["target"] for s in signals]
    request = {"filters": [("upc", interacted_products)]}
    products = products_collection.search(**request)["docs"]    
    product_info = {p["upc"]: p["name"] for p in products}
    
    is_epoch_format = isinstance(signals[0]["signal_time"], int)
    for s in signals:        
        if is_epoch_format:
            s["signal_time"] = datetime.now()
        s |= {"name": scrub(product_info.get(s["target"], s["target"]))}

    schema = StructType([StructField("signal_time", DateType(), True),
                         StructField("type", StringType(), True),
                         StructField("target", StringType(), True),
                         StructField("name", StringType(), True)])
    dataframe = spark.createDataFrame(signals, schema)
    dataframe = dataframe.select(date_format('signal_time', 'MM/dd HH:MM').alias("signal_time"),
                                 "type", "target", "name")
    
    print(f"Previous Product Interactions for User: {user_id}")
    dataframe.show(10, truncate=37)

In [44]:
def signals_request(user_id):
    return {"query": "*",
            "return_fields": ["signal_time", "type", "target"],
            "order_by": [("signal_time", "asc")],
            "filters": [("user", user_id)]}

user_id = "u478462" #example user
signals_collection = engine.get_collection("signals")

request = signals_request(user_id)
previous_signals = signals_collection.search(**request)["docs"]
print_interaction_history(user_id, previous_signals)

Previous Product Interactions for User: u478462
+-----------+-----------+------------+-------------------------------------+
|signal_time|       type|      target|                                 name|
+-----------+-----------+------------+-------------------------------------+
|11/18 00:11|      query|       apple|                                apple|
|11/18 00:11|      click|885909457588|Apple® - iPad® 2 with Wi-Fi - 16GB...|
|11/18 00:11|add-to-cart|885909457588|Apple® - iPad® 2 with Wi-Fi - 16GB...|
|11/18 00:11|   purchase|885909457588|Apple® - iPad® 2 with Wi-Fi - 16GB...|
|11/18 00:11|      query|     macbook|                              macbook|
|11/18 00:11|      click|885909464043|Apple® - MacBook® Air - Intel® Cor...|
+-----------+-----------+------------+-------------------------------------+



## Listing 9.9

In [16]:
# %load -s product_search_request engine/search_requests
def product_search_request(query, param_overrides={}):
    request = {"query": query,
               "query_fields": ["name", "manufacturer", "long_description"],
               "return_fields": ["upc", "name", "manufacturer",
                                 "short_description", "score"],
               "limit": 5,
               "order_by": [("score", "desc"), ("upc", "asc")]}
    return request | param_overrides

In [17]:
def get_query_time_boosts(user, boosts_collection):
    request = {"query": "*",
               "return_fields": ["product", "boost"],
               "filters": [("user", user)] if user else [],
               "limit": 10,
               "order_by": [("boost", "desc")]}
    
    response = boosts_collection.search(**request)
    signals_boosts = response["docs"]
    return " ".join(f'"{b["product"]}"^{b["boost"] * 100}'
                    for b in signals_boosts)

def search_for_products(query, signals_boosts):
    request = product_search_request(query if query else "*")
    if signals_boosts:
        request["query_boosts"] = ("upc", signals_boosts)
    return products_collection.search(**request)

In [18]:
user = "u478462"
boosts = get_query_time_boosts(user, recs_collection)
response = search_for_products("", boosts)

print(f"Boost Query:\n{boosts}")
display_product_search("", response["docs"])

Boost Query:
"885909457588"^83.317953 "022265004289"^19.800967 "024543742180"^8.756707 "635753493559"^6.914275 "045496880484"^6.1463382 "635753493573"^5.834811 "885909457595"^5.7118796 "885370315080"^5.6894064 "612572171585"^5.5108927 "885909395095"^5.2595586


## Listing 9.10

In [19]:
query = "tablet"
response = search_for_products(query, None)
print(f"Non-personalized Query")
display_product_search(query, response["docs"])

Non-personalized Query


In [20]:
response = search_for_products(query, boosts)
print(f"Personalized Query")
display_product_search(query, response["docs"])

Personalized Query


Up next: [Vector-based Peronsalization](2.embedding-based-personalization.ipynb)