
# Related keywords detection

In [19]:
from aips import get_engine
from aips.spark import create_view_from_collection, get_spark_session
import aips.indexer

# Notebook-wide handles: later cells use `spark` for all SQL and `engine` to look up collections.
spark = get_spark_session()
engine = get_engine()
# (Re)build the "signals" collection. Per the recorded output of this cell, this wipes
# and recreates the collection and loads data/retrotech/signals.csv into it.
signals_collection = aips.indexer.build_collection(engine, "signals")

Wiping "signals" collection
Creating "signals" collection
Status: Success
Loading data/retrotech/signals.csv
Schema: 
root
 |-- query_id: string (nullable = true)
 |-- user: string (nullable = true)
 |-- type: string (nullable = true)
 |-- target: string (nullable = true)
 |-- signal_time: timestamp (nullable = true)

Successfully written 2172605 documents


### Step 1: Prepare the data using PySpark and DataFrames


## Listing 6.5

In [20]:
def print_keyword_user_pairs():
    """Print the size of the ``user_searches`` view and its first three rows.

    Reads the view through the notebook-level ``spark`` session, ordered by
    user and then keyword, and writes one sentence per sampled row to stdout.
    """
    sorted_pairs_sql = """SELECT * FROM user_searches
                                   ORDER BY user ASC, keyword ASC"""
    sorted_pairs = spark.sql(sorted_pairs_sql)
    print("Number of keyword user pairs:", sorted_pairs.count())
    print("\nKeyword user pairs derived from signals:")
    for row in sorted_pairs.head(3):
        searcher, searched_keyword = row["user"], row["keyword"]
        print(f'User "{searcher}" searched for "{searched_keyword}"')

In [21]:
# Look the "signals" collection up on the engine and register it under the name
# "signals", which the SQL below reads from.
signals_collection = engine.get_collection("signals")
create_view_from_collection(signals_collection, "signals")
# Keep only the query signals and lower-case the query text, producing one
# (keyword, user) row per query signal.
query = """SELECT LOWER(searches.target) AS keyword, searches.user
           FROM signals AS searches
           WHERE searches.type='query'"""
# Publish the result as the "user_searches" temp view used by the following listings.
spark.sql(query).createOrReplaceTempView("user_searches")
print_keyword_user_pairs()

Number of keyword user pairs: 725459

Keyword user pairs derived from signals:
User "u10" searched for "joy stick"
User "u10" searched for "xbox"
User "u10" searched for "xbox360"


### Step 2: Create Cooccurrence & PMI2 Model based on user searches

## Listing 6.6

In [22]:
def print_keyword_cooccurrences():
    """Display the keyword occurrence and keyword co-occurrence views.

    Shows the first ten rows of ``keywords_users_oc`` (most users first), then
    prints the row count of ``keywords_users_cooc`` and shows its first ten
    rows (most shared users first, ties ordered by ``keyword1``).
    """
    occurrence_sql = """SELECT * FROM keywords_users_oc
                                    ORDER BY users_occ DESC"""
    spark.sql(occurrence_sql).show(10)

    cooccurrence_sql = """SELECT * FROM keywords_users_cooc
                                         ORDER BY users_cooc DESC, keyword1 ASC"""
    ranked_pairs = spark.sql(cooccurrence_sql)
    pair_total = ranked_pairs.count()
    print("Number of co-occurring keyword searches:", pair_total, "\n")
    ranked_pairs.show(10)

In [23]:
# Self-join user_searches on user to pair up keywords searched by the same user.
# "k1.keyword > k2.keyword" drops self-pairs and keeps a single ordering of each pair;
# users_cooc is the number of distinct users who searched both keywords.
query = """SELECT k1.keyword AS keyword1, k2.keyword AS keyword2,
           COUNT(DISTINCT k1.user) users_cooc
           FROM user_searches k1
           JOIN user_searches k2 ON k1.user = k2.user
           WHERE k1.keyword > k2.keyword
           GROUP BY k1.keyword, k2.keyword"""
spark.sql(query).createOrReplaceTempView("keywords_users_cooc")

# users_occ is the number of distinct users who searched each individual keyword.
query = """SELECT keyword, COUNT(DISTINCT user) users_occ FROM
           user_searches GROUP BY keyword"""
spark.sql(query).createOrReplaceTempView("keywords_users_oc")
print_keyword_cooccurrences()

+-----------+---------+
|    keyword|users_occ|
+-----------+---------+
|     lcd tv|     8449|
|       ipad|     7749|
|hp touchpad|     7144|
|  iphone 4s|     4642|
|   touchpad|     4019|
|     laptop|     3625|
|    laptops|     3435|
|      beats|     3282|
|       ipod|     3164|
| ipod touch|     2992|
+-----------+---------+
only showing top 10 rows

Number of co-occurring keyword searches: 244876 

+-------------+---------------+----------+
|     keyword1|       keyword2|users_cooc|
+-------------+---------------+----------+
|green lantern|captain america|        23|
|    iphone 4s|         iphone|        21|
|       laptop|      hp laptop|        20|
|         thor|captain america|        18|
|         bose|          beats|        17|
|    iphone 4s|       iphone 4|        17|
|   skullcandy|          beats|        17|
|      laptops|         laptop|        16|
|      macbook|            mac|        16|
|         thor|  green lantern|        16|
+-------------+--------------

## Listing 6.7

In [24]:
# Attach the per-keyword user counts to every co-occurring pair and derive
# pmi2 = LOG(users_cooc^2 / (n_users1 * n_users2)) directly from those raw counts.
# The result is published as the "user_related_keywords_pmi" temp view.
query = """
SELECT k1.keyword AS k1, k2.keyword AS k2, k1_k2.users_cooc,
k1.users_occ AS n_users1, k2.users_occ AS n_users2,
LOG(POW(k1_k2.users_cooc, 2) /
    (k1.users_occ * k2.users_occ)) AS pmi2
FROM keywords_users_cooc AS k1_k2 
JOIN keywords_users_oc AS k1 ON k1_k2.keyword1 = k1.keyword
JOIN keywords_users_oc AS k2 ON k1_k2.keyword2 = k2.keyword"""
spark.sql(query).createOrReplaceTempView("user_related_keywords_pmi")

In [25]:
# Show the ten highest-pmi2 pairs among those with users_cooc > 5
# (ties on pmi2 are ordered by k1).
spark.sql("""SELECT k1, k2, users_cooc, n_users1,
                    n_users2, ROUND(pmi2, 3) AS pmi2
             FROM user_related_keywords_pmi
             WHERE users_cooc > 5 ORDER BY pmi2 DESC, k1 ASC""").show(10)

+-----------------+--------------------+----------+--------+--------+------+
|               k1|                  k2|users_cooc|n_users1|n_users2|  pmi2|
+-----------------+--------------------+----------+--------+--------+------+
|  iphone 4s cases|      iphone 4 cases|        10|     158|     740|-7.064|
|     sony laptops|          hp laptops|         8|     209|     432|-7.252|
|otterbox iphone 4|            otterbox|         7|     122|     787| -7.58|
|    green lantern|     captain america|        23|     963|    1091|-7.594|
|          kenwood|              alpine|        13|     584|     717|-7.815|
|      sony laptop|         dell laptop|        10|     620|     451|-7.936|
|   wireless mouse|           godfather|         6|     407|     248|-7.939|
|       hp laptops|        dell laptops|         6|     432|     269| -8.08|
|      mp3 players|        dvd recorder|         6|     334|     365|-8.128|
|          quicken|portable dvd players|         6|     281|     434|-8.128|

## Listing 6.8

In [26]:
# Rank every pair twice over the whole table (the constant "PARTITION BY 1" key
# places all rows in one window partition): r1 by users_cooc and r2 by pmi2,
# both descending. The two ranks are then combined into comp_score.
# NOTE(review): "/" binds tighter than "+", so the expression evaluates as
# (r1 + (r2 / (r1 * r2))) / 2, i.e. (r1 + 1/r1) / 2, which does not depend on r2.
# If ((r1 + r2) / (r1 * r2)) / 2 was intended, the ascending sort used by the
# cell that displays comp_score would need revisiting too -- confirm before changing.
query = """
SELECT *, (r1 + r2 / (r1 * r2)) / 2 AS comp_score
FROM (
  SELECT *, 
  RANK() OVER (PARTITION BY 1
               ORDER BY users_cooc DESC) r1,
  RANK() OVER (PARTITION BY 1
               ORDER BY pmi2 DESC) r2  
  FROM user_related_keywords_pmi)"""
spark.sql(query).createOrReplaceTempView("users_related_keywords_comp_score")

In [27]:
# Show the 20 pairs with the lowest comp_score; ties are ordered by ascending pmi2.
spark.sql("""SELECT k1, k2, users_cooc, ROUND(pmi2, 3) as pmi2,
             r1, r2, ROUND(comp_score, 3) as comp_score 
             FROM users_related_keywords_comp_score
             ORDER BY comp_score ASC, pmi2 ASC""").show(20)

+-------------+---------------+----------+-------+---+------+----------+
|           k1|             k2|users_cooc|   pmi2| r1|    r2|comp_score|
+-------------+---------------+----------+-------+---+------+----------+
|green lantern|captain america|        23| -7.594|  1|  8626|       1.0|
|    iphone 4s|         iphone|        21|-10.217|  2| 56156|      1.25|
|       laptop|      hp laptop|        20| -9.133|  3| 20383|     1.667|
|         thor|captain america|        18| -8.483|  4| 13190|     2.125|
|    iphone 4s|       iphone 4|        17|-10.076|  5| 51964|       2.6|
|         bose|          beats|        17|-10.074|  5| 51916|       2.6|
|   skullcandy|          beats|        17| -9.001|  5| 18792|       2.6|
|      laptops|         laptop|        16|-10.792|  8| 80240|     4.063|
|      macbook|            mac|        16| -9.891|  8| 45464|     4.063|
|         thor|  green lantern|        16| -8.594|  8| 14074|     4.063|
|   headphones|   beats by dre|        15| -9.989| 

### Create Cooccurrence & PMI2 Model based on product interactions

## Listing 6.9

In [28]:
def print_signals_format():
    """Show three sample rows of the raw query signals and of the simplified view.

    Each sample is preceded by a heading line; rows come from the ``signals``
    view (query signals only, ordered by id) and from the
    ``keyword_click_product`` view (ordered by user, then product).
    """
    samples = (
        ("Original signals format: ",
         """SELECT id, query_id, signal_time, target, type, user
                 FROM signals WHERE type = 'query'
                 ORDER BY id ASC"""),
        ("Simplified signals format: ",
         """SELECT * FROM keyword_click_product
                 ORDER BY user ASC, product ASC"""),
    )
    for heading, statement in samples:
        print(heading)
        spark.sql(statement).show(3)

In [29]:
# Pair each click signal with the query signal that shares its query_id, giving
# (keyword, user, clicked product) rows; the keyword is the lower-cased query text
# and the user is taken from the query signal.
# NOTE(review): the WHERE clause requires searches.type = 'query', which discards
# the unmatched (NULL) rows a RIGHT JOIN would add, so this behaves like an inner join.
query = """SELECT LOWER(searches.target) AS keyword, searches.user AS user,
           clicks.target AS product FROM signals AS searches
           RIGHT JOIN signals AS clicks
           ON searches.query_id = clicks.query_id 
           WHERE searches.type = 'query'
           AND clicks.type = 'click'"""
spark.sql(query).createOrReplaceTempView("keyword_click_product")
print_signals_format()

Original signals format: 
+--------------------+-----------+--------------------+---------+-----+-------+
|                  id|   query_id|         signal_time|   target| type|   user|
+--------------------+-----------+--------------------+---------+-----+-------+
|0000640e-ac35-45b...|u115435_0_1|2019-09-24 15:39:...| Printers|query|u115435|
|00009734-0136-405...|u346324_0_1|2020-02-12 15:16:...|iPhone 4s|query|u346324|
|0000a5cb-6b17-443...| u92998_0_1|2020-01-26 19:19:...|   led tv|query| u92998|
+--------------------+-----------+--------------------+---------+-----+-------+
only showing top 3 rows

Simplified signals format: 
+-------------+----+------------+
|      keyword|user|     product|
+-------------+----+------------+
|    joy stick| u10|097855018120|
|         xbox| u10|885370235876|
|virgin mobile|u100|799366521679|
+-------------+----+------------+
only showing top 3 rows



## Listing 6.10

In [30]:
def print_keyword_pair_data():
    """Summarize the ``keyword_click_product_cooc`` view.

    Prints the total number of rows, then shows the first 20 rows ordered by
    ``n_products`` and then ``users_cooc``, both descending.
    """
    ranking_sql = """SELECT * FROM keyword_click_product_cooc
                             ORDER BY n_products DESC, users_cooc DESC"""
    ranked_pairs = spark.sql(ranking_sql)
    pair_total = ranked_pairs.count()
    print("Number of co-occurring queries:", pair_total, "\n")
    ranked_pairs.show(20)

In [31]:
# For every (keyword, product) count the click rows (p1 / p2), then join that
# result with itself on product to pair keywords that led to clicks on the same
# product. Per keyword pair: n_users1 / n_users2 sum each keyword's click counts
# over the shared products, users_cooc is their combined total, and n_products
# is the number of shared products.
# NOTE(review): despite the n_users* / users_cooc names, these are COUNT(1)
# click-row totals, not distinct-user counts.
query = """
SELECT k1.keyword AS k1, k2.keyword AS k2, SUM(p1) n_users1, sum(p2) n_users2,
SUM(p1 + p2) AS users_cooc, COUNT(1) n_products FROM (
  SELECT keyword, product, COUNT(1) AS p1 FROM keyword_click_product
  GROUP BY keyword, product) AS k1 JOIN (
  SELECT keyword, product, COUNT(1) AS p2 FROM keyword_click_product
  GROUP BY keyword, product) AS k2 ON k1.product = k2.product
WHERE k1.keyword > k2.keyword GROUP BY k1.keyword, k2.keyword"""
spark.sql(query).createOrReplaceTempView("keyword_click_product_cooc")
print_keyword_pair_data()

Number of co-occurring queries: 1579710 

+--------------+-------------+--------+--------+----------+----------+
|            k1|           k2|n_users1|n_users2|users_cooc|n_products|
+--------------+-------------+--------+--------+----------+----------+
|       laptops|       laptop|    3251|    3345|      6596|       187|
|       tablets|       tablet|    1510|    1629|      3139|       155|
|        tablet|         ipad|    1468|    7067|      8535|       146|
|       tablets|         ipad|    1359|    7048|      8407|       132|
|       cameras|       camera|     637|     688|      1325|       116|
|          ipad|        apple|    6706|    1129|      7835|       111|
|      iphone 4|       iphone|    1313|    1754|      3067|       108|
|    headphones|  head phones|    1829|     492|      2321|       106|
|        ipad 2|         ipad|    2736|    6738|      9474|        98|
|     computers|     computer|     536|     392|       928|        98|
|iphone 4 cases|iphone 4 case|     

## Listing 6.11

In [32]:
def print_keyword_popularity():
    """Summarize the ``keyword_click_product_oc`` view.

    Prints how many keywords the view contains, then shows the first 20 rows
    ordered by ``n_users`` descending.
    """
    popularity_sql = """SELECT * FROM keyword_click_product_oc
                             ORDER BY n_users DESC"""
    ranked_keywords = spark.sql(popularity_sql)
    keyword_total = ranked_keywords.count()
    print("Keyword searches that resulted in clicks:", keyword_total, "\n")
    ranked_keywords.show(20)

In [33]:
# Count the keyword_click_product rows per keyword and publish the result as
# the "keyword_click_product_oc" temp view.
# NOTE(review): n_users is COUNT(1), i.e. one per click row, not a distinct-user count.
query = """SELECT keyword, COUNT(1) AS n_users FROM keyword_click_product
           GROUP BY keyword"""
spark.sql(query).createOrReplaceTempView("keyword_click_product_oc")
print_keyword_popularity()

Keyword searches that resulted in clicks: 13744 

+------------+-------+
|     keyword|n_users|
+------------+-------+
|        ipad|   7554|
| hp touchpad|   4829|
|      lcd tv|   4606|
|   iphone 4s|   4585|
|      laptop|   3554|
|       beats|   3498|
|     laptops|   3369|
|        ipod|   2949|
|  ipod touch|   2931|
|      ipad 2|   2842|
|      kindle|   2833|
|    touchpad|   2785|
|   star wars|   2564|
|      iphone|   2430|
|beats by dre|   2328|
|     macbook|   2313|
|  headphones|   2270|
|        bose|   2071|
|         ps3|   2041|
|         mac|   1851|
+------------+-------+
only showing top 20 rows



In [34]:
# Calculate PMI2 for the click-based keyword pairs, using the same formula as
# Listing 6.7: LOG(users_cooc^2 / (n_users1 * n_users2)), with the per-keyword
# counts taken from keyword_click_product_oc.
spark.sql("""
SELECT k1.keyword AS k1, k2.keyword AS k2, k1_k2.users_cooc,
k1.n_users AS n_users1, k2.n_users AS n_users2,
LOG(POW(k1_k2.users_cooc, 2) /
   (k1.n_users * k2.n_users)) AS pmi2
FROM keyword_click_product_cooc AS k1_k2 
JOIN keyword_click_product_oc AS k1 ON k1_k2.k1 = k1.keyword
JOIN keyword_click_product_oc AS k2 ON k1_k2.k2 = k2.keyword
""").createOrReplaceTempView("product_related_keywords_pmi")

In [35]:
# Calculate comp_score for the click-based pairs, using the same expression as
# Listing 6.8: r1 ranks pairs by users_cooc and r2 by pmi2 (both descending,
# over the whole table).
# NOTE(review): "/" binds tighter than "+", so the expression evaluates as
# (r1 + (r2 / (r1 * r2))) / 2, i.e. (r1 + 1/r1) / 2, which does not depend on r2
# -- confirm whether ((r1 + r2) / (r1 * r2)) / 2 was intended.
spark.sql("""
SELECT *, (r1 + r2 / (r1 * r2)) / 2 as comp_score from (
  SELECT *, 
    RANK() OVER (PARTITION BY 1 ORDER BY users_cooc DESC) r1, 
    RANK() OVER (PARTITION BY 1 ORDER BY pmi2 DESC) r2  
FROM product_related_keywords_pmi)
""").createOrReplaceTempView("product_related_keywords_comp_score")

## Listing 6.12

In [36]:
# NOTE: this cell has been reported as unstable and may need to be re-executed
# several times (the cause is not documented in this notebook).
# Show the 20 click-based keyword pairs with the lowest comp_score.
# comp_score values can tie (RANK assigns equal ranks to equal users_cooc), and
# sorting on comp_score alone leaves tied rows in arbitrary order. pmi2 (as in
# the Listing 6.8 display query) and then the keyword pair are used as
# tie-breakers so the displayed rows are the same on every run.
query = """SELECT k1, k2, n_users1, n_users2, ROUND(pmi2, 3) AS pmi2,
           ROUND(comp_score, 3) AS comp_score
           FROM product_related_keywords_comp_score
           ORDER BY comp_score ASC, pmi2 ASC, k1 ASC, k2 ASC"""
dataframe = spark.sql(query)
print("Number of co-occurring queries:", dataframe.count(), "\n")
dataframe.show(20)

Number of co-occurring queries: 1579710 

+----------+-----------+--------+--------+-----+----------+
|        k1|         k2|n_users1|n_users2| pmi2|comp_score|
+----------+-----------+--------+--------+-----+----------+
|      ipad|hp touchpad|    7554|    4829|1.232|       1.0|
|    ipad 2|       ipad|    2842|    7554|1.431|      1.25|
|    tablet|       ipad|    1818|    7554|1.669|     1.667|
|  touchpad|       ipad|    2785|    7554|1.223|     2.125|
|   tablets|       ipad|    1627|    7554|1.749|       2.6|
|     ipad2|       ipad|    1254|    7554|1.903|     3.083|
|      ipad|      apple|    7554|    1814|  1.5|     3.571|
|  touchpad|hp touchpad|    2785|    4829|1.394|     4.063|
|      ipad|  hp tablet|    7554|    1421|1.594|     4.556|
|ipod touch|       ipad|    2931|    7554|0.863|      5.05|
|      ipad|      i pad|    7554|     612|2.415|     5.545|
|    kindle|       ipad|    2833|    7554|0.828|     6.042|
|    laptop|       ipad|    3554|    7554|0.593|     6.538

Up next: [Misspelling detection and correction](../ch06/3.spell-correction.ipynb)