In [1]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("aips-ch4-getting-started-retrotech").getOrCreate()

In [2]:
#Get datasets
![ ! -d 'retrotech' ] && git clone https://github.com/ai-powered-search/retrotech.git
! cd retrotech && git pull
! cd retrotech && tar -xvf products.tgz -C '../../data/retrotech/' && tar -xvf signals.tgz -C '../../data/retrotech/'

Cloning into 'retrotech'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 5 (delta 0), reused 3 (delta 0), pack-reused 2[K
Unpacking objects: 100% (5/5), done.
Already up to date.
products.csv
signals.csv


In [3]:
! cd ../data/retrotech/ && head products.csv

"upc","name","manufacturer","shortDescription","longDescription"
"096009010836","Fists of Bruce Lee - Dolby - DVD",\N,\N,\N
"043396061965","The Professional - Widescreen Uncut - DVD",\N,\N,\N
"085391862024","Pokemon the Movie: 2000 - DVD",\N,\N,\N
"067003016025","Summerbreeze - CD","Nettwerk",\N,\N
"731454813822","Back for the First Time [PA] - CD","Def Jam South",\N,\N
"024543008200","Big Momma's House - Widescreen - DVD",\N,\N,\N
"031398751823","Kids - DVD",\N,\N,\N
"037628413929","20 Grandes Exitos - CD","Sony Discos Inc.",\N,\N
"060768972223","Power Of Trinity (Box) - CD","Sanctuary Records",\N,\N


In [4]:
#Create Products Collection
products_collection="products"
create_collection(products_collection)

#Modify Schema to make some fields explicitly searchable by keyword
upsert_text_field(products_collection, "upc")
upsert_text_field(products_collection, "name")
upsert_text_field(products_collection, "longDescription")
upsert_text_field(products_collection, "manufacturer")

print("Loading Products...")
csvFile = "../data/retrotech/products.csv"
product_update_opts={"zkhost": "aips-zk", "collection": products_collection, "gen_uniq_key": "true", "commit_within": "5000"}
csvDF = spark.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load(csvFile)
csvDF.write.format("solr").options(**product_update_opts).mode("overwrite").save()
print("Products Schema: ")
csvDF.printSchema()
print("Status: Success")

Wiping 'products' collection
[('action', 'CREATE'), ('name', 'products'), ('numShards', 1), ('replicationFactor', 1)]
Creating products' collection
Status: Success
Adding 'upc' field to collection
Status: Success
Adding 'name' field to collection
Status: Success
Adding 'longDescription' field to collection
Status: Success
Adding 'manufacturer' field to collection
Status: Success
Loading Products...
Products Schema: 
root
 |-- upc: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- shortDescription: string (nullable = true)
 |-- longDescription: string (nullable = true)

Status: Success


In [5]:
query = "ipod"

collection = "products"
request = {
    "query": query,
    "fields": ["upc", "name", "manufacturer", "score"],
    "limit": 5,
    "params": {
      "qf": "name manufacturer longDescription",
      "defType": "edismax",
      "sort": "score desc, upc asc"
    }
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()["response"]["docs"]
display(HTML(render_search_results(query, search_results)))

In [6]:
! cd ../data/retrotech && head signals.csv

"query_id","user","type","target","signal_time"
"u2_0_1","u2","query","nook","2019-07-31 08:49:07.3116"
"u2_1_2","u2","query","rca","2020-05-04 08:28:21.1848"
"u3_0_1","u3","query","macbook","2019-12-22 00:07:07.0152"
"u4_0_1","u4","query","Tv antenna","2019-08-22 23:45:54.1030"
"u5_0_1","u5","query","AC power cord","2019-10-20 08:27:00.1600"
"u6_0_1","u6","query","Watch The Throne","2019-09-18 11:59:53.7470"
"u7_0_1","u7","query","Camcorder","2020-02-25 13:02:29.3089"
"u9_0_1","u9","query","wireless headphones","2020-04-26 04:26:09.7198"
"u10_0_1","u10","query","Xbox","2019-09-13 16:26:12.0132"


In [11]:
import pandas as pd
signals = pd.read_csv('../data/retrotech/signals.csv')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1660e3a1d0>

# Gather query events occuring above a certain threshold

In [106]:
queries = signals[signals['type'] == 'query']
popular_queries = queries.groupby('target').count().sort_values('query_id', ascending=False)
popular_queries = popular_queries[popular_queries['query_id'] > 100].index.to_list()
pop_query_events = signals[signals['type'] == 'query'][signals['target'].isin(popular_queries)]
query_events = pop_query_events[['query_id', 'target']].rename(columns={'target': 'query'})

# Cleanup by lowercasing
# This step has its pros and cons. We might miss some info that case gives us, but it also
# aggregates more signal per query
query_events['query'] = query_events['query'].apply(lambda q: q.lower())
query_events

  after removing the cwd from sys.path.


Unnamed: 0,query_id,query
0,u2_0_1,nook
1,u2_1_2,rca
2,u3_0_1,macbook
3,u4_0_1,tv antenna
6,u7_0_1,camcorder
...,...,...
725452,u744327_0_1,dre beats
725453,u744328_0_1,sirius radio
725455,u744330_0_1,usb drive
725456,u744331_0_1,iphone 4s


# Merge in click events corresponding to queries into one table

In [134]:
clicks = signals[signals['type'] == 'click']
click_events = clicks[['query_id', 'target']].rename(columns={'target': 'doc_id'})
clicks_per_query = query_events.merge(click_events, 
                                      on='query_id', 
                                      how='left')
clicks_per_query['doc_id'] = clicks_per_query['doc_id'].fillna(0)
# clicks_per_query.groupby('query').count().sort_values('query_id', ascending=False)

clicks_per_query

Unnamed: 0,query_id,query,doc_id
0,u2_0_1,nook,9781400532650
1,u2_1_2,rca,883393001119
2,u3_0_1,macbook,885909464036
3,u4_0_1,tv antenna,079000334835
4,u7_0_1,camcorder,027242821866
...,...,...,...
488539,u744327_0_1,dre beats,848447000135
488540,u744328_0_1,sirius radio,884720004032
488541,u744330_0_1,usb drive,718037770604
488542,u744331_0_1,iphone 4s,885909538027


# Compute a CTR for a doc in a query

In [165]:
denominator = query_events.groupby('query').count().rename(columns={'query_id': 'tot_query_count'})

click_thru_rate = clicks_per_query.groupby(['query', 'doc_id']).count().rename(columns={'query_id':'click_count'}).reset_index()

#clicks_per_query.groupby(['query', 'doc_id']).sum()
click_thru_rate = click_thru_rate.merge(denominator, on='query', how='left')

click_thru_rate['ctr'] = click_thru_rate['click_count'] / click_thru_rate['tot_query_count']
click_thru_rate = click_thru_rate.sort_values(['query', 'ctr'], ascending=[True, False])

click_thru_rate[click_thru_rate['query'] == 'kindle'].head(20)

Unnamed: 0,query,doc_id,click_count,tot_query_count,ctr
33702,kindle,814916011872,631,2742,0.230124
33757,kindle,814916014361,527,2742,0.192195
33703,kindle,814916011896,450,2742,0.164114
33706,kindle,814916014606,140,2742,0.051058
33696,kindle,814916010202,130,2742,0.047411
33698,kindle,814916010233,113,2742,0.041211
33699,kindle,814916010240,105,2742,0.038293
33697,kindle,814916010219,100,2742,0.03647
33704,kindle,814916014385,97,2742,0.035376
33705,kindle,814916014590,63,2742,0.022976


In [72]:
query = "ipod"

collection = "products"
request = {
    "query": query,
    "fields": ["upc", "name", "manufacturer", "score"],
    "limit": 50,
    "params": {
      "qf": "name manufacturer longDescription",
      "defType": "edismax",
      "sort": "score desc, upc asc"
    }
}

search_results = requests.post(solr_url + collection + "/select", json=request).json()["response"]["docs"]
display(HTML(render_search_results(query, search_results)))