# [ Chapter 4 - Crowdsourced Relevance ] 
# Setting up the Retrotech Dataset

In [1]:
import sys
# Add the parent directory to the module search path before the `aips` imports below
# (presumably that is where the `aips` package lives — confirm against the repo layout).
sys.path.append('..')
from aips import display_product_search, get_engine
from pyspark.sql import SparkSession
from aips.spark.dataframe import from_csv
# Engine handle used by later cells to create and fetch collections.
engine = get_engine()
# Reuse the active Spark session if one exists, otherwise start one with the app name "AIPS".
# NOTE(review): `spark` is not referenced by any later cell in this notebook; presumably the
# aips loaders rely on the active session — confirm.
spark = SparkSession.builder.appName("AIPS").getOrCreate()

## Download the Retrotech (Ecommerce) Products + Signals Dataset

In [2]:
# Get datasets.
# Shallow-clone (latest commit only) the retrotech repository, but only when no local
# 'retrotech' directory exists yet.
![ ! -d 'retrotech' ] && git clone --depth 1 https://github.com/ai-powered-search/retrotech.git
# Bring an already-present checkout up to date.
! cd retrotech && git pull
# Create the data directory if needed and unpack both archives into it. Seen from inside
# 'retrotech', '../../data/retrotech/' is the same '../data/retrotech/' directory that the
# later cells read from. The recorded output shows the archives contain products.csv and signals.csv.
! cd retrotech && mkdir -p '../../data/retrotech/' && tar -xvf products.tgz -C '../../data/retrotech/' && tar -xvf signals.tgz -C '../../data/retrotech/'

Cloning into 'retrotech'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 19 (delta 0), reused 19 (delta 0), pack-reused 0[K
Receiving objects: 100% (19/19), 48.29 MiB | 19.91 MiB/s, done.
Already up to date.
products.csv
signals.csv


## Get a Feel for the Product Catalog

### Listing 4.1

In [3]:
# Preview the first lines of the extracted product catalog (header row plus the first records).
! cd ../data/retrotech/ && head products.csv

"upc","name","manufacturer","shortDescription","longDescription"
"096009010836","Fists of Bruce Lee - Dolby - DVD",\N,\N,\N
"043396061965","The Professional - Widescreen Uncut - DVD",\N,\N,\N
"085391862024","Pokemon the Movie: 2000 - DVD",\N,\N,\N
"067003016025","Summerbreeze - CD","Nettwerk",\N,\N
"731454813822","Back for the First Time [PA] - CD","Def Jam South",\N,\N
"024543008200","Big Momma's House - Widescreen - DVD",\N,\N,\N
"031398751823","Kids - DVD",\N,\N,\N
"037628413929","20 Grandes Exitos - CD","Sony Discos Inc.",\N,\N
"060768972223","Power Of Trinity (Box) - CD","Sanctuary Records",\N,\N


## Index the Products into the Search Engine

### Listing 4.2

In [4]:
from aips.data_loaders.products import load_dataframe

# Create the "products" collection; the recorded output shows an existing one is wiped first.
products_collection = engine.create_collection("products")
# Load the product CSV into a dataframe; the loader prints the resulting schema (see the output below).
products_dataframe = load_dataframe("../data/retrotech/products.csv")
# Write the dataframe's rows to the collection as documents.
products_collection.write(products_dataframe)

Wiping "products" collection
Creating "products" collection
Status: Success
Loading Products
Schema: 
root
 |-- upc: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- long_description: string (nullable = true)
 |-- short_description: string (nullable = true)

Successfully written 48194 documents


## Verify Searches Work

### Listing 4.3

In [5]:
# %load -s product_search_request aips/search_requests
def product_search_request(query, param_overrides=None):
    """Build the default product search request for a keyword query.

    Args:
        query: Query text stored under the "query" key.
        param_overrides: Optional dict of request parameters. Its entries
            replace defaults that share a key and add any new keys. ``None``
            (the default) or an empty dict leaves the defaults untouched.

    Returns:
        A new dict holding the request parameters. The dict passed as
        ``param_overrides`` is never modified.
    """
    # Use None rather than a mutable `{}` default: a default dict is created once and
    # shared by every call, so any future in-place change would leak between calls.
    if param_overrides is None:
        param_overrides = {}
    request = {"query": query,
               "query_fields": ["name", "manufacturer", "long_description"],
               "return_fields": ["upc", "name", "manufacturer", "score"],
               "limit": 5,
               "order_by": [("score", "desc"), ("upc", "asc")]}
    # Dict union returns a new dict; on key clashes the override wins.
    return request | param_overrides

In [6]:
# Sample keyword search to confirm the products collection can be queried.
query = "ipod"
# Fetch the collection by name instead of relying on the handle created in the indexing cell.
products_collection = engine.get_collection("products")
# Build the request with the default fields, limit, and ordering.
request = product_search_request(query)
# The request dict's entries are passed to search() as keyword arguments.
response = products_collection.search(**request)
# Render the documents found under the response's "docs" key.
display_product_search(query, response["docs"])

## Get a Feel for the Signals Data

In [7]:
# Preview the first lines of the extracted signals file (header row plus the first records).
! cd ../data/retrotech && head signals.csv

"query_id","user","type","target","signal_time"
"u2_0_1","u2","query","nook","2019-07-31 08:49:07.3116"
"u2_1_2","u2","query","rca","2020-05-04 08:28:21.1848"
"u3_0_1","u3","query","macbook","2019-12-22 00:07:07.0152"
"u4_0_1","u4","query","Tv antenna","2019-08-22 23:45:54.1030"
"u5_0_1","u5","query","AC power cord","2019-10-20 08:27:00.1600"
"u6_0_1","u6","query","Watch The Throne","2019-09-18 11:59:53.7470"
"u7_0_1","u7","query","Camcorder","2020-02-25 13:02:29.3089"
"u9_0_1","u9","query","wireless headphones","2020-04-26 04:26:09.7198"
"u10_0_1","u10","query","Xbox","2019-09-13 16:26:12.0132"


## Index the Signals into the Search Engine

### Listing 4.4

In [8]:
# Create the "signals" collection; the recorded output shows an existing one is wiped first.
signals_collection = engine.create_collection("signals")
# Read the signals CSV with from_csv and write the result to the collection.
signals_collection.write(from_csv("../data/retrotech/signals.csv"))

Wiping "signals" collection
Creating "signals" collection
Status: Success
Loading ../data/retrotech/signals.csv
Schema: 
root
 |-- query_id: string (nullable = true)
 |-- user: string (nullable = true)
 |-- type: string (nullable = true)
 |-- target: string (nullable = true)
 |-- signal_time: timestamp (nullable = true)

Successfully written 2172605 documents


## Success!

You have now indexed the RetroTech product catalog and signals into the search engine, and run a sample query against the product collection. The results don't look very relevant using the out-of-the-box keyword scoring function, of course, but we'll be working to improve that throughout the rest of this book!

In the next section, we'll take a look at our first crowdsourced AI-powered search technique: [Signals Boosting](2.signals-boosting.ipynb).