# [ Chapter 4 - Crowdsourced Relevance ] 
# Setting up the Retrotech Dataset

In [None]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.display import display,HTML
from pyspark.sql import SparkSession
# Obtain the Spark session used by the indexing cells in this notebook
# (getOrCreate reuses an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("aips-ch4-signals-boosting")
    .getOrCreate()
)

## Download the Retrotech (Ecommerce) Products + Signals Dataset

In [None]:
# Get datasets
# Each `!` line runs in its own subshell, so every command that works inside
# the clone has to `cd retrotech` again.
# Shallow-clone the dataset repo on the first run only, then bring it up to date.
![ ! -d 'retrotech' ] && git clone --depth 1 https://github.com/ai-powered-search/retrotech.git
! cd retrotech && git pull
# Unpack both archives into ../data/retrotech/ (relative to this notebook), which is
# where later cells look for products.csv and signals.csv. `tar -C` fails if the
# target directory does not exist, so create it first.
! cd retrotech && mkdir -p '../../data/retrotech/' && tar -xvf products.tgz -C '../../data/retrotech/' && tar -xvf signals.tgz -C '../../data/retrotech/'


## Get a Feel for the Product Catalog

### Listing 4.1

In [None]:
# Show the first lines of the product catalog CSV
! head ../data/retrotech/products.csv

## Index the Products into the Search Engine

### Listing 4.2

In [None]:
# Create the collection that will hold the product catalog
products_collection = "products"
create_collection(products_collection)

# Modify the schema so these fields are explicitly searchable by keyword
for field_name in ("upc", "name", "longDescription", "manufacturer"):
    upsert_text_field(products_collection, field_name)

print("Loading Products...")
csvFile = "../data/retrotech/products.csv"
# Read the CSV using its header row for column names and letting Spark infer column types
csvDF = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(csvFile)
)
print("Products Schema: ")
csvDF.printSchema()

# Options handed to the "solr" writer: which ZooKeeper host and collection to write to,
# plus the unique-key and commit settings
product_update_opts = {
    "zkhost": "aips-zk",
    "collection": products_collection,
    "gen_uniq_key": "true",
    "commit_within": "5000",
}
products_writer = csvDF.write.format("solr").options(**product_update_opts)
products_writer.mode("overwrite").save()
print("Status: Success")

## Verify Searches Work

### Listing 4.3

In [None]:
# Run a sample keyword search against the products collection and render the top hits.
query = "ipod"

collection = "products"
# Request body: edismax query over the name, manufacturer and longDescription fields,
# returning the top 5 documents ordered by score, with ties broken by upc.
request = {
    "query": query,
    "fields": ["upc", "name", "manufacturer", "score"],
    "limit": 5,
    "params": {
      "qf": "name manufacturer longDescription",
      "defType": "edismax",
      "sort": "score desc, upc asc"
    }
}

response = requests.post(f"{SOLR_URL}/{collection}/select", json=request)
# Surface HTTP-level failures (e.g. a missing collection or a rejected query) as an
# explicit HTTP error rather than an opaque KeyError on "response" below.
# NOTE(review): assumes `requests` (provided via `from aips import *`) is the standard
# Requests library - confirm.
response.raise_for_status()
search_results = response.json()["response"]["docs"]
display(HTML(render_search_results(query, search_results)))

## Get a Feel for the Signals Data

In [None]:
# Show the first lines of the signals CSV
! head ../data/retrotech/signals.csv

## Index the Signals into the Search Engine

### Listing 4.4

In [None]:
# Create the collection that will hold the user signals
signals_collection = "signals"
create_collection(signals_collection)

print("Loading Signals...")
csvFile = "../data/retrotech/signals.csv"
# Read the CSV using its header row for column names and letting Spark infer column types
csvDF = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load(csvFile)
)
print("Signals Schema: ")
csvDF.printSchema()

# Options handed to the "solr" writer: which ZooKeeper host and collection to write to,
# plus the unique-key and commit settings
signals_update_opts = {
    "zkhost": "aips-zk",
    "collection": signals_collection,
    "gen_uniq_key": "true",
    "commit_within": "5000",
}
signals_writer = csvDF.write.format("solr").options(**signals_update_opts)
signals_writer.mode("overwrite").save()
print("Status: Success")

## Success!

You have now indexed the Retrotech product catalog and signals into the search engine, and run a sample query against the product collection. The results don't look very relevant using the out-of-the-box keyword scoring function, of course, but we'll be working to improve that throughout the rest of this book!

In the next section, we'll take a look at our first crowdsourced AI-powered search technique: [Signals Boosting](2.signals-boosting.ipynb).