# [ Chapter 7 - Interpreting Query Intent through Semantic Search ]
# Setting up the Reviews Dataset

In [1]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
# Create (or reuse) the Spark session named "AIPS"; the indexing functions below use it to read the CSV files.
spark = SparkSession.builder.appName("AIPS").getOrCreate()
# Search-engine handle used below to create collections (presumably supplied by the `aips` star-import — confirm).
engine = get_engine()

## Download the Datasets

In [2]:
#Get datasets
# Shallow-clone the reviews repository, but only when no 'reviews' directory exists yet.
![ ! -d 'reviews' ] && git clone --depth 1 https://github.com/ai-powered-search/reviews.git
# Each `!` line runs in its own subshell, so the `cd reviews` is repeated on every line that needs it.
! cd reviews && git pull
# Create ../../data/reviews/ (relative to the clone) and unpack the reviews, entities and cities archives into it.
! cd reviews && mkdir -p '../../data/reviews/' && tar -xvf reviews.tgz -C '../../data/reviews/' && tar -xvf entities.tgz -C '../../data/reviews/' && tar -xvf cities.tgz -C '../../data/reviews/'


Already up to date.
._reviews.csv
reviews.csv
entities.csv
._cities.csv
cities.csv


## Reviews Dataset

### Listing 7.5

### Indexing

In [3]:
def index_reviews_collection(reviews_collection):
    """Load the reviews CSV with Spark and write it into the given collection.

    Reads ``../data/reviews/reviews.csv`` (header row, inferred column types,
    multi-line quoted values), keeps only the columns listed in
    ``indexed_columns``, writes them through the Spark "solr" data source and
    prints the schema of what was written.

    Args:
        reviews_collection: Collection object; only its ``name`` attribute is
            read, to address the target collection.

    Returns:
        None. Progress and the resulting schema are printed.
    """
    print("\nLoading Reviews...")
    csv_path = "../data/reviews/reviews.csv"

    # Exactly these columns are sent to the search engine. A derived column
    # only reaches the index if it is also listed here, because the select()
    # below discards everything else.
    indexed_columns = [
        "id", "name_t", "city_t", "state_t", "text_t", "stars_i",
        "categories_t", "location_pt_s", "type_ss", "latitude_d", "longitude_d",
    ]
    solr_write_options = {
        "zkhost": "aips-zk",
        "collection": reviews_collection.name,
        "gen_uniq_key": "true",
        "commit_within": "5000",
    }

    # escape="\"" makes the double quote its own escape character inside
    # quoted values; multiLine allows review text to span several lines.
    reviews_df = spark.read.csv(
        csv_path, inferSchema=True, header=True, multiLine=True, escape="\""
    ).select(*indexed_columns)

    reviews_df.write.format("solr").options(**solr_write_options).mode("overwrite").save()
    print("Reviews Schema: ")
    reviews_df.printSchema()
    print("Status: Success")

## Index the Reviews Dataset into the Search Engine

### Listing 7.1

In [4]:
# (Re)create the "reviews" collection, then load the reviews CSV into it.
reviews_collection = engine.create_collection("reviews")
index_reviews_collection(reviews_collection)

Wiping "reviews" collection
Creating "reviews" collection
Status: Success

Loading Reviews...
Reviews Schema: 
root
 |-- id: string (nullable = true)
 |-- name_t: string (nullable = true)
 |-- city_t: string (nullable = true)
 |-- state_t: string (nullable = true)
 |-- text_t: string (nullable = true)
 |-- stars_i: integer (nullable = true)
 |-- categories_t: string (nullable = true)
 |-- location_pt_s: string (nullable = true)
 |-- type_ss: string (nullable = true)
 |-- latitude_d: double (nullable = true)
 |-- longitude_d: double (nullable = true)

Status: Success


## Entities Dataset (Manually-specified Knowledge Graph)

### Listing 7.5

In [5]:
# %load -n SolrEngine.add_tag_field_type
def add_tag_field_type(self, collection):
    """Register the ``tag`` field type on a collection's schema.

    Posts an ``add-field-type`` command to
    ``{SOLR_URL}/<collection.name>/schema`` and prints first the endpoint and
    then the raw response body.

    Args:
        collection: Object whose ``name`` attribute identifies the collection.

    Returns:
        None. The response text is printed rather than returned.
    """
    schema_endpoint = f"{SOLR_URL}/{collection.name}/schema"

    def standard_tokenizer():
        return {"class": "solr.StandardTokenizerFactory"}

    def shared_filters():
        # Filter chain used identically by the index and the query analyzer.
        return [
            {"class": "solr.EnglishPossessiveFilterFactory"},
            {"class": "solr.ASCIIFoldingFilterFactory"},
            {"class": "solr.LowerCaseFilterFactory"},
        ]

    # Only the index analyzer gets this extra, final filter.
    concatenate_graph = {
        "class": "solr.ConcatenateGraphFilterFactory",
        "preservePositionIncrements": "false",
    }

    tag_field_type = {
        "name": "tag",
        "class": "solr.TextField",
        "postingsFormat": "FST50",
        "omitNorms": "true",
        "omitTermFreqAndPositions": "true",
        "indexAnalyzer": {
            "tokenizer": standard_tokenizer(),
            "filters": shared_filters() + [concatenate_graph],
        },
        "queryAnalyzer": {
            "tokenizer": standard_tokenizer(),
            "filters": shared_filters(),
        },
    }

    print(schema_endpoint)
    response = requests.post(schema_endpoint, json={"add-field-type": tag_field_type})
    print(response.text)

In [6]:
# %load -n SolrEngine.add_tag_request_handler
def add_tag_request_handler(self, collection, request_name, field):
    """Register a tagger request handler on a collection's config.

    Posts an ``add-requesthandler`` command to
    ``{SOLR_URL}/<collection.name>/config``.

    Args:
        collection: Object whose ``name`` attribute identifies the collection.
        request_name: Name under which the handler is registered.
        field: Field name placed in the handler's ``defaults``.

    Returns:
        The object returned by ``requests.post`` for the config call.
    """
    config_endpoint = f"{SOLR_URL}/{collection.name}/config"
    handler_spec = {"name": request_name, "class": "solr.TaggerRequestHandler"}
    handler_spec["defaults"] = {"field": field}
    return requests.post(config_endpoint, json={"add-requesthandler": handler_spec})

## Cities Dataset (Geonames)

In [7]:
#Modify Schema to make some fields explicitly searchable by keyword
#upsert_text_field(jobs_collection, "company_country")
#upsert_text_field(jobs_collection, "job_description")
#upsert_text_field(jobs_collection, "company_description")
#upsert_text_field(products_collection, "longDescription")
#upsert_text_field(products_collection, "manufacturer")

def index_cities(collection):
    """Load the Geonames cities file with Spark and write it into ``collection``.

    Reads the tab-separated ``../data/reviews/cities.csv`` with an explicit
    schema (no header option is passed, so every row is treated as data),
    adds a constant ``type`` column with the value ``"city"`` and a
    ``location_p`` column of the form ``"<latitude>,<longitude>"``, then
    writes the rows through the Spark "solr" data source.

    Args:
        collection: Collection object; only its ``name`` attribute is read,
            to address the target collection.

    Returns:
        None. Progress is printed.
    """
    from pyspark.sql.types import StructType, StringType, IntegerType
    from pyspark.sql.functions import concat_ws

    print("Loading Geonames...")
    csv_path = "../data/reviews/cities.csv"
    solr_write_options = {
        "zkhost": "aips-zk",
        "collection": collection.name,
        "gen_uniq_key": "true",
        "commit_within": "5000",
    }

    # Columns are matched to the file by position, so this order has to
    # follow the GeoNames dump layout. The ninth column is the country code.
    column_names = [
        "id",
        "name",
        "ascii_name_s",
        "alternative_names_s",
        "latitude_s",
        "longitude_s",
        "feature_class_s",
        "feature_code_s",
        "country_code_s",
        "cc2_s",
        "admin_code_1_s",
        "admin_code_2_s",
        "admin_code_3_s",
        "admin_code_4_s",
        "population_i",
        "elevation_s",
        "dem_s",
        "timezone_s",
        "modification_date_s",
    ]
    schema = StructType()
    for column_name in column_names:
        # population_i is the only non-string column.
        column_type = IntegerType() if column_name == "population_i" else StringType()
        schema = schema.add(column_name, column_type, True)

    cities_df = spark.read.csv(csv_path, schema=schema, multiLine=True, escape="\\", sep="\t") \
        .withColumn("type", lit("city")) \
        .withColumn("location_p", concat_ws(",", "latitude_s", "longitude_s"))

    cities_df.write.format("solr").options(**solr_write_options).mode("overwrite").save()
    print("Status: Success")

### Listing 7.4

In [8]:
# (Re)create the "entities" collection, load the entities CSV into it,
# then add the Geonames cities to that same collection.
# NOTE(review): the variable name is misspelled ("entitites"); kept as-is so any other reference keeps working.
entitites_collection = engine.create_collection("entities")
entitites_collection.write_from_csv("../data/reviews/entities.csv")
index_cities(entitites_collection)

Wiping "entities" collection
Creating "entities" collection
http://aips-solr:8983/solr/entities/schema
{
  "responseHeader":{
    "status":0,
    "QTime":584}}

Status: Success
Loading entities
entities Schema: 
root
 |-- id: integer (nullable = true)
 |-- surface_form: string (nullable = true)
 |-- canonical_form: string (nullable = true)
 |-- type: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- semantic_function: string (nullable = true)

Status: Success
Loading Geonames...
Status: Success


## Success!

Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end-to-end semantic search example!

Up next: [Semantic search](2.semantic-search.ipynb)