# [ Chapter 7 - Interpreting Query Intent through Semantic Search ]
# Setting up the Reviews Dataset

In [None]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
# Create (or reuse, via getOrCreate) the Spark session named "AIPS"; the
# indexing functions below read their CSV files through this `spark` object.
spark = SparkSession.builder.appName("AIPS").getOrCreate()
# Search-engine handle used below to create collections.
# NOTE(review): get_engine is presumably supplied by `from aips import *` — confirm.
engine = get_engine()

## Download the Datasets

In [None]:
# Get datasets
# Clone the reviews repository (shallow, depth 1) only if no 'reviews'
# directory exists yet.
![ ! -d 'reviews' ] && git clone --depth 1 https://github.com/ai-powered-search/reviews.git
# Bring an existing checkout up to date. Every `!` line is a separate shell
# command, which is why `cd reviews` is repeated on each line.
! cd reviews && git pull
# Unpack the three archives (reviews, entities, cities) into ../data/reviews/
# relative to the checkout, i.e. the data/reviews/ path the loaders below read.
! cd reviews && mkdir -p '../data/reviews/' && tar -xvf reviews.tgz -C '../data/reviews/' && tar -xvf entities.tgz -C '../data/reviews/' && tar -xvf cities.tgz -C '../data/reviews/'


## Reviews Dataset

### Listing 7.5

### Indexing

In [None]:
def index_reviews_collection(reviews_collection):
    """Load the reviews CSV with Spark and index it into ``reviews_collection``.

    Reads ``data/reviews/reviews.csv`` (header row, inferred column types,
    multi-line quoted values), keeps the review fields listed below, adds a
    derived ``popularity_i`` column (``stars_i * 20``), writes the result
    through the Spark "solr" data source and prints the resulting schema.

    Args:
        reviews_collection: collection object; only its ``name`` attribute is
            read, to select the target collection.
    """
    print("\nLoading Reviews...")
    csvFile = "data/reviews/reviews.csv"
    reviews_update_opts = {"zkhost": "aips-zk", "collection": reviews_collection.name,
                           "gen_uniq_key": "true", "commit_within": "5000"}
    # The derived column is added AFTER the select so that it is actually part
    # of the written DataFrame (a column added before a select that does not
    # list it is silently dropped).
    csvDF = spark.read.csv(csvFile, inferSchema=True, header=True, multiLine=True, escape="\"") \
        .select(
          "id", "name_t", "city_t", "state_t", "text_t", "stars_i",
          "categories_t", "location_pt_s", "type_ss", "latitude_d", "longitude_d") \
        .withColumn("popularity_i", col("stars_i") * 20)
    csvDF.write.format("solr").options(**reviews_update_opts).mode("overwrite").save()
    print("Reviews Schema: ")
    csvDF.printSchema()
    print("Status: Success")

## Index the Reviews Dataset into the Search Engine

### Listing 7.1

In [None]:
reviews_collection = engine.create_collection("reviews")
index_reviews_collection(reviews_collection)

## Entities Dataset (Manually-specified Knowledge Graph)

## Cities Dataset (Geonames)

In [None]:
#Modify Schema to make some fields explicitly searchable by keyword
#upsert_text_field(jobs_collection, "company_country")
#upsert_text_field(jobs_collection, "job_description")
#upsert_text_field(jobs_collection, "company_description")
#upsert_text_field(products_collection, "longDescription")
#upsert_text_field(products_collection, "manufacturer")

def index_cities(collection):
    """Load the tab-separated cities file with Spark and index it into ``collection``.

    Reads ``data/reviews/cities.csv`` with an explicit schema, tags every row
    with ``type = "city"``, builds a ``location_p`` value of the form
    ``"<latitude>,<longitude>"`` and writes the rows through the Spark "solr"
    data source.

    Args:
        collection: collection object; only its ``name`` attribute is read, to
            select the target collection.
    """
    print("Loading Geonames...")
    csvFile = "data/reviews/cities.csv"
    entities_update_opts = {"zkhost": "aips-zk", "collection": collection.name,
                            "gen_uniq_key": "true", "commit_within": "5000"}

    from pyspark.sql.types import StructType, StringType, IntegerType
    from pyspark.sql.functions import concat_ws

    # The file has no header row here (none is requested from the reader), so
    # columns are assigned by position.
    # NOTE(review): the order is assumed to follow the GeoNames "geoname"
    # table layout — confirm against cities.csv.
    schema = StructType() \
          .add("id", StringType(), True) \
          .add("name", StringType(), True) \
          .add("ascii_name_s", StringType(), True) \
          .add("alternative_names_s", StringType(), True) \
          .add("latitude_s", StringType(), True) \
          .add("longitude_s", StringType(), True) \
          .add("feature_class_s", StringType(), True) \
          .add("feature_code_s", StringType(), True) \
          .add("country_code_s", StringType(), True) \
          .add("cc2_s", StringType(), True) \
          .add("admin_code_1_s", StringType(), True) \
          .add("admin_code_2_s", StringType(), True) \
          .add("admin_code_3_s", StringType(), True) \
          .add("admin_code_4_s", StringType(), True) \
          .add("population_i", IntegerType(), True) \
          .add("elevation_s", StringType(), True) \
          .add("dem_s", StringType(), True) \
          .add("timezone_s", StringType(), True) \
          .add("modification_date_s", StringType(), True)

    csvDF = spark.read.csv(csvFile, schema=schema, multiLine=True, escape="\\", sep="\t") \
        .withColumn("type", lit("city")) \
        .withColumn("location_p", concat_ws(",", "latitude_s", "longitude_s"))

    csvDF.write.format("solr").options(**entities_update_opts).mode("overwrite").save()
    print("Status: Success")

### Listing 7.4

In [6]:
# Create the "entities" collection, load the manually-specified entities from
# CSV into it, then add the city records to the same collection.
entities_collection = engine.create_collection("entities")
entities_collection.write_from_csv("data/reviews/entities.csv")
index_cities(entities_collection)

Status: Success


## Success!

Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end-to-end semantic search example!

Up next: [Semantic search](2.semantic-search.ipynb)