# [ Chapter 7 - Interpreting Query Intent through Semantic Search ]
# Setting up the Reviews Dataset

In [6]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
spark = SparkSession.builder.appName("AIPS").getOrCreate()
engine = get_engine()

## Download the Datasets

In [7]:
#Get datasets
![ ! -d 'reviews' ] && git clone --depth 1 https://github.com/ai-powered-search/reviews.git
! cd reviews && git pull
! cd reviews && mkdir -p '../data/reviews/' && tar -xvf reviews.tgz -C '../data/reviews/' && tar -xvf entities.tgz -C '../data/reviews/' && tar -xvf cities.tgz -C '../data/reviews/'


Cloning into 'reviews'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 0), reused 5 (delta 0), pack-reused 0[K
Receiving objects: 100% (5/5), 91.74 MiB | 10.59 MiB/s, done.
Already up to date.
._reviews.csv
reviews.csv
entities.csv
._cities.csv
cities.csv


## Reviews Dataset

### Listing 7.5

### Indexing

In [3]:
def index_reviews_collection(reviews_collection):
    print("\nLoading Reviews...")
    csvFile = "data/reviews/reviews.csv"
    reviews_update_opts={"zkhost": "aips-zk", "collection": reviews_collection.name, 
                        "gen_uniq_key": "true", "commit_within": "5000"}
    csvDF = spark.read.csv(csvFile, inferSchema=True, header=True, multiLine=True, escape="\"") \
        .withColumn("poplarity_i", col("stars_i") * 20) \
        .select(
          "id", "name_t", "city_t", "state_t", "text_t", "stars_i", 
          "categories_t",  "location_pt_s", "type_ss", "latitude_d", "longitude_d")
    csvDF.write.format("solr").options(**reviews_update_opts).mode("overwrite").save()
    print("Reviews Schema: ")
    csvDF.printSchema()
    print("Status: Success")

## Index the Reviews Dataset into the Search Engine

### Listing 7.1

In [4]:
reviews_collection = engine.create_collection("reviews")
index_reviews_collection(reviews_collection)

Wiping 'reviews' collection
Status: Success
Creating 'reviews' collection
Status: Success

Loading Reviews...
Reviews Schema: 
root
 |-- id: string (nullable = true)
 |-- name_t: string (nullable = true)
 |-- city_t: string (nullable = true)
 |-- state_t: string (nullable = true)
 |-- text_t: string (nullable = true)
 |-- stars_i: integer (nullable = true)
 |-- categories_t: string (nullable = true)
 |-- location_pt_s: string (nullable = true)
 |-- type_ss: string (nullable = true)
 |-- latitude_d: double (nullable = true)
 |-- longitude_d: double (nullable = true)

Status: Success


## Enities Dataset (Manually-specified Knowledge Graph)

## Collection Creation

## Indexing

# Cities Dataset (Geonames)

In [5]:
#Modify Schema to make some fields explicitly searchable by keyword
#upsert_text_field(jobs_collection, "company_country")
#upsert_text_field(jobs_collection, "job_description")
#upsert_text_field(jobs_collection, "company_description")
#upsert_text_field(products_collection, "longDescription")
#upsert_text_field(products_collection, "manufacturer")

def index_cities(collection):
    print("Loading Geonames...")
    csvFile = "data/reviews/cities.csv"
    entities_update_opts={"zkhost": "aips-zk", "collection": collection.name, 
                          "gen_uniq_key": "true", "commit_within": "5000"}

    from pyspark.sql.types import StructType,StructField, StringType, IntegerType
    from pyspark.sql.functions import concat_ws

    schema = StructType() \
          .add("id",StringType(),True) \
          .add("name",StringType(),True) \
          .add("ascii_name_s",StringType(),True) \
          .add("alternative_names_s",StringType(),True) \
          .add("latitude_s",StringType(),True) \
          .add("longitude_s",StringType(),True) \
          .add("feature_class_s",StringType(),True) \
          .add("feature_code_s",StringType(),True) \
          .add("StringType",StringType(),True) \
          .add("cc2_s",StringType(),True) \
          .add("admin_code_1_s",StringType(),True) \
          .add("admin_code_2_s",StringType(),True) \
          .add("admin_code_3_s",StringType(),True) \
          .add("admin_code_4_s",StringType(),True) \
          .add("population_i",IntegerType(),True) \
          .add("elevation_s",StringType(),True) \
          .add("dem_s",StringType(),True) \
          .add("timezone_s",StringType(),True) \
          .add("modification_date_s",StringType(),True)

    csvDF = spark.read.csv(csvFile, schema=schema, multiLine=True, escape="\\", sep="\t") \
        .withColumn("type", lit("city")) \
        .withColumn("location_p", concat_ws(",", "latitude_s", "longitude_s"))
        #.show()

    csvDF.write.format("solr").options(**entities_update_opts).mode("overwrite").save()
    #print("Entities Schema: ")
    #csvDF.printSchema()
    print("Status: Success")

### Listing 7.4

In [6]:
entitites_collection = engine.create_collection("entities")
entitites_collection.write_from_csv("data/reviews/entities.csv")
index_cities(entitites_collection)

Wiping 'entities' collection
Status: Success
Creating 'entities' collection
Status: Success
Loading entities
entities Schema: 
root
 |-- id: integer (nullable = true)
 |-- surface_form: string (nullable = true)
 |-- canonical_form: string (nullable = true)
 |-- type: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- semantic_function: string (nullable = true)



Py4JJavaError: An error occurred while calling o130.save.
: java.lang.NullPointerException: Cannot invoke "String.split(String)" because the return value of "com.lucidworks.spark.SolrRelation.collection()" is null
	at com.lucidworks.spark.SolrRelation.dynamicSuffixes$lzycompute(SolrRelation.scala:101)
	at com.lucidworks.spark.SolrRelation.dynamicSuffixes(SolrRelation.scala:97)
	at com.lucidworks.spark.SolrRelation.insert(SolrRelation.scala:658)
	at solr.DefaultSource.createRelation(DefaultSource.scala:29)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)


## Success!

Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end to end semantic search example!

Up next: [Semantic search](2.semantic-search.ipynb)