# [ Chapter 7 - Interpreting Query Intent through Semantic Search ]
# Setting up the Reviews Dataset

In [1]:
from pyspark.sql import SparkSession

import aips.data_loaders.cities as cities
import aips.data_loaders.reviews as reviews
from aips import get_engine
from aips.spark.dataframe import from_csv
import aips.indexer
# Start (or reuse) the Spark session used by the dataframe loaders below.
spark = SparkSession.builder.appName("AIPS").getOrCreate()

# This notebook is pinned to Solr: the tagger field type and request handler
# in Listing 7.5 are taken from engines/solr/SolrEngine.py.
engine = get_engine("solr")
# Build the "reviews" collection on the same pinned engine that every later
# cell uses, rather than on whatever get_engine() returns with no argument.
aips.indexer.build_collection(engine, "reviews")

## Reviews Dataset

### Listing 7.1
### Loading and indexing the reviews dataset

In [3]:
# Create the "reviews" collection on the engine chosen in the setup cell
# (the recorded output below shows it being wiped and re-created).
reviews_collection = engine.create_collection("reviews")
# Load the reviews CSV through the project's reviews data loader.
reviews_data = reviews.load_dataframe("data/reviews/reviews.csv")
# Index the loaded reviews into the collection.
reviews_collection.write(reviews_data)

Wiping "reviews" collection
Creating "reviews" collection
Status: Success

Loading Reviews...
root
 |-- id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- content: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- stars_rating: integer (nullable = true)
 |-- location_coordinates: string (nullable = true)

Successfully written 192138 documents


### Listing 7.2 and Figure 7.2 - 7.7

#### Located in the [Semantic Search Application](2.semantic-search.ipynb#listing-7.2) notebook

### Listing 7.3
<a id='listing-7.3'></a>

In [4]:
def display_entities(dataframe, limit=10):
    """Print two views of an entities dataframe.

    First view: the heading "Entities", then up to ``limit`` rows with the
    ``semantic_function`` column dropped; ``truncate=20`` is passed to
    ``show``.

    Second view: the heading "... Entities continued", then only the ``id``
    and ``semantic_function`` columns of the rows whose ``type`` equals
    "semantic_function". ``truncate=False`` is passed to ``show`` and
    ``limit`` is not applied, so ``show`` uses its own default row count.

    Args:
        dataframe: Frame exposing ``drop``, ``filter``, ``select``, ``show``
            and a ``type`` column (a Spark DataFrame in this notebook).
        limit: Maximum number of rows shown in the first view.

    Returns:
        None. The function only prints.
    """
    print("Entities")
    overview = dataframe.drop("semantic_function")
    overview.show(limit, truncate=20)

    print("... Entities continued")
    is_semantic_function = dataframe.type == "semantic_function"
    function_rows = dataframe.filter(is_semantic_function)
    function_rows.select("id", "semantic_function").show(truncate=False)

In [5]:
# Read the entities CSV into a dataframe. log=False is passed to from_csv;
# presumably this suppresses the "Loading ..." / schema messages that appear
# in Listing 7.4's output, where the flag is omitted — confirm in from_csv.
entities_dataframe = from_csv("data/reviews/entities.csv", log=False)
# Show up to 20 entity rows, followed by the semantic_function rows.
display_entities(entities_dataframe, limit=20)

Entities
+---+--------------------+--------------------+-----------------+----------+
| id|        surface_form|      canonical_form|             type|popularity|
+---+--------------------+--------------------+-----------------+----------+
|  1|                near| {location_distance}|semantic_function|        90|
|  2|                  in| {location_distance}|semantic_function|       100|
|  3|                  by| {location_distance}|semantic_function|        90|
|  4|                  by|{text_within_one_...|semantic_function|        10|
|  5|                near|     {text_distance}|semantic_function|        10|
|  6|             popular|           {popular}|semantic_function|       100|
|  7|                 top|           {popular}|semantic_function|       100|
|  8|                best|           {popular}|semantic_function|       100|
|  9|                good|           {popular}|semantic_function|       100|
| 10|              violet|              violet|            color|  

# Cities Dataset (Geonames)

### Listing 7.4
<a id='listing-7.4'></a>

In [6]:
# Create the "entities" collection on the engine chosen in the setup cell.
entities_collection = engine.create_collection("entities")
# Re-read the entities CSV (this rebinds entities_dataframe from the earlier
# cell) and load the cities CSV through the project's cities data loader.
entities_dataframe = from_csv("data/reviews/entities.csv")
cities_dataframe = cities.load_dataframe("data/reviews/cities.csv")
# Both dataframes are written into the same "entities" collection.
entities_collection.write(entities_dataframe)
entities_collection.write(cities_dataframe)

Wiping "entities" collection
Creating "entities" collection
Status: Success
Loading data/reviews/entities.csv
Schema: 
root
 |-- id: integer (nullable = true)
 |-- surface_form: string (nullable = true)
 |-- canonical_form: string (nullable = true)
 |-- type: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- semantic_function: string (nullable = true)

Loading Geonames...
Successfully written 21 documents
Successfully written 137581 documents


## Entities Dataset (Manually-specified Knowledge Graph)

### Listing 7.5
<a id='listing-7.5'></a>

In [7]:
# %load -s add_tag_type_commands,add_tag_request_handler_config engines/solr/SolrEngine.py
# Three schema commands, in order: define the "tag" field type, add a
# "name_tag" field of that type, and copy "surface_form" into "name_tag".
add_tag_type_commands = [
    {
        "add-field-type": {
            "name": "tag",
            "class": "solr.TextField",
            "postingsFormat": "FST50",
            "omitNorms": "true",
            "omitTermFreqAndPositions": "true",
            # Index-time analysis: the same three filters as the query
            # analyzer, followed by a ConcatenateGraphFilterFactory step.
            "indexAnalyzer": {
                "tokenizer": {"class": "solr.StandardTokenizerFactory"},
                "filters": [
                    {"class": "solr.EnglishPossessiveFilterFactory"},
                    {"class": "solr.ASCIIFoldingFilterFactory"},
                    {"class": "solr.LowerCaseFilterFactory"},
                    {
                        "class": "solr.ConcatenateGraphFilterFactory",
                        "preservePositionIncrements": "false",
                    },
                ],
            },
            # Query-time analysis: possessive stripping, ASCII folding and
            # lowercasing only.
            "queryAnalyzer": {
                "tokenizer": {"class": "solr.StandardTokenizerFactory"},
                "filters": [
                    {"class": "solr.EnglishPossessiveFilterFactory"},
                    {"class": "solr.ASCIIFoldingFilterFactory"},
                    {"class": "solr.LowerCaseFilterFactory"},
                ],
            },
        },
    },
    {
        # The field is declared with stored set to "false".
        "add-field": {
            "name": "name_tag",
            "type": "tag",
            "stored": "false",
        },
    },
    {
        "add-copy-field": {
            "source": "surface_form",
            "dest": ["name_tag"],
        },
    },
]

# Config command that registers a "/tag" request handler backed by
# solr.TaggerRequestHandler. Its defaults point at the "name_tag" field,
# sort by popularity descending, and list the fields to return in "fl".
add_tag_request_handler_config = {
    "add-requesthandler": {
        "name": "/tag",
        "class": "solr.TaggerRequestHandler",
        "defaults": {
            "field": "name_tag",
            "json.nl": "map",
            "sort": "popularity desc",
            "matchText": "true",
            "fl": "id,canonical_form,surface_form,type,semantic_function,popularity,country,admin_area,location_coordinates",
        },
    },
}

## Success!

Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end-to-end semantic search example!

Up next: [Semantic search](2.semantic-search.ipynb)