# [ Chapter 7 - Interpreting Query Intent through Semantic Search ]
# Setting up the Reviews Dataset

In [1]:
import sys

sys.path.append("..")
import requests
from aips.environment import SOLR_URL
from aips import get_engine
from pyspark.sql import SparkSession
import aips.data_loaders.reviews as reviews
import aips.data_loaders.cities as cities
from aips.spark.dataframe import from_csv

# Get (or create) the Spark session registered under the app name "AIPS".
spark = SparkSession.builder.appName("AIPS").getOrCreate()
# Engine handle used by the cells below to create the collections that the
# dataframes are written into.
engine = get_engine()

## Download the Datasets

In [2]:
# Get datasets: clone the reviews repository if no local 'reviews' directory
# exists yet, pull the latest changes, then unpack the reviews, entities and
# cities archives into ../../data/reviews/ (relative to the cloned directory).
![ ! -d 'reviews' ] && git clone --depth 1 https://github.com/ai-powered-search/reviews.git
! cd reviews && git pull
! cd reviews && mkdir -p '../../data/reviews/' && tar -xvf reviews.tgz -C '../../data/reviews/' && tar -xvf entities.tgz -C '../../data/reviews/' && tar -xvf cities.tgz -C '../../data/reviews/'

Cloning into 'reviews'...
remote: Enumerating objects: 5, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 5 (delta 0), reused 5 (delta 0), pack-reused 0[K
Receiving objects: 100% (5/5), 91.74 MiB | 3.24 MiB/s, done.
Already up to date.
._reviews.csv
reviews.csv
entities.csv
._cities.csv
cities.csv


## Reviews Dataset

### Listing 7.1
### Loading and indexing the reviews dataset

In [3]:
# Create the "reviews" collection first (the recorded output shows an existing
# collection being wiped and re-created), then load the reviews CSV into a
# dataframe and write that dataframe into the collection.
reviews_collection = engine.create_collection("reviews")
reviews_data = reviews.load_dataframe("../data/reviews/reviews.csv")
reviews_collection.write(reviews_data)

Wiping "reviews" collection
Creating "reviews" collection
Status: Success

Loading Reviews...
root
 |-- id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- content: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- stars_rating: integer (nullable = true)
 |-- location_coordinates: string (nullable = true)



## Entities Dataset (Manually-specified Knowledge Graph)

### Listing 7.5

In [4]:
# %load -n SolrEngine.add_tag_field_type
def add_tag_field_type(self, collection):
    """Post an "add-field-type" command defining the "tag" field type.

    The command is sent to the collection's schema endpoint. The endpoint URL
    is printed first, followed by the text of the HTTP response.

    The index-time and query-time analyzers use the same tokenizer and the
    same three leading filters; the index-time analyzer additionally ends
    with a ConcatenateGraphFilterFactory.

    Args:
        self: Not used by the body.
        collection: Object whose ``name`` attribute identifies the target
            collection in the request URL.

    Returns:
        None.
    """
    tokenizer = {"class": "solr.StandardTokenizerFactory"}
    # Filters applied by both analyzers, in this order.
    shared_filters = [
        {"class": "solr.EnglishPossessiveFilterFactory"},
        {"class": "solr.ASCIIFoldingFilterFactory"},
        {"class": "solr.LowerCaseFilterFactory"},
    ]
    # Extra final filter used only on the index side.
    concatenate_filter = {
        "class": "solr.ConcatenateGraphFilterFactory",
        "preservePositionIncrements": "false",
    }
    tag_field_type = {
        "name": "tag",
        "class": "solr.TextField",
        "postingsFormat": "FST50",
        "omitNorms": "true",
        "omitTermFreqAndPositions": "true",
        "indexAnalyzer": {
            "tokenizer": tokenizer,
            "filters": shared_filters + [concatenate_filter],
        },
        "queryAnalyzer": {
            "tokenizer": tokenizer,
            "filters": shared_filters,
        },
    }

    schema_url = f"{SOLR_URL}/{collection.name}/schema"
    print(schema_url)
    response = requests.post(schema_url, json={"add-field-type": tag_field_type})
    print(response.text)

In [5]:
# %load -n SolrEngine.add_tag_request_handler
def add_tag_request_handler(self, collection, request_name, field):
    """Post an "add-requesthandler" command for a TaggerRequestHandler.

    The command is sent to the collection's config endpoint.

    Args:
        self: Not used by the body.
        collection: Object whose ``name`` attribute identifies the target
            collection in the request URL.
        request_name: Value sent as the handler's "name".
        field: Value sent as the handler's default "field".

    Returns:
        The response object returned by ``requests.post``.
    """
    handler_defaults = {
        "field": field,
        "json.nl": "map",
        "sort": "popularity desc",
        "matchText": "true",
        "fl": "id,canonical_form,type,semantic_function,popularity,country,admin_area,location_coordinates",
    }
    handler_definition = {
        "name": request_name,
        "class": "solr.TaggerRequestHandler",
        "defaults": handler_defaults,
    }
    config_url = f"{SOLR_URL}/{collection.name}/config"
    return requests.post(config_url, json={"add-requesthandler": handler_definition})

### Listing 7.3

In [6]:
def display_entities(dataframe, limit=10):
    """Print an "Entities" heading, then show the entity rows.

    The ``semantic_function`` column is dropped before display.

    Args:
        dataframe: Object providing ``drop(...)`` whose result provides
            ``show(...)`` (a Spark DataFrame in this notebook).
        limit: Row count passed to ``show``.

    Returns:
        None.
    """
    print("Entities")
    without_semantic_function = dataframe.drop("semantic_function")
    without_semantic_function.show(limit, truncate=20)

def display_semantic_function_entities(dataframe, limit=10):
    """Print a continuation heading, then show the semantic-function entities.

    Keeps only rows whose ``type`` equals "semantic_function", selects the
    ``id`` and ``semantic_function`` columns, and shows them untruncated.

    Args:
        dataframe: Object providing a ``type`` column attribute plus
            ``filter(...)``, ``select(...)`` and ``show(...)`` (a Spark
            DataFrame in this notebook).
        limit: Maximum number of rows to show.

    Returns:
        None.
    """
    print("... Entities continued")
    # Forward `limit` to show(); previously it was accepted but ignored, so
    # the row count silently fell back to show()'s own default.
    dataframe.filter(dataframe.type == "semantic_function") \
        .select("id", "semantic_function").show(limit, truncate=False)

In [7]:
# Load the entities CSV into a dataframe, then print it in two parts: the
# general entity columns (asking for up to 20 rows), followed by the id and
# semantic_function values of the semantic-function entities.
entities_dataframe = from_csv("../data/reviews/entities.csv")
display_entities(entities_dataframe, limit=20)
display_semantic_function_entities(entities_dataframe)

Loading ../data/reviews/entities.csv
Schema: 
root
 |-- id: integer (nullable = true)
 |-- surface_form: string (nullable = true)
 |-- canonical_form: string (nullable = true)
 |-- type: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- semantic_function: string (nullable = true)

Entities
+---+--------------------+--------------------+-----------------+----------+
| id|        surface_form|      canonical_form|             type|popularity|
+---+--------------------+--------------------+-----------------+----------+
|  1|                near| {location_distance}|semantic_function|        90|
|  2|                  in| {location_distance}|semantic_function|       100|
|  3|                  by| {location_distance}|semantic_function|        90|
|  4|                  by|{text_within_one_...|semantic_function|        10|
|  5|                near|     {text_distance}|semantic_function|        10|
|  6|             popular|           {popular}|semantic_function|     

# Cities Dataset (Geonames)

### Listing 7.4

In [9]:
# Create the "entities" collection, then write both the manually specified
# entities and the cities data into that same collection.
entities_collection = engine.create_collection("entities")
entities_dataframe = from_csv("../data/reviews/entities.csv")
cities_dataframe = cities.load_dataframe("../data/reviews/cities.csv")
entities_collection.write(entities_dataframe)
entities_collection.write(cities_dataframe)

Wiping "entities" collection
Creating "entities" collection
Status: Success
Loading ../data/reviews/entities.csv
Schema: 
root
 |-- id: integer (nullable = true)
 |-- surface_form: string (nullable = true)
 |-- canonical_form: string (nullable = true)
 |-- type: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- semantic_function: string (nullable = true)

Loading Geonames...


## Success!

Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end-to-end semantic search example!

Up next: [Semantic search](2.semantic-search.ipynb)