# [ Chapter 7 - Interpreting Query Intent through Semantic Search ]
# Setting up the Reviews Dataset

In [1]:
import aips.data_loaders.cities as cities
import aips.data_loaders.reviews as reviews
from aips import get_engine
from aips.spark import get_spark_session
from aips.spark.dataframe import from_csv

# Obtain the Spark session from the aips Spark helper.
spark = get_spark_session()

# No engine name is passed here, so get_engine() chooses the engine itself
# (presumably the project's configured default — confirm in aips.get_engine).
engine = get_engine()

## Download the Datasets

In [2]:
# Get the datasets.
# Shallow-clone (--depth 1) the reviews repository, but only when no local
# 'reviews' directory exists yet.
![ ! -d 'reviews' ] && git clone --depth 1 https://github.com/ai-powered-search/reviews.git
# Pull the latest changes into the local clone.
! cd reviews && git pull
# From inside the clone: create ../data/reviews/ if needed, then extract the
# reviews, entities and cities archives into it (-v lists each extracted file).
! cd reviews && mkdir -p '../data/reviews/' && tar -xvf reviews.tgz -C '../data/reviews/' && tar -xvf entities.tgz -C '../data/reviews/' && tar -xvf cities.tgz -C '../data/reviews/'

Already up to date.
._reviews.csv
reviews.csv
entities.csv
._cities.csv
cities.csv


In [3]:
# Create the "reviews" collection on the engine requested by name ("solr"),
# load the reviews CSV into a dataframe, and write it to that collection.
# NOTE(review): this cell asks for "solr" explicitly instead of using the
# `engine` object created in the first cell — presumably intentional; confirm.
reviews_collection = get_engine("solr").create_collection("reviews")
reviews_data = reviews.load_dataframe("data/reviews/reviews.csv")
reviews_collection.write(reviews_data)

# Same pattern for "entities": the one collection receives two writes, first
# the dataframe read from entities.csv and then the one loaded from cities.csv.
entities_collection =  get_engine("solr").create_collection("entities")
entities_dataframe = from_csv("data/reviews/entities.csv")
cities_dataframe = cities.load_dataframe("data/reviews/cities.csv")
entities_collection.write(entities_dataframe)
entities_collection.write(cities_dataframe)

Wiping "reviews" collection
Creating "reviews" collection
Status: Success

Loading Reviews...


## Reviews Dataset

### Listing 7.1
### Loading and indexing the reviews dataset

In [4]:
# Create the "reviews" collection on `engine`, passing log=True to
# create_collection (presumably what prints the schema — confirm), then load
# the reviews CSV and write it to the collection.
# NOTE(review): the saved output of this cell ends in an AnalysisException
# ("Cannot find data for output column 'doc_type'"); re-run against the
# intended engine and confirm the write succeeds.
reviews_collection = engine.create_collection("reviews", log=True)
reviews_data = reviews.load_dataframe("data/reviews/reviews.csv")
reviews_collection.write(reviews_data)

Wiping "reviews" collection
Creating "reviews" collection
Schema: {
  "class": "reviews",
  "properties": [
    {
      "name": "__id",
      "dataType": [
        "text"
      ]
    },
    {
      "name": "content",
      "dataType": [
        "text"
      ],
      "fielddata": true
    },
    {
      "name": "categories",
      "dataType": [
        "text"
      ],
      "copy_to": "doc_type",
      "fielddata": true
    },
    {
      "name": "doc_type",
      "dataType": [
        "text"
      ],
      "fielddata": true
    },
    {
      "name": "stars_rating",
      "dataType": [
        "int"
      ]
    },
    {
      "name": "city",
      "dataType": [
        "text"
      ],
      "fielddata": true
    },
    {
      "name": "state",
      "dataType": [
        "text"
      ],
      "fielddata": true
    },
    {
      "name": "business_name",
      "dataType": [
        "text"
      ],
      "fielddata": true
    },
    {
      "name": "name",
      "dataType": [
        "te

AnalysisException: Cannot write incompatible data to table 'Reviews':
- Cannot find data for output column 'doc_type'

### Listing 7.2 and Figures 7.2-7.7

#### Located in the [Semantic Search Application](2.semantic-search.ipynb#listing-7.2) notebook

### Listing 7.3
<a id='listing-7.3'></a>

In [7]:
def display_entities(dataframe, limit=10):
    """Print the entities dataframe as two consecutive tables.

    The first table shows every column except ``semantic_function``. The
    second shows only ``id`` and ``semantic_function`` for the rows whose
    ``type`` equals ``"semantic_function"``.

    Args:
        dataframe: Object exposing ``drop``, ``filter``, ``select`` and
            ``show`` plus a ``type`` column attribute (presumably a PySpark
            DataFrame — confirm against ``from_csv``).
        limit: Passed as the first argument to ``show`` for the first table
            only; the second table is shown without a row argument.

    Returns:
        None. All output is printed.
    """
    print("Entities")
    without_function = dataframe.drop("semantic_function")
    without_function.show(limit, truncate=20)

    print("... Entities continued")
    is_semantic_function = dataframe.type == "semantic_function"
    function_rows = dataframe.filter(is_semantic_function)
    function_rows.select("id", "semantic_function").show(truncate=False)

In [None]:
# Read the entities CSV (log=False is passed to from_csv) and display it with
# limit=20 instead of display_entities' default of 10.
entities_dataframe = from_csv("data/reviews/entities.csv", log=False)
display_entities(entities_dataframe, limit=20)

## Cities Dataset (Geonames)

### Listing 7.4
<a id='listing-7.4'></a>

In [None]:
# Create the "entities" collection on `engine`, then write two dataframes
# into that one collection: the entities read from entities.csv and the
# cities loaded from cities.csv.
entities_collection = engine.create_collection("entities")
entities_dataframe = from_csv("data/reviews/entities.csv")
cities_dataframe = cities.load_dataframe("data/reviews/cities.csv")
entities_collection.write(entities_dataframe)
entities_collection.write(cities_dataframe)

In [None]:
# Print the concrete class of `engine`, i.e. which implementation
# get_engine() returned.
print(type(engine))

## Entities Dataset (Manually-specified Knowledge Graph)

### Listing 7.5
<a id='listing-7.5'></a>

In [10]:
# %load -s add_tag_type_commands,add_tag_request_handler_config engines/solr/SolrEngine.py
# Three schema commands, in order: define the "tag" field type, add a
# "name_tag" field of that type, and copy "surface_form" into "name_tag".
add_tag_type_commands = [
    {
        "add-field-type": {
            "name": "tag",
            "class": "solr.TextField",
            "postingsFormat": "FST50",
            "omitNorms": "true",
            "omitTermFreqAndPositions": "true",
            # Index-time chain: the same filters as the query-time chain,
            # followed by a ConcatenateGraph filter.
            "indexAnalyzer": {
                "tokenizer": {"class": "solr.StandardTokenizerFactory"},
                "filters": [
                    {"class": "solr.EnglishPossessiveFilterFactory"},
                    {"class": "solr.ASCIIFoldingFilterFactory"},
                    {"class": "solr.LowerCaseFilterFactory"},
                    {
                        "class": "solr.ConcatenateGraphFilterFactory",
                        "preservePositionIncrements": "false",
                    },
                ],
            },
            "queryAnalyzer": {
                "tokenizer": {"class": "solr.StandardTokenizerFactory"},
                "filters": [
                    {"class": "solr.EnglishPossessiveFilterFactory"},
                    {"class": "solr.ASCIIFoldingFilterFactory"},
                    {"class": "solr.LowerCaseFilterFactory"},
                ],
            },
        },
    },
    {
        "add-field": {
            "name": "name_tag",
            "type": "tag",
            "stored": "false",
        },
    },
    {
        "add-copy-field": {
            "source": "surface_form",
            "dest": ["name_tag"],
        },
    },
]

# Config command that registers a "/tag" request handler of class
# solr.TaggerRequestHandler; its defaults point at the "name_tag" field,
# sort by "popularity desc", and list the fields returned via "fl".
add_tag_request_handler_config = {
    "add-requesthandler": {
        "name": "/tag",
        "class": "solr.TaggerRequestHandler",
        "defaults": {
            "field": "name_tag",
            "json.nl": "map",
            "sort": "popularity desc",
            "matchText": "true",
            "fl": "id,canonical_form,surface_form,type,semantic_function,popularity,country,admin_area,location_coordinates",
        },
    },
}

## Success!

Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end-to-end semantic search example!

Up next: [Semantic search](2.semantic-search.ipynb)