# Setting up the Knowledge Graph Datasets

In [None]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
spark = SparkSession.builder.appName("ch7").getOrCreate()

## Download the Datasets

In [2]:
#Get datasets
![ ! -d 'reviews' ] && git clone https://github.com/ai-powered-search/reviews.git
! cd reviews && git pull
! cd reviews && mkdir -p '../../data/reviews/' && tar -xvf reviews.tgz -C '../../data/reviews/' && tar -xvf entities.tgz -C '../../data/reviews/' && tar -xvf cities.tgz -C '../../data/reviews/'


Already up to date.
._reviews.csv
reviews.csv
._entities.csv
entities.csv
._cities.csv
cities.csv


## Reviews Dataset

### Collection Creation

In [3]:
import requests
import json

def create_reviews_collection():
    #Create Reviews Collection
    reviews_collection="reviews"

    #delete collection
    requests.get(SOLR_URL + "/admin/collections?action=DELETE&name=" + reviews_collection)
    #delete configSet to start from scratch
    requests.get(SOLR_URL + "/admin/configs?action=DELETE&name=" + reviews_collection + ".AUTOCREATED")

    create_collection(reviews_collection)
    #add_text_tagger_fields(reviews_collection)

    headers={"Content-type": "application/json"}

    schemaCommands = [
            """{
              "add-field-type":{
                "name":"commaDelimited",
                "class":"solr.TextField",
                "positionIncrementGap":100,
                "omitTermFreqAndPositions":true,
                "indexAnalyzer":{
                  "tokenizer":{
                     "class":"solr.PatternTokenizerFactory",
                     "pattern": ",\\\s*"
                  }
                }
              }
            }""",
             """{
              "add-field-type":{
                "name":"pipeDelimited",
                "class":"solr.TextField",
                "positionIncrementGap":100,
                "omitTermFreqAndPositions":true,
                "indexAnalyzer":{
                  "tokenizer":{
                     "class":"solr.PatternTokenizerFactory",
                     "pattern": "\\|\\\s*"
                  }
                }
              }
            }""",
            """{
              "add-field":{"name":"doc_type", "type":"commaDelimited", "stored":true, "multiValued": true}
            }""",
            """{
              "add-copy-field":{"source":"categories_t", "dest":["doc_type"]}
            }""",
            """{
              "add-field":{"name":"location_p", "type":"location", "stored":true}
            }""",
            """{
              "add-copy-field":{"source":"location_pt_s", "dest":["location_p"]}
            }"""
    ]

    for schemaCommand in schemaCommands:
        response = requests.post(SOLR_URL + "/" + reviews_collection + "/schema", headers=headers, data=schemaCommand)        

### Listing 7.5

In [None]:
def enable_text_tagger(collection):
    headers={"Content-type": "application/json"}
    
    schemaCommands = [
        """{
          "add-field-type":{
            "name":"tag",
            "class":"solr.TextField",
            "postingsFormat":"FST50",
            "omitNorms":true,
            "omitTermFreqAndPositions":true,
            "indexAnalyzer":{
              "tokenizer":{
                 "class":"solr.StandardTokenizerFactory" },
              "filters":[
                {"class":"solr.EnglishPossessiveFilterFactory"},
                {"class":"solr.ASCIIFoldingFilterFactory"},
                {"class":"solr.LowerCaseFilterFactory"},
                {"class":"solr.ConcatenateGraphFilterFactory", "preservePositionIncrements":false }
              ]},
            "queryAnalyzer":{
              "tokenizer":{
                 "class":"solr.StandardTokenizerFactory" },
              "filters":[
                {"class":"solr.EnglishPossessiveFilterFactory"},
                {"class":"solr.ASCIIFoldingFilterFactory"},
                {"class":"solr.LowerCaseFilterFactory"}
              ]}
            }
        }""",
        """{
          "add-field":{"name":"surface_form", "type":"string", "stored":true}
        }""",
            """{
          "add-field":{"name":"canonical_form", "type":"string", "stored":true}
        }""",
            """{
          "add-field":{"name":"name", "type":"text_general"}
        }""",
            """{
          "add-field":{"name":"popularity", "type":"pint", "stored":true}
        }""",
            """{
          "add-field":{"name":"name_tag", "type":"tag", "stored":false}
        }""",
            """{
          "add-copy-field":{"source":"name", "dest":["surface_form", "name_tag", "canonical_form"]}
        }""",
            """{
          "add-copy-field":{"source":"population_i", "dest":["popularity"]}
        }""",
            """{
          "add-copy-field":{"source":"surface_form", "dest":["name_tag"]}
        }"""
    ]
    
    for schemaCommand in schemaCommands:
        response = requests.post(SOLR_URL + "/" + collection + "/schema", headers=headers, data=schemaCommand)
        #print(response)    
    
    
    response = requests.post(SOLR_URL + "/" + collection + "/config", headers=headers, data="""{
      "add-requesthandler" : {
        "name": "/tag",
        "class":"solr.TaggerRequestHandler",
        "defaults":{"field":"name_tag"}
      }
    }""")

### Indexing

In [86]:
def index_reviews_collection():
    print("\nLoading Reviews...")
    csvFile = "../data/reviews/reviews.csv"
    reviews_collection = "reviews"
    reviews_update_opts={"zkhost": "aips-zk", "collection": reviews_collection, 
                        "gen_uniq_key": "true", "commit_within": "5000"}
    csvDF = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true") \
        .option("charset", "utf-8").option("quote", "\"") \
        .option("escape", "\"").option("multiLine","true").option("delimiter", ",") \
        .load(csvFile) \
        .withColumn("poplarity_i", col("stars_i") * 20) \
        .select(
          "id", "name_t", "city_t", "state_t", "text_t", "stars_i", 
          "categories_t",  "location_pt_s", "type_ss", "latitude_d", "longitude_d")
    csvDF.write.format("solr").options(**reviews_update_opts).mode("overwrite").save()
    print("Reviews Schema: ")
    csvDF.printSchema()
    print("Status: Success")

In [59]:
def index_reviews_collection():
    print("\nLoading Reviews...")
    csvFile = "../data/reviews/reviews2.csv"
    reviews_collection = "reviews"
    reviews_update_opts={"zkhost": "aips-zk", "collection": reviews_collection, 
                        "gen_uniq_key": "true", "commit_within": "5000"}
    csvDF = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true") \
        .option("charset", "utf-8").option("quote", "\"") \
        .option("escape", "\"").option("multiLine","true").option("delimiter", ",") \
        .load(csvFile) \
        .withColumn("popularity_i", col("aggregatedRating") * 20) \
        .withColumn("name_t", col("name")) \
        .withColumn("city_t", col("address_city")) \
        .withColumn("state_t", col("address_regionCode")) \
        .withColumn("text_t", col("review_text")) \
        .withColumn("stars_i", col("aggregatedRating")) \
        .withColumn("categories_t", col("categories")) \
        .withColumn("type_ss", col("type")) \
        .withColumn("latitude_d", col("address_lat")) \
        .withColumn("longitude_d", col("address_long")) \
        .select("*")
    csvDF.write.format("solr").options(**reviews_update_opts).mode("overwrite").save()
    print("Reviews Schema: ")
    csvDF.printSchema()
    print("Status: Success")
    #        .withColumn("location_pt_s", concat(col("address_lat"), lit(","), col("address_long"))

## Index the Reviews Dataset into the Search Engine

### Listing 7.1

In [87]:
create_reviews_collection()
index_reviews_collection()

Wiping 'reviews' collection
[('action', 'CREATE'), ('name', 'reviews'), ('numShards', 1), ('replicationFactor', 1)]
Creating 'reviews' collection
Status: Success

Loading Reviews...
Reviews Schema: 
root
 |-- id: string (nullable = true)
 |-- name_t: string (nullable = true)
 |-- city_t: string (nullable = true)
 |-- state_t: string (nullable = true)
 |-- text_t: string (nullable = true)
 |-- stars_i: integer (nullable = true)
 |-- categories_t: string (nullable = true)
 |-- location_pt_s: string (nullable = true)
 |-- type_ss: string (nullable = true)
 |-- latitude_d: double (nullable = true)
 |-- longitude_d: double (nullable = true)

Status: Success


## Enities Dataset (Manually-specified Knowledge Graph)

## Collection Creation

In [88]:
def create_entities_collection():
    #Create Entities Collection
    entities_collection="entities"
    #delete collection
    requests.get(SOLR_URL + "/admin/collections?action=DELETE&name=" + entities_collection)
    #delete configSet to start from scratch
    requests.get(SOLR_URL + "/admin/configs?action=DELETE&name=" + entities_collection + ".AUTOCREATED")

    create_collection(entities_collection)
    enable_text_tagger(entities_collection)

## Indexing

In [89]:
def index_entities():
    entities_collection="entities"
    print("Loading Entities...")
    csvFile = "../data/reviews/entities.csv"
    entities_update_opts={"zkhost": "aips-zk", "collection": entities_collection, "gen_uniq_key": "true", "commit_within": "5000"}
    csvDF = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .option("charset", "utf-8") \
        .option("quote", "\"") \
        .option("escape", "\\") \
        .option("multiLine","true") \
        .option("delimiter", ",").load(csvFile)
    csvDF.write.format("solr").options(**entities_update_opts).mode("overwrite").save()
    print("Entities Schema: ")
    csvDF.printSchema()
    print("Status: Success")

# Cities Dataset (Geonames)

In [90]:
#Modify Schema to make some fields explicitly searchable by keyword
#upsert_text_field(jobs_collection, "company_country")
#upsert_text_field(jobs_collection, "job_description")
#upsert_text_field(jobs_collection, "company_description")
#upsert_text_field(products_collection, "longDescription")
#upsert_text_field(products_collection, "manufacturer")

def index_cities():
    entities_collection="entities"
    print("Loading Geonames...")
    csvFile = "../data/reviews/cities.csv"
    entities_update_opts={"zkhost": "aips-zk", "collection": entities_collection, "gen_uniq_key": "true", "commit_within": "5000"}

    from pyspark.sql.types import StructType,StructField, StringType, IntegerType
    from pyspark.sql.functions import concat_ws

    schema = StructType() \
          .add("id",StringType(),True) \
          .add("name",StringType(),True) \
          .add("ascii_name_s",StringType(),True) \
          .add("alternative_names_s",StringType(),True) \
          .add("latitude_s",StringType(),True) \
          .add("longitude_s",StringType(),True) \
          .add("feature_class_s",StringType(),True) \
          .add("feature_code_s",StringType(),True) \
          .add("StringType",StringType(),True) \
          .add("cc2_s",StringType(),True) \
          .add("admin_code_1_s",StringType(),True) \
          .add("admin_code_2_s",StringType(),True) \
          .add("admin_code_3_s",StringType(),True) \
          .add("admin_code_4_s",StringType(),True) \
          .add("population_i",IntegerType(),True) \
          .add("elevation_s",StringType(),True) \
          .add("dem_s",StringType(),True) \
          .add("timezone_s",StringType(),True) \
          .add("modification_date_s",StringType(),True)

    csvDF = spark.read.format("com.databricks.spark.csv") \
        .option("inferSchema", "true") \
        .option("charset", "utf-8") \
        .option("quote", "\"") \
        .option("escape", "\\") \
        .option("multiLine","true") \
        .option("delimiter", "\t") \
        .load(csvFile, schema=schema) \
        .withColumn("type", lit("city")) \
        .withColumn("location_p", concat_ws(",", "latitude_s", "longitude_s"))
        #.show()

    csvDF.write.format("solr").options(**entities_update_opts).mode("overwrite").save()
    #print("Entities Schema: ")
    #csvDF.printSchema()
    print("Status: Success")

### Listing 17.4

In [91]:
create_entities_collection()
index_entities()
index_cities()

Wiping 'entities' collection
[('action', 'CREATE'), ('name', 'entities'), ('numShards', 1), ('replicationFactor', 1)]
Creating 'entities' collection
Status: Success
Loading Entities...
Entities Schema: 
root
 |-- id: integer (nullable = true)
 |-- surface_form: string (nullable = true)
 |-- canonical_form: string (nullable = true)
 |-- type: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- command_function: string (nullable = true)

Status: Success
Loading Geonames...
Entities Schema: 
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ascii_name_s: string (nullable = true)
 |-- alternative_names_s: string (nullable = true)
 |-- latitude_s: string (nullable = true)
 |-- longitude_s: string (nullable = true)
 |-- feature_class_s: string (nullable = true)
 |-- feature_code_s: string (nullable = true)
 |-- StringType: string (nullable = true)
 |-- cc2_s: string (nullable = true)
 |-- admin_code_1_s: string (nullable = true)
 |-- admin_c

## Success!

Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end to end semantic search example!

Up next: [Semantic search](2.semantic-search.ipynb)