# Setting up the Knowledge Graph Datasets

In [8]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
spark = SparkSession.builder.appName("ch7").getOrCreate()

## Download the Datasets

## Reviews Dataset

### Collection Creation

In [9]:
import requests
import json

def create_reviews_collection():
    #Create Reviews Collection
    reviews_collection="reviews"

    #delete collection
    requests.get(solr_url + "admin/collections?action=DELETE&name=" + reviews_collection)
    #delete configSet to start from scratch
    requests.get(solr_url + "admin/configs?action=DELETE&name=" + reviews_collection + ".AUTOCREATED")

    create_collection(reviews_collection)
    #add_text_tagger_fields(reviews_collection)

    headers={"Content-type": "application/json"}

    schemaCommands = [
                """{
              "add-field":{"name":"doc_type", "type":"string", "stored":true, "multiValued": true}
            }""",
                """{
              "add-copy-field":{"source":"type_ss", "dest":["doc_type"]}
            }""",
                    """{
              "add-field":{"name":"location_p", "type":"location", "stored":true}
            }""",
                """{
              "add-copy-field":{"source":"location_pt_s", "dest":["location_p"]}
            }"""
    ]

    for schemaCommand in schemaCommands:
        response = requests.post(solr_url + reviews_collection + "/schema", headers=headers, data=schemaCommand)
        #print(response)   
        
def enable_text_tagger(collection):
    headers={"Content-type": "application/json"}
    
    schemaCommands = [
        """{
          "add-field-type":{
            "name":"tag",
            "class":"solr.TextField",
            "postingsFormat":"FST50",
            "omitNorms":true,
            "omitTermFreqAndPositions":true,
            "indexAnalyzer":{
              "tokenizer":{
                 "class":"solr.StandardTokenizerFactory" },
              "filters":[
                {"class":"solr.EnglishPossessiveFilterFactory"},
                {"class":"solr.ASCIIFoldingFilterFactory"},
                {"class":"solr.LowerCaseFilterFactory"},
                {"class":"solr.ConcatenateGraphFilterFactory", "preservePositionIncrements":false }
              ]},
            "queryAnalyzer":{
              "tokenizer":{
                 "class":"solr.StandardTokenizerFactory" },
              "filters":[
                {"class":"solr.EnglishPossessiveFilterFactory"},
                {"class":"solr.ASCIIFoldingFilterFactory"},
                {"class":"solr.LowerCaseFilterFactory"}
              ]}
            }
        }""",
            """{
          "add-field":{"name":"surface_form", "type":"string", "stored":true}
        }""",
            """{
          "add-field":{"name":"canonical_form", "type":"string", "stored":true}
        }""",
            """{
          "add-field":{"name":"name", "type":"text_general"}
        }""",
            """{
          "add-field":{"name":"popularity", "type":"pint", "stored":true}
        }""",
            """{
          "add-field":{"name":"name_tag", "type":"tag", "stored":false}
        }""",
            """{
          "add-copy-field":{"source":"name", "dest":["surface_form", "name_tag", "canonical_form"]}
        }""",
            """{
          "add-copy-field":{"source":"population_i", "dest":["popularity"]}
        }""",
            """{
          "add-copy-field":{"source":"surface_form", "dest":["name_tag"]}
        }"""
    ]
    
    for schemaCommand in schemaCommands:
        response = requests.post(solr_url + collection + "/schema", headers=headers, data=schemaCommand)
        #print(response)    
    
    
    response = requests.post(solr_url + collection + "/config", headers=headers, data="""{
      "add-requesthandler" : {
        "name": "/tag",
        "class":"solr.TaggerRequestHandler",
        "defaults":{"field":"name_tag"}
      }
    }""")
    #print(response)

### Indexing

In [None]:
def index_reviews_collection():
    print("\nLoading Reviews...")
    csvFile = "../data/semantic-search/reviews.csv"
    reviews_collection = "reviews"
    reviews_update_opts={"zkhost": "aips-zk", "collection": reviews_collection, 
                        "gen_uniq_key": "true", "commit_within": "5000"}
    csvDF = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true").option("inferSchema", "true") \
        .option("charset", "utf-8").option("quote", "\"") \
        .option("escape", "\"").option("multiLine","true").option("delimiter", ",") \
        .load(csvFile) \
        .withColumn("poplarity_i", col("stars_i") * 20) \
        .select(
          "id", "name_t", "city_t", "state_t", "text_t", "stars_i", "popularity_i", 
          "categories_t",  "location_pt_s", "type_ss", "latitude_d", "longitude_d")
    csvDF.write.format("solr").options(**reviews_update_opts).mode("overwrite").save()
    print("Reviews Schema: ")
    csvDF.printSchema()
    print("Status: Success")

## Index the Reviews Dataset into the Search Engine

### Listing 7.1

In [10]:
create_reviews_collection()
index_reviews_collection()

Wiping 'reviews' collection
[('action', 'CREATE'), ('name', 'reviews'), ('numShards', 1), ('replicationFactor', 1)]
Creating reviews' collection
Status: Success

Loading Reviews...
Reviews Schema: 
root
 |-- id: string (nullable = true)
 |-- name_t: string (nullable = true)
 |-- city_t: string (nullable = true)
 |-- state_t: string (nullable = true)
 |-- text_t: string (nullable = true)
 |-- stars_i: integer (nullable = true)
 |-- popularity_i: integer (nullable = true)
 |-- categories_t: string (nullable = true)
 |-- location_pt_s: string (nullable = true)
 |-- type_ss: string (nullable = true)
 |-- latitude_d: double (nullable = true)
 |-- longitude_d: double (nullable = true)

Status: Success


## Enities Dataset (Manually-specified Knowledge Graph)

## Collection Creation

In [None]:
def create_entities_collection():
    #Create Entities Collection
    entities_collection="entities"
    #delete collection
    requests.get(solr_url + "admin/collections?action=DELETE&name=" + entities_collection)
    #delete configSet to start from scratch
    requests.get(solr_url + "admin/configs?action=DELETE&name=" + entities_collection + ".AUTOCREATED")

    create_collection(entities_collection)
    enable_text_tagger(entities_collection)

## Indexing

In [None]:
def index_entities():
    entities_collection="entities"
    print("Loading Entities...")
    csvFile = "../data/semantic-search/entities-manual.csv"
    entities_update_opts={"zkhost": "aips-zk", "collection": entities_collection, "gen_uniq_key": "true", "commit_within": "5000"}
    csvDF = spark.read.format("com.databricks.spark.csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .option("charset", "utf-8") \
        .option("quote", "\"") \
        .option("escape", "\\") \
        .option("multiLine","true") \
        .option("delimiter", ",").load(csvFile)
    csvDF.write.format("solr").options(**entities_update_opts).mode("overwrite").save()
    print("Entities Schema: ")
    csvDF.printSchema()
    print("Status: Success")

# Cities Dataset (Geonames)

In [19]:
#Modify Schema to make some fields explicitly searchable by keyword
#upsert_text_field(jobs_collection, "company_country")
#upsert_text_field(jobs_collection, "job_description")
#upsert_text_field(jobs_collection, "company_description")
#upsert_text_field(products_collection, "longDescription")
#upsert_text_field(products_collection, "manufacturer")

def index_cities():
    entities_collection="entities"
    print("Loading Geonames...")
    csvFile = "../data/semantic-search/cities1000.txt"
    entities_update_opts={"zkhost": "aips-zk", "collection": entities_collection, "gen_uniq_key": "true", "commit_within": "5000"}

    from pyspark.sql.types import StructType,StructField, StringType, IntegerType
    from pyspark.sql.functions import concat_ws

    schema = StructType() \
          .add("id",StringType(),True) \
          .add("name",StringType(),True) \
          .add("ascii_name_s",StringType(),True) \
          .add("alternative_names_s",StringType(),True) \
          .add("latitude_s",StringType(),True) \
          .add("longitude_s",StringType(),True) \
          .add("feature_class_s",StringType(),True) \
          .add("feature_code_s",StringType(),True) \
          .add("StringType",StringType(),True) \
          .add("cc2_s",StringType(),True) \
          .add("admin_code_1_s",StringType(),True) \
          .add("admin_code_2_s",StringType(),True) \
          .add("admin_code_3_s",StringType(),True) \
          .add("admin_code_4_s",StringType(),True) \
          .add("population_i",IntegerType(),True) \
          .add("elevation_s",StringType(),True) \
          .add("dem_s",StringType(),True) \
          .add("timezone_s",StringType(),True) \
          .add("modification_date_s",StringType(),True)

    csvDF = spark.read.format("com.databricks.spark.csv") \
        .option("inferSchema", "true") \
        .option("charset", "utf-8") \
        .option("quote", "\"") \
        .option("escape", "\\") \
        .option("multiLine","true") \
        .option("delimiter", "\t") \
        .load(csvFile, schema=schema) \
        .withColumn("type", lit("city")) \
        .withColumn("location_p", concat_ws(",", "latitude_s", "longitude_s"))
        #.show()

    csvDF.write.format("solr").options(**entities_update_opts).mode("overwrite").save()
    print("Entities Schema: ")
    csvDF.printSchema()
    print("Status: Success")

Loading Geonames...
Entities Schema: 
root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- ascii_name_s: string (nullable = true)
 |-- alternative_names_s: string (nullable = true)
 |-- latitude_s: string (nullable = true)
 |-- longitude_s: string (nullable = true)
 |-- feature_class_s: string (nullable = true)
 |-- feature_code_s: string (nullable = true)
 |-- StringType: string (nullable = true)
 |-- cc2_s: string (nullable = true)
 |-- admin_code_1_s: string (nullable = true)
 |-- admin_code_2_s: string (nullable = true)
 |-- admin_code_3_s: string (nullable = true)
 |-- admin_code_4_s: string (nullable = true)
 |-- population_i: integer (nullable = true)
 |-- elevation_s: string (nullable = true)
 |-- dem_s: string (nullable = true)
 |-- timezone_s: string (nullable = true)
 |-- modification_date_s: string (nullable = true)
 |-- type: string (nullable = false)
 |-- location_p: string (nullable = false)

Status: Success


## Starting the Reviews Search Web Server and Launching the Search Page

### Listing 7.2

In [None]:
def get_running_webservers():
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    stop_running_webservers() #in case it was already running
    ! pip install staticmap
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

In [4]:
#Start the web server
start_reviews_search_webserver()

Collecting staticmap
  Downloading staticmap-0.5.5.tar.gz (5.6 kB)
Building wheels for collected packages: staticmap
  Building wheel for staticmap (setup.py) ... [?25ldone
[?25h  Created wheel for staticmap: filename=staticmap-0.5.5-py3-none-any.whl size=6460 sha256=508182f16ea939a507398d81a3b8080d5e30eae6b270da6819ff91a58c54da16
  Stored in directory: /home/jovyan/.cache/pip/wheels/26/a1/4d/0c9eff264ca4fba0b3a4d66e62f5d53f21471cb200719a9aba
Successfully built staticmap
Installing collected packages: staticmap
Successfully installed staticmap-0.5.5
Successfully Started Webserver (pid: 145)!


In [8]:
%%html
<iframe src="http://localhost:2345/search?q=bbq" width="100%" height="800"></iframe>

In [122]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 1005)


## Success!

Now that you've indexed several large text datasets, in the next notebook we will explore the rich graph of semantic relationships embedded within those documents by leveraging Semantic Knowledge Graphs for real-time traversal and ranking of arbitrary relationships within the domains of our datasets.

Up next: [Working with Semantic Knowledge Graphs](3.semantic-knowledge-graph.ipynb)