# Setting up the Knowledge Graph Datasets

In [4]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col
spark = SparkSession.builder.appName("ch7").getOrCreate()

## Starting the Reviews Search Web Server and Launching the Search Page

In [5]:
def get_running_webservers():
    # Returns the process ids of every running `start-webserver.py` process.
    #
    # The IPython `!` capture yields the command's output as a list of lines,
    # so the result is a list of pid strings (empty when nothing matches).
    #   - `ps -ef` lists the processes,
    #   - the `[s]` bracket in the grep pattern keeps the grep process itself
    #     from matching its own command line,
    #   - awk prints the second column of each matching line (the pid).
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    # Force-kills every webserver process reported by get_running_webservers(),
    # printing one "Stopping webserver" line per pid. Does nothing when the
    # list is empty.
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        # IPython substitutes the Python value of `pid` for `{pid}` before the
        # shell runs the command; `kill -9` sends SIGKILL.
        # NOTE(review): `results` is never read — presumably the assignment is
        # only there to capture (and so hide) the command's output; confirm.
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    # (Re)starts ../webserver/start-webserver.py as a background process and
    # prints its pid when a matching process can be found afterwards.
    stop_running_webservers() #in case it was already running
    ! pip install staticmap
    # Replaces the IPython shell's `system` hook with os.system. The
    # replacement is not undone in this function, so it stays in effect for
    # the rest of the kernel session.
    # NOTE(review): presumably done so the backgrounded (`&`) command below
    # returns immediately instead of blocking the cell — confirm.
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    # The pid lookup runs right after the launch; nothing is printed if the
    # process is not visible yet (or failed to start).
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [6]:
#Start the web server
start_reviews_search_webserver()

Stopping webserver (pid: 441)
Successfully Started Webserver (pid: 2731)!


172.19.0.1 - - [12/Dec/2022 02:12:46] "POST /process_basic_query HTTP/1.1" 200 -
172.19.0.1 - - [12/Dec/2022 02:12:52] "GET /search?q=bbq HTTP/1.1" 304 -
172.19.0.1 - - [12/Dec/2022 02:12:52] "GET /search?q=bbq HTTP/1.1" 304 -
172.19.0.1 - - [12/Dec/2022 02:12:59] "POST /process_basic_query HTTP/1.1" 200 -
172.19.0.1 - - [12/Dec/2022 02:13:12] "POST /process_basic_query HTTP/1.1" 200 -
172.19.0.1 - - [12/Dec/2022 02:22:19] "POST /process_basic_query HTTP/1.1" 200 -
172.19.0.1 - - [12/Dec/2022 02:22:29] "GET /semantic-search?q=bbq HTTP/1.1" 200 -
172.19.0.1 - - [12/Dec/2022 02:22:29] "GET /semantic-search?q=bbq HTTP/1.1" 200 -
172.19.0.1 - - [12/Dec/2022 02:22:41] "POST /process_semantic_query HTTP/1.1" 200 -


In [6]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>


172.31.0.1 - - [05/Oct/2022 16:07:22] "POST /process_basic_query HTTP/1.1" 200 -


### Listing 7.3

In [17]:
! cat ../data/reviews/entities.csv

id,surface_form,canonical_form,type,popularity,command_function
1,near,{location_distance},command,90,"cmd_location_distance(query, position)"
2,in,{location_distance},command,100,"cmd_location_distance(query, position)"
3,by,{location_distance},command,90,"cmd_location_distance(query, position)"
4,by,{text_within_one_edit_distance},command,10,"cmd_text_within_one_edit_distance(query, position)"
5,near,{text_distance},command,10,"cmd_text_distance(query, position)"
6,popular,{popular},command,100,"cmd_popularity(query, position)"
7,top,{popular},command,100,"cmd_popularity(query, position)"
8,best,{popular},command,100,"cmd_popularity(query, position)"
9,good,{popular},command,100,"cmd_popularity(query, position)"
10,violet,violet,color,100,
11,violet crowne,violet crowne,brand,100,
12,violet crowne charlottesville,violet crowne charlottesville,movie_theater,100,
13,violet crown,violet crowne,brand,100,
14,violet crown charlottesville,violet crowne charlottesville,movie_theater,100,
15

0

In [7]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 2731)


### Listing 7.6

In [16]:
import json;

def tag_query(post_body):
    """Send query text to the Solr tagger on the `entities` collection.

    `post_body` is posted as the request body. The handler is asked to return
    maps for named lists, sort matching docs by popularity (descending), echo
    the matched text, and return only the entity fields listed in `fl`.

    Returns the raw response body as text (not parsed).
    """
    tagger_endpoint = SOLR_URL + '/entities/tag?json.nl=map&sort=popularity%20desc&matchText=true&echoParams=none&fl=id,type,canonical_form,country:countrycode_s,admin_area:admin_code_1_s,popularity,*_p,command_function'
    tagger_reply = requests.post(tagger_endpoint, post_body)
    return tagger_reply.text

json.loads(tag_query("top kimchi near charlotte"))

{'responseHeader': {'status': 0, 'QTime': 0},
 'tagsCount': 3,
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['4460243', '4612828', '4680560', '4988584', '5234793']}],
 'response': {'numFound': 8,
  'start': 0,
  'docs': [{'id': '1',
    'canonical_form': '{location_distance}',
    'type': 'command',
    'popularity': 90,
    'command_function': 'cmd_location_distance(query, position)'},
   {'id': '5',
    'canonical_form': '{text_distance}',
    'type': 'command',
    'popularity': 10,
    'command_function': 'cmd_text_distance(query, position)'},
   {'id': '7',
    'canonical_form': '{popular}',
    'type': 'command',
    'popularity': 100,
    'command_function': 'cmd_popularity(query, position)'},
   {'id': '4460243',
    'canonical_form': 'Charlotte',
    'popularity': 827097,
    'type': 

### Listing 7.7

In [20]:
def process_semantic_query(query_bytes):
    """Tag a raw query, build its query tree, and resolve it to a query string.

    Args:
        query_bytes: the UTF-8 encoded query text. The bytes are sent to the
            tagger unchanged; the decoded text is used for slicing out the
            untagged spans between tagger matches.

    Returns:
        A dict with the keys:
          - "query_tree": list of nodes (tagger docs for recognised spans,
            keyword nodes for everything else), as left by resolve_query,
          - "tagger_data": the parsed tagger response,
          - "tagged_query": the query with recognised spans wrapped in braces,
          - "parsed_query": a human-readable dump of the nodes,
          - "resolved_query": output of query_tree_to_resolved_string.

    Raises:
        KeyError: if the tagger response lacks 'response'/'tags', or a tag
            references an id that is not among the returned docs.
    """
    text = query_bytes.decode('UTF-8')
    tagged_response = json.loads(tag_query(query_bytes))
    query_tree, tagged_query, parsed_query, doc_map = [], "", "", {}

    # Index the returned entity docs by id so tags can be joined to them.
    if (tagged_response['response'] and tagged_response['response']['docs']):
        for doc in tagged_response['response']['docs']:
            doc_map[doc['id']] = doc

    if (tagged_response['tags'] is not None):
        lastEnd = 0

        for tag in tagged_response['tags']:
            matchText, doc_ids, best_doc_id = tag['matchText'], tag['ids'], None

            # A surface form can map to several entities; keep the most
            # popular one (the earliest id wins a tie).
            for doc_id in doc_ids:
                if (best_doc_id):
                    if (doc_map[doc_id]['popularity'] > doc_map[best_doc_id]['popularity']):
                        best_doc_id = doc_id
                else:
                    best_doc_id = doc_id
            best_doc = doc_map[best_doc_id]

            # Any text between the previous match and this one is an
            # unrecognised keyword.
            nextText = text[lastEnd:tag['startOffset']].strip()
            if (len(nextText) > 0):
                query_tree.append({ "type":"keyword", "known":False, "surface_form":nextText, "canonical_form":nextText })
                tagged_query += " " + nextText
                parsed_query += " " + "{ type:keyword, known: false, surface_form: \"" + nextText + "\"}"
            query_tree.append(best_doc)

            tagged_query += " {" + matchText + "}"
            parsed_query += json.dumps(best_doc)
            lastEnd = tag['endOffset']

        # Trailing text after the last match is also a keyword.
        if (lastEnd < len(text)):
            finalText = text[lastEnd:len(text)].strip()
            if (len(finalText) > 0):
                query_tree.append({ "type":"keyword", "known":False, "surface_form":finalText, "canonical_form":finalText })
                tagged_query += " " + finalText
                parsed_query += " " + "{ type:keyword, known: false, surface_form: \"" + finalText + "\"}"

    # NOTE(review): resolve_query is defined outside this cell and its return
    # value was never used. The next call reads each node's 'query', so
    # resolve_query is presumably expected to rewrite query_tree in place —
    # confirm.
    resolve_query(query_tree)
    resolved_query = query_tree_to_resolved_string(query_tree)

    # Fixed: this previously referenced the undefined name `query_treey`,
    # which raised NameError on every call.
    response = {
        "query_tree": query_tree, "tagger_data": tagged_response, "tagged_query": tagged_query, "parsed_query": parsed_query, "resolved_query": resolved_query,
    }
    return response

### Listing 7.8

In [21]:
def cmd_popularity(query, position):
    """Replace the command node at `position` with a star-rating boost.

    The command only applies when at least one more node follows it in
    `query['query_tree']`. In that case the node is overwritten in place with
    a Solr function-query node and True is returned; otherwise the tree is
    left untouched and False is returned.
    """
    tree = query['query_tree']
    if position >= len(tree) - 1:
        return False

    tree[position] = {"type":"solr", "query": '+{!func v="mul(if(stars_i,stars_i,0),20)"}' }
    return True

### Listing 7.9

In [22]:
def cmd_location_distance(query, position):
    """Turn "<command> <city>" into a geo filter on `location_p`.

    When the node right after `position` in `query['query_tree']` has type
    "city", that node is removed and the command node is replaced in place by
    a Solr node whose query is a 50 km geo filter built from the city's
    `location_p` value. Returns True when the rewrite happened, otherwise
    False (no following node, or the following node is not a city).
    """
    tree = query['query_tree']

    if position < len(tree) - 1:
        following = tree[position + 1]
        if following['type'] == "city":
            # The city is consumed by the command, so drop it before
            # overwriting the command node itself.
            del tree[position + 1]
            geo_query = create_geo_filter(following['location_p'], "location_p", 50)
            tree[position] = {"type":"solr", "query": geo_query}
            return True

    return False

def create_geo_filter(coordinates, field, distanceInKM):
    """Build a mandatory Solr `{!geofilt}` clause.

    `distanceInKM` is stringified into the `d` parameter, `field` becomes the
    quoted `sfield` and `coordinates` the quoted `pt`. `field` and
    `coordinates` must already be strings.
    """
    fragments = [
        "+{!geofilt d=", str(distanceInKM),
        " sfield=\"", field,
        "\" pt=\"", coordinates,
        "\"}",
    ]
    return "".join(fragments)

### Listing 7.10

In [23]:
def get_category_and_term_vector_solr_response(keyword):
    """Ask Solr for the terms and the doc_type most related to `keyword`.

    Builds a JSON request with no returned documents (`limit` 0) and one
    query facet on `keyword` holding two nested terms facets, both sorted by
    `relatedness($fore,$back)` where the foreground is `keyword` and the
    background is `*:*`:
      - "related_terms": top 3 values of `text_t`,
      - "doc_type": top 1 value of `doc_type`.

    The request is a dict, so it is POSTed as a JSON body to the reviews
    select handler. It was previously handed to run_search, which URL-quotes
    its argument as plain query text and therefore cannot accept a dict.

    Args:
        keyword: the keyword text used as both the facet query and the
            relatedness foreground.

    Returns:
        The parsed JSON response.
    """
    query = {
        "params": { "fore": keyword, "back": "*:*", "df": "text_t" },
        "query": "*:*", "limit": 0,
        "facet": {
            "term_needing_vector": {
                "type": "query", "query": keyword,
                "facet": {
                    "related_terms" : {
                        "type" : "terms", "field" : "text_t", "limit": 3, "sort": { "r1": "desc" },
                        "facet" : { "r1" : "relatedness($fore,$back)" }},
                    "doc_type" : {
                        "type" : "terms", "field" : "doc_type", "limit": 1, "sort": { "r2": "desc" },
                        "facet" : { "r2" : "relatedness($fore,$back)"  }}}}}}

    response = requests.post(SOLR_URL + "/reviews/select", json=query).text
    return json.loads(response)

### Listing 7.11

In [25]:
def process_query_tree(query_tree):
    """Convert every non-Solr node of the query tree into a Solr query node.

    The tree is first passed through process_semantic_functions. Then each
    node that is not already of type "solr" is replaced, at the same
    position, by `{"type": "solr", "query": ...}`:
      - "keyword": an edismax query over the node's term vector (with a
        `+doc_type` clause appended when both a term vector and a category
        were found), falling back to the surface form,
      - "known_item" / "city" / "event": exact match on `name_s`,
      - "brand": exact match on `brand_s`,
      - anything else: an edismax query over the surface form.

    Returns the tree that came back from process_semantic_functions.
    """
    query_tree = process_semantic_functions(query_tree)

    for index, node in enumerate(query_tree):
        node_type = node["type"]
        if node_type == "solr":
            continue

        if node_type == "keyword":
            raw_response = get_category_and_term_vector_solr_response(node["surface_form"])
            enrichment = parse_category_and_term_vector_from_solr_response(raw_response)

            expanded = enrichment["term_vector"] if "term_vector" in enrichment else ""

            # The category clause is only added when there is term-vector
            # text to attach it to; a category on its own is ignored.
            if "category" in enrichment and len(expanded) > 0:
                expanded += " "
                expanded += "+doc_type:\"" + enrichment["category"] + "\""

            if len(expanded) == 0:
                expanded = node["surface_form"]

            solr_query = "+{!edismax v=\"" + escapeQuotesInQuery(expanded) + "\"}"
        elif node_type in ("known_item", "city", "event"):
            solr_query = "+name_s:\"" + node["canonical_form"] + "\""
        elif node_type == "brand":
            solr_query = "+brand_s:\"" + node["canonical_form"] + "\""
        else:
            solr_query = "+{!edismax v=\"" + escapeQuotesInQuery(node["surface_form"]) + "\"}"

        query_tree[index] = {"type":"solr", "query": solr_query}

    return query_tree

### Listing 7.12

In [28]:
def query_tree_to_resolved_string(query_tree):
    """Concatenate the `query` of every node into one string.

    A single space is inserted before a node's query only when the text
    accumulated so far is non-empty, so leading empty queries do not produce
    leading spaces. Every node must have a 'query' key.
    """
    combined = ""
    for node in query_tree:
        separator = " " if len(combined) > 0 else ""
        combined = combined + separator + node['query']
    return combined

def run_search(text):
    """Run a text query against the reviews collection and return the raw body.

    `text` is percent-encoded and sent as `q` in a GET request to the select
    handler, with `qf` set to "text_t" and `defType` set to "lucene". The
    response is returned as text (not parsed).
    """
    qf = "text_t"
    defType = "lucene"
    encoded_query = urllib.parse.quote(text)

    select_url = SOLR_URL + "/reviews/select?q=" + encoded_query + "&qf=" + qf + "&defType=" + defType
    return requests.get(select_url).text

## Success!
