# Semantic Search Application

In [1]:
import sys
# Extend the module search path with the parent directory and a "webserver" directory
# so the two imports below can be resolved from this notebook.
sys.path.append('..')
sys.path.append("webserver")
# NOTE(review): these star imports hide where later names (e.g. os, json, get_engine,
# generate_query_tree) come from — presumably one of these two modules; confirm.
from aips import *
from webserver.semantic_search.engine import *
# Engine handle that later cells use to look up collections and tag queries.
engine = get_engine()

## Starting the Reviews Search Web Server and Launching the Search Page

In [None]:
def get_running_webservers():
    """Return the captured output lines of a `ps` pipeline selecting `start-webserver.py`.

    The pipeline prints the second column of every matching `ps -ef` row, so each
    returned entry is expected to be a process id in string form.
    """
    # The `[s]` character class keeps grep from matching its own command line.
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    stop_running_webservers() #in case it was already running
    ! pip install staticmap
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [None]:
# Start (or restart) the reviews search web server before the cells below embed its pages.
start_reviews_search_webserver()


In [None]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>


%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [None]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [None]:
%%html
<iframe src="http://localhost:2345/search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [None]:
# Print the full contents of the reviews entities CSV file.
! cat ../data/reviews/entities.csv

In [None]:
# Cleanup so the webserver doesn't keep running after you're done.
# NOTE(review): a later cell in this notebook embeds
# http://localhost:2345/semantic-search again; on a top-to-bottom run the server is
# already stopped by then — confirm whether this cell should run this early.
stop_running_webservers()

### Listing 7.6

In [6]:
# %load -s tag_query webserver/semantic_search/engine/__init__
def tag_query():
    """Return the request-parameter string that the next cell passes to engine.tag.

    NOTE(review): the literal spans three source lines, so the returned text contains
    the line breaks and indentation shown below — confirm the consumer tolerates them.
    """
    return '''json.nl=map&sort=popularity%20desc&matchText=true&
        echoParams=all&fl=id,type,canonical_form,name,country:countrycode_s,
        admin_area:admin_code_1_s,popularity,*_p,semantic_function'''


In [9]:
# Tag a sample query against the "entities" collection using the parameters from tag_query().
entities_collection = engine.get_collection("entities")
request =  tag_query()
# NOTE(review): the recorded output of this cell is a TypeError about tag_query()
# receiving a positional argument — confirm the arguments engine.tag expects.
engine.tag(entities_collection, request, "top kimchi near charlotte")

TypeError: tag_query() takes 0 positional arguments but 1 was given

### Listing 7.7

In [None]:
# %load -s process_semantic_query ../webserver/semantic_search
def process_semantic_query(collection, query_bytes):
    """Tag a UTF-8 encoded query and bundle the intermediate results into a dict.

    Args:
        collection: collection handle forwarded to engine.tag.
        query_bytes: the query as UTF-8 encoded bytes.

    Returns:
        A dict with the keys "query_tree", "tagger_data", "tagged_query",
        "parsed_query" and "resolved_query".
    """
    text = query_bytes.decode('UTF-8')
    # Tag the decoded query itself rather than a hard-coded sample string.
    request =  tag_request(text)
    tagger_data = engine.tag(collection, request)

    # NOTE(review): query_tree, tagged_query and parsed_query are never assigned in
    # this function, and tag_request / resolve_query / query_tree_to_resolved_string
    # are not defined in this notebook — presumably they come from the star imports or
    # from a part of the loaded source that was trimmed; confirm before relying on it.
    final_query = resolve_query(query_tree)
    resolved_query = query_tree_to_resolved_string(query_tree)

    response = {
        # "tagger_data" carries the tagger output computed above.
        "query_tree": query_tree, "tagger_data": tagger_data, "tagged_query": tagged_query,
        "parsed_query": parsed_query, "resolved_query": resolved_query,
    }

    return response


In [None]:
# Process a sample query and print selected stages of the result.
query_bytes = bytes("top kimchi near charlotte", 'UTF-8')
processed_query = process_semantic_query(entities_collection, query_bytes)
print("Tagged Query:" + processed_query["tagged_query"])
# NOTE(review): the response dict built by process_semantic_query in this notebook has
# no "enriched_query" or "transformed_query" key — confirm the intended key names.
print("\nEnriched Query:" + processed_query["enriched_query"])
print("\nTransformed Query:" + processed_query["transformed_query"])

### Listing 7.8

In [None]:
# %load -s popularity ../webserver/semantic_search/semantic_functions
def popularity(query, position):
    """Replace the node at `position` with a stars-based boost clause.

    The replacement only happens when at least one node follows `position` in
    query['query_tree']; the tree is modified in place.

    Returns:
        True when the node was replaced, False otherwise.
    """
    tree = query['query_tree']
    # Guard: nothing follows this node, so the command is not applied.
    if position >= len(tree) - 1:
        return False

    boost_clause = '+{!func v="mul(if(stars_i,stars_i,0),20)"}'
    tree[position] = {"type": "solr", "query": boost_clause}
    return True

### Listing 7.9

In [None]:
# %load -s location_distance,create_geo_filter ../webserver/semantic_search/semantic_functions
def location_distance(query, position):
    """Turn "<command> <city>" into a single geo-filter node.

    When the node right after `position` has type "city", that node is removed from
    query["query_tree"] and the node at `position` is replaced by a filter built
    from the city's 'location_p' value, the "location_p" field and a distance of 50.

    Returns:
        True when the replacement happened, False otherwise.
    """
    tree = query["query_tree"]
    # Guard: there is no following node to inspect.
    if position >= len(tree) - 1:
        return False

    following = tree[position + 1]
    if following["type"] != "city":
        return False

    # Remove the city node first, then build the filter from the saved reference.
    tree.pop(position + 1)
    geo_clause = create_geo_filter(following['location_p'], "location_p", 50)
    tree[position] = {"type": "solr", "query": geo_clause}
    return True

def create_geo_filter(coordinates, field, distanceInKM):
    """Build a `+{!geofilt ...}` clause from a point, a field name and a distance."""
    pieces = [
        "+{!geofilt d=", str(distanceInKM),
        ' sfield="', field,
        '" pt="', coordinates,
        '"}',
    ]
    return "".join(pieces)

### Listing 7.10

In [None]:
# %load -s traverse_skg webserver/semantic_search/engine/semantic_knowledge_graph
def traverse_skg(collection, keyword):
    """Search `collection` with a relatedness-faceted request built around `keyword`.

    The request uses `keyword` as the foreground query and "*:*" as the background,
    and asks for up to 3 "text_t" terms and 1 "doc_type" value, each sorted by its
    relatedness score in descending order.

    Returns:
        Whatever `collection.search` returns for that request.
    """
    relatedness = "relatedness($fore,$back)"

    related_terms_facet = {
        "type": "terms", "field": "text_t",
        "limit": 3, "sort": {"r1": "desc"},
        "facet": {"r1": relatedness},
    }
    doc_type_facet = {
        "type": "terms", "field": "doc_type",
        "limit": 1, "sort": {"r2": "desc"},
        "facet": {"r2": relatedness},
    }
    keyword_facet = {
        "type": "query", "query": keyword,
        "facet": {
            "related_terms": related_terms_facet,
            "doc_type": doc_type_facet,
        },
    }
    request = {
        "params": {"fore": keyword, "back": "*:*", "df": "text_t"},
        "query": "*:*", "limit": 0,
        "facet": {"term_needing_vector": keyword_facet},
    }
    return collection.search(request)


In [None]:
# %load -s parse_skg_response ../webserver/semantic_search/engine/semantic_knowledge_graph
def parse_skg_response(skg_response):
    """Reduce a traverse_skg response to an optional category and a boosted term vector.

    Returns:
        A dict that always contains "term_vector" (space-separated `term^weight`
        entries with the weight formatted to four decimals, or "" when there are no
        related terms) and contains "category" (the "val" of the first "doc_type"
        bucket) only when the response has at least one such bucket.
    """
    parsed = {}
    related_term_nodes = []

    if "facets" in skg_response and "term_needing_vector" in skg_response["facets"]:
        keyword_facet = skg_response["facets"]["term_needing_vector"]

        if "doc_type" in keyword_facet and "buckets" in keyword_facet["doc_type"]:
            doc_type_buckets = keyword_facet["doc_type"]["buckets"]
            if len(doc_type_buckets) > 0:
                # Only the top bucket is used for the category.
                parsed["category"] = doc_type_buckets[0]["val"]

        if "related_terms" in keyword_facet and "buckets" in keyword_facet["related_terms"]:
            related_buckets = keyword_facet["related_terms"]["buckets"]
            if len(related_buckets) > 0:
                related_term_nodes = related_buckets

    boosted_terms = [
        node["val"] + "^" + "{:.4f}".format(node["r1"]["relatedness"])
        for node in related_term_nodes
    ]
    parsed["term_vector"] = " ".join(boosted_terms)

    return parsed


In [None]:
# Traverse the SKG for a single keyword in the "reviews" collection and show the parsed result.
query = "kimchi"
reviews_collection = engine.get_collection("reviews")
skg_response = traverse_skg(reviews_collection, query)
parse_skg_response(skg_response)

In [None]:
def skg(query):
    """Traverse the SKG for `query` in the reviews collection and return the parsed result."""
    # traverse_skg takes the collection as its first argument; use the reviews
    # collection obtained in the previous cell.
    return parse_skg_response(traverse_skg(reviews_collection, query))

other_queries = ["bbq", "korean bbq", "lasagna", "karaoke", "drive through"]
for query in other_queries: print(f"{query}: {skg(query)}")


### Listing 7.11

In [None]:
#%load process_query_tree ../webserver/semantic_search/query_tree
def escape_quotes_in_query(query):
    """Return `query` with every double quote preceded by a backslash."""
    # Splitting on the quote and re-joining with the escaped form rewrites each occurrence.
    segments = query.split('"')
    return '\\"'.join(segments)

def to_query_string(query_tree):
    """Join the "query" value of every node in `query_tree` with single spaces."""
    clauses = []
    for node in query_tree:
        clauses.append(node["query"])
    return " ".join(clauses)

def enrich(query_tree, collection=None):
    """Resolve semantic functions, then replace keyword nodes with SKG enrichments.

    Args:
        query_tree: list of node dicts; it is modified in place.
        collection: collection handed to traverse_skg. Defaults to the engine's
            "reviews" collection, the one the SKG example cells in this notebook use.

    Returns:
        The same list, with every "keyword" node replaced by a "skg_enriched" node.
    """
    if collection is None:
        collection = engine.get_collection("reviews")

    query_tree = process_semantic_functions(query_tree)
    for i, item in enumerate(query_tree):
        if item["type"] == "keyword":
            # traverse_skg takes the collection as its first argument.
            skg_response = traverse_skg(collection, item["surface_form"])
            enrichments = parse_skg_response(skg_response)
            # Keep the surface form: transform_query falls back to it when the
            # enrichments yield an empty term vector.
            query_tree[i] = {"type": "skg_enriched",
                             "surface_form": item["surface_form"],
                             "enrichments": enrichments}
    return query_tree

def transform_query(query_tree):
    """Rewrite every non-"solr" node of `query_tree` into a {"type": "solr"} node.

    Nodes are replaced in place according to their "type":
      * "solr": left untouched.
      * "skg_enriched": an edismax clause over the enrichment term vector (plus a
        required doc_type clause when a category is present), falling back to the
        node's surface form when the term vector is empty.
      * "color", "known_item"/"event", "brand": a required match of the canonical
        form on colors_s, name_s and brand_s respectively.
      * "city": a required match of the name on city_t.
      * anything else: a required edismax clause over the surface form.

    Returns:
        The same list object that was passed in.
    """
    for index, node in enumerate(query_tree):
        node_type = node["type"]

        if node_type == "solr":
            # Already in its final form.
            continue

        if node_type == "skg_enriched":
            enrichments = node["enrichments"]
            boosted = enrichments["term_vector"] if "term_vector" in enrichments else ""
            if "category" in enrichments and len(boosted) > 0:
                boosted += f' +doc_type:"{enrichments["category"]}"'
            if len(boosted) == 0:
                boosted = node["surface_form"]
            clause = '{!edismax v="' + escape_quotes_in_query(boosted) + '"}'
        elif node_type == "color":
            clause = f'+colors_s:"{node["canonical_form"]}"'
        elif node_type == "known_item" or node_type == "event":
            clause = f'+name_s:"{node["canonical_form"]}"'
        elif node_type == "city":
            clause = f'+city_t:"{str(node["name"])}"'
        elif node_type == "brand":
            clause = f'+brand_s:"{node["canonical_form"]}"'
        else:
            clause = "+{!edismax v=\"" + escape_quotes_in_query(node["surface_form"]) + "\"}"

        query_tree[index] = {"type": "solr", "query": clause}
    return query_tree

def process_semantic_functions(query_tree):
    """Evaluate every "semantic_function" node of `query_tree`, left to right.

    For each such node the expression stored under 'semantic_function' is evaluated
    with the local names `query` ({"query_tree": query_tree}) and `position` in
    scope. A node whose expression is empty or evaluates to False is removed from
    the tree. The tree is modified in place.

    Returns:
        The same list object that was passed in.
    """
    position = 0
    while position < len(query_tree):
        item = query_tree[position]
        # Commands are processed left to right; other weighting options may be
        # considered later.
        if item['type'] == "semantic_function":
            commandIsResolved = False

            command = item['semantic_function']

            if command:
                query = {"query_tree": query_tree} #pass by-ref
                # SECURITY: the expression comes from the indexed documents and is
                # eval'd as code. Those documents MUST be trusted, otherwise this is
                # a code-injection vector. The names `query` and `position` must
                # stay as they are because the evaluated expression can refer to them.
                commandIsResolved = eval(command)

            if False == commandIsResolved:
                # Bad command. Just remove it for now... it could alternatively be
                # kept and run as a keyword.
                query_tree.pop(position)
                # The following node has shifted into `position`; look at this
                # index again instead of skipping over that node.
                continue

        position += 1

    return query_tree

In [None]:
# Run one query through tagging, query-tree generation, enrichment and transformation.
query = "good kimchi in charlotte"
# NOTE(review): tag_query is defined earlier in this notebook with no parameters, so
# this one-argument call looks stale — confirm the intended tagging call. json and
# generate_query_tree are not imported or defined in this notebook; presumably they
# come from the star imports in the first cell.
tagger_data = json.loads(tag_query(bytes(query, "utf-8")))
query_tree = generate_query_tree(query, tagger_data)
enriched_query_tree = enrich(query_tree)
processed_query_tree = transform_query(enriched_query_tree)

### Listing 7.12

In [None]:
# %load -s to_search_request ../webserver/semantic_search/query_tree/transform
def to_search_request(query_tree):
    """Concatenate the "query" value of each node, separated by single spaces."""
    node_queries = []
    for node in query_tree:
        node_queries.append(node["query"])
    return " ".join(node_queries)


In [None]:
### TODO: the following is added to the manuscript. Query-tree generation should be
### decoupled so the tree can be passed in and just work.
#collection = engine.get_collection("reviews")
def to_query_string(query_tree):
    """Return the space-separated "query" values of all nodes in `query_tree`."""
    parts = []
    for node in query_tree:
        parts.append(node["query"])
    return " ".join(parts)

# Build the final query string from the current query tree and display it.
query_string = to_search_request(query_tree)
query_string

In [None]:
# Run the generated query string against the "reviews" collection.
reviews_collection = engine.get_collection("reviews")
request = {"query": query_string}
# The response is stored in `response`; this cell does not display it.
response = reviews_collection.search(request)

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [None]:
# Final cleanup: stop the webserver so it doesn't keep running after you're done.
stop_running_webservers()

## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)