# Semantic Search Application

In [1]:
import sys
sys.path.append('..')
sys.path.append("../webserver")
from aips import *
from webserver.semantic_search import *
from webserver.semantic_search.engine import *
from webserver.semantic_search.engine.text_tagger import *
from webserver.semantic_search.query_tree import *
engine = get_engine()

## Starting the Reviews Search Web Server and Launching the Search Page

In [8]:
def get_running_webservers():
    """Return the pids of the running `start-webserver.py` processes.

    The result is whatever IPython captures from the shell pipeline below:
    one entry per matching process, holding the second column of `ps -ef`.
    """
    # The `[s]` character class keeps grep from matching its own command line.
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    """Forcefully stop every running `start-webserver.py` process.

    Each pid reported by `get_running_webservers()` is announced and then sent
    SIGKILL directly. The signal is delivered with `os.kill` instead of piping
    through `xargs`: `xargs` takes its arguments from stdin, and whether it
    runs the command at all on empty input differs between implementations.

    Failures (process already gone, no permission, unparsable pid) are
    reported and skipped so that the remaining pids are still processed.
    """
    import os
    import signal

    for pid in get_running_webservers():
        print("Stopping webserver (pid: " + pid + ")")
        try:
            os.kill(int(pid), signal.SIGKILL)
        except (OSError, ValueError) as error:
            # Best effort: keep going with the other pids.
            print("Could not stop webserver (pid: " + pid + "): " + str(error))

def start_reviews_search_webserver():
    """Restart the reviews search web server as a background process.

    Any instance that is already running is stopped first. Afterwards, if a
    matching process is found, a confirmation containing the first reported
    pid is printed; nothing is printed otherwise.
    """
    stop_running_webservers() #in case it was already running
    # Route bare `!` commands through os.system.
    # NOTE(review): presumably done so the trailing `&` below backgrounds the
    # server without blocking the cell — confirm. The rebinding is never
    # undone, so it stays in effect for later bare `!` commands in this kernel.
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    # The check runs immediately after launch; it only shows that a matching
    # process exists, not that the server is ready to accept requests.
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [9]:
# Start the web server in the background (an instance that is already running is stopped first)
start_reviews_search_webserver()


### Figure 7.2

In [10]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>


### Figure 7.3

In [11]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [12]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [13]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [14]:
%%html
<iframe src="http://localhost:2345/search?q=top+korean+bbq+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [15]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+korean+bbq+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [16]:
# NOTE(review): the saved output of this cell reports "No such file or directory"
# for this path — confirm where entities.csv lives relative to this notebook.
!! cat ../data/reviews/entities.csv

['cat: ../data/reviews/entities.csv: No such file or directory']

### Listing 7.4 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.4).

### Listing 7.5 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.5).

### Listing 7.6

In [53]:
# Request parameters for the tagger, written as a URL query string.
# NOTE(review): the line breaks inside the triple-quoted string are part of the
# value (the echoed `fl` in the saved output contains a "\n"). Presumably the
# engine tolerates this — confirm before reusing the string elsewhere.
params = """
json.nl=map&sort=popularity%20desc&matchText=true&echoParams=all&
fl=id,type,canonical_form,surface_form,name,country:countrycode_s,
admin_area:admin_code_1_s,popularity,*_p,semantic_function"""
query = "top kimchi near charlotte"
# Tag the query text against the "entities" collection.
tagger = TextTagger("entities")
tagger_data = tagger.tag_query(query, params)
# Display the raw tagger response.
tagger_data

{'responseHeader': {'status': 0,
  'QTime': 0,
  'params': {'json.nl': 'map',
   'field': 'name_tag',
   'echoParams': 'all',
   'fl': 'id,type,canonical_form,surface_form,name,country:countrycode_s,\nadmin_area:admin_code_1_s,popularity,*_p,semantic_function',
   'sort': 'popularity desc',
   'matchText': 'true'}},
 'tagsCount': 3,
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11, 'endOffset': 15, 'matchText': 'near', 'ids': ['1', '5']},
  {'startOffset': 16,
   'endOffset': 25,
   'matchText': 'charlotte',
   'ids': ['4460243', '4612828', '4680560', '4988584', '5234793']}],
 'response': {'numFound': 8,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': '1',
    'surface_form': 'near',
    'canonical_form': '{location_distance}',
    'type': 'semantic_function',
    'popularity': 90,
    'semantic_function': 'location_distance(query, position)'},
   {'id': '5',
    'surface_form': 'near',
    'canonical_form': '{text_distance}'

### Listing 7.7

In [54]:
# %load -s process_semantic_query ../webserver/semantic_search/__init__.py
def process_semantic_query(collection, query):
    """Run `query` through tagging, parsing, enrichment and transformation.

    Args:
        collection: passed through to `enrich`.
        query: the query text (str); it is UTF-8 encoded before tagging.

    Returns:
        A dict with four entries:
          - "tagged_query": result of `generate_tagged_query`
          - "parsed_query": JSON dump of the query tree as produced by
            `generate_query_tree`, taken before enrichment
          - "transformed_query": the transformed tree joined by
            `to_query_string`
          - "tagger_data": the raw tagger response
    """
    encoded_query = bytes(query, "UTF-8")
    tagger_data = TextTagger("entities").tag_query(encoded_query)

    result = {"tagged_query": generate_tagged_query(query, tagger_data)}

    query_tree = generate_query_tree(query, tagger_data)
    # Serialize now: the tree is handed to enrich/transform_query right after.
    result["parsed_query"] = json.dumps(query_tree)

    engine_nodes = transform_query(enrich(collection, query_tree))
    result["transformed_query"] = to_query_string(engine_nodes)
    result["tagger_data"] = tagger_data
    return result


In [55]:
# Run the full semantic pipeline for a sample query against the reviews collection.
reviews_collection = engine.get_collection("reviews")
query = "top kimchi near charlotte"

processed_query = process_semantic_query(reviews_collection, query)

# Show each intermediate stage: tagged text, parsed tree (JSON), final engine query.
print("Tagged Query:" + processed_query["tagged_query"])
print("\nParsed Query:" + processed_query["parsed_query"])
print("\nTransformed Query:" + processed_query["transformed_query"])

Tagged Query: {top} kimchi {near} {charlotte}

Parsed Query:[{"id": "7", "surface_form": "top", "canonical_form": "{popular}", "type": "semantic_function", "popularity": 100, "semantic_function": "popularity(query, position)"}, {"type": "keyword", "known": false, "surface_form": "kimchi", "canonical_form": "kimchi"}, {"id": "1", "surface_form": "near", "canonical_form": "{location_distance}", "type": "semantic_function", "popularity": 90, "semantic_function": "location_distance(query, position)"}, {"id": "4460243", "name": "Charlotte", "canonical_form": "Charlotte", "surface_form": "Charlotte", "popularity": 827097, "type": "city", "location_p": "35.22709,-80.84313", "admin_area": "NC"}]

Transformed Query:+{!func v="mul(if(stars_i,stars_i,0),20)"} {!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\"Korean\""} +{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}


### Listing 7.8

In [24]:
# %load -s popularity ../webserver/semantic_search/semantic_functions/__init__.py
def popularity(query, position):
    """Replace the node at `position` with an engine node holding a function query on `stars_i`.

    The replacement only happens when at least one more node follows
    `position` in `query["query_tree"]`.

    Args:
        query: dict whose "query_tree" entry is the list of nodes (edited in place).
        position: index of the node to replace.

    Returns:
        True when the node was replaced, False when nothing follows it.
    """
    nodes = query["query_tree"]
    nothing_follows = position >= len(nodes) - 1
    if nothing_follows:
        return False

    nodes[position] = dict(
        type="engine",
        query='+{!func v="mul(if(stars_i,stars_i,0),20)"}')
    return True


### Listing 7.9

In [25]:
# %load -s location_distance,create_geo_filter ../webserver/semantic_search/semantic_functions/__init__.py
def location_distance(query, position):
    """Turn "<this node> <city node>" into a single geo-filter engine node.

    When the node right after `position` has type "city", that node is removed
    from `query["query_tree"]` and the node at `position` is replaced by an
    engine node whose query is built by `create_geo_filter` from the city's
    `location_p` value, the field "location_p" and a distance of 50.

    Args:
        query: dict whose "query_tree" entry is the list of nodes (edited in place).
        position: index of the node being processed.

    Returns:
        True when the replacement happened, False when there is no following
        node or it is not a city.
    """
    nodes = query["query_tree"]
    following = position + 1
    if following >= len(nodes):
        return False

    candidate = nodes[following]
    if candidate["type"] != "city":
        return False

    nodes.pop(following)
    geo_filter = create_geo_filter(candidate["location_p"], "location_p", 50)
    nodes[position] = {"type": "engine", "query": geo_filter}
    return True

def create_geo_filter(coordinates, field, distance_in_KM):
    """Build a required (`+`-prefixed) `{!geofilt}` clause.

    Args:
        coordinates: string placed in the `pt` parameter.
        field: string placed in the `sfield` parameter.
        distance_in_KM: value placed in the `d` parameter (converted with `str`).

    Returns:
        The clause as a string.
    """
    pieces = [
        "+{!geofilt d=", str(distance_in_KM),
        ' sfield="', field,
        '" pt="', coordinates,
        '"}',
    ]
    return "".join(pieces)


### Listing 7.10

In [49]:
# %load -s traverse_skg,parse_skg_response ../webserver/semantic_search/engine/semantic_knowledge_graph.py
def traverse_skg(collection, keyword):
    """Send a relatedness facet request for `keyword` and return the response.

    The request matches all documents (`*:*`, limit 0) and defines one query
    facet, "term_needing_vector", scoped to `keyword`. It has two terms
    sub-facets, each sorted descending by `relatedness($fore,$back)`:
      - "related_terms": top 3 values of `text_t`
      - "doc_type": top 1 value of `doc_type`
    `$fore` is `keyword`, `$back` is `*:*`, and `df` is `text_t`.

    Args:
        collection: object exposing `search(request)`.
        keyword: the term or phrase to look up.

    Returns:
        Whatever `collection.search` returns for the request.
    """
    relatedness = "relatedness($fore,$back)"

    related_terms_facet = {
        "type": "terms", "field": "text_t",
        "limit": 3, "sort": {"r1": "desc"},
        "facet": {"r1": relatedness},
    }
    doc_type_facet = {
        "type": "terms", "field": "doc_type",
        "limit": 1, "sort": {"r2": "desc"},
        "facet": {"r2": relatedness},
    }
    keyword_facet = {
        "type": "query", "query": keyword,
        "facet": {
            "related_terms": related_terms_facet,
            "doc_type": doc_type_facet,
        },
    }
    request = {
        "params": {"fore": keyword, "back": "*:*", "df": "text_t"},
        "query": "*:*", "limit": 0,
        "facet": {"term_needing_vector": keyword_facet},
    }
    return collection.search(request)

def parse_skg_response(skg_response):
    """Reduce a `traverse_skg` response to a category and a boosted term vector.

    Args:
        skg_response: dict; only `facets.term_needing_vector` is inspected.

    Returns:
        A dict that always contains "term_vector" — the related-term buckets
        rendered as `val^relatedness` (four decimals) and joined by spaces, or
        "" when there are none — and, when the `doc_type` facet has at least
        one bucket, "category" set to the value of its first bucket.
    """
    result = {}
    related_buckets = ()

    if "facets" in skg_response and "term_needing_vector" in skg_response["facets"]:
        vector_node = skg_response["facets"]["term_needing_vector"]

        if "doc_type" in vector_node and "buckets" in vector_node["doc_type"]:
            doc_type_buckets = vector_node["doc_type"]["buckets"]
            if len(doc_type_buckets) > 0:
                # Only the top bucket is used for now.
                result["category"] = doc_type_buckets[0]["val"]

        if "related_terms" in vector_node and "buckets" in vector_node["related_terms"]:
            candidate_buckets = vector_node["related_terms"]["buckets"]
            if len(candidate_buckets) > 0:
                related_buckets = candidate_buckets

    result["term_vector"] = " ".join(
        bucket["val"] + "^" + "{:.4f}".format(bucket["r1"]["relatedness"])
        for bucket in related_buckets)
    return result


In [29]:
# Look up a single keyword in the reviews collection and display the parsed
# category and term vector.
query = "kimchi"
reviews_collection = engine.get_collection("reviews")
skg_response = traverse_skg(reviews_collection, query)
parse_skg_response(skg_response)

{'category': 'Korean',
 'term_vector': 'kimchi^0.9193 korean^0.7069 banchan^0.6593'}

In [30]:
def skg(query):
    """Return the parsed SKG enrichments of `query` from the reviews collection.

    Uses the notebook-level `reviews_collection`.
    """
    skg_response = traverse_skg(reviews_collection, query)
    return parse_skg_response(skg_response)

# Print the enrichments for a few more sample queries, one per line.
other_queries = ["bbq", "korean bbq", "lasagna", "karaoke", "drive through"]
for query in other_queries:
    print(f"{query}: {skg(query)}")


bbq: {'category': 'Barbeque', 'term_vector': 'bbq^0.9191 ribs^0.6186 pork^0.5991'}
korean bbq: {'category': 'Korean', 'term_vector': 'bbq^0.9052 korean^0.8641 pork^0.6079'}
lasagna: {'category': 'Italian', 'term_vector': 'lasagna^0.9193 alfredo^0.3992 pasta^0.3909'}
karaoke: {'category': 'Karaoke', 'term_vector': 'karaoke^0.9193 sing^0.6423 songs^0.5256'}
drive through: {'category': 'Fast Food', 'term_vector': 'through^0.8999 drive^0.8613 thru^0.6118'}


### Listing 7.11

In [None]:
# %load ../webserver/semantic_search/query_tree/__init__.py
def escape_quotes_in_query(query):
    """Escape `query` for use inside a double-quoted `v="..."` value.

    Backslashes are escaped first and double quotes second. Escaping only the
    quotes is not enough: a trailing backslash would swallow the closing quote,
    and an existing `\\"` sequence would become an escaped backslash followed
    by a bare quote that ends the value early.

    Args:
        query: the raw text (str).

    Returns:
        The text with every backslash doubled and every `"` preceded by a backslash.
    """
    return query.replace("\\", "\\\\").replace('"', '\\"')

def to_query_string(query_tree):
    """Join the "query" value of every node in `query_tree` with single spaces."""
    clauses = (node["query"] for node in query_tree)
    return " ".join(clauses)

def enrich(collection, query_tree):
    """Resolve semantic functions, then enrich every keyword node via the SKG.

    After `process_semantic_functions` has run, each node of type "keyword" is
    replaced by a node of type "skg_enriched" that carries the parsed SKG
    enrichments for the keyword's surface form.

    The enriched node keeps the original "surface_form": `transform_query`
    falls back to `item["surface_form"]` when the enrichments yield no query
    text, and would raise KeyError if the key had been dropped here.

    Args:
        collection: passed to `traverse_skg`.
        query_tree: list of node dicts; edited in place.

    Returns:
        The list returned by `process_semantic_functions`, with keyword nodes replaced.
    """
    query_tree = process_semantic_functions(query_tree)
    for index, item in enumerate(query_tree):
        if item["type"] != "keyword":
            continue
        surface_form = item["surface_form"]
        enrichments = parse_skg_response(traverse_skg(collection, surface_form))
        query_tree[index] = {"type": "skg_enriched",
                             "surface_form": surface_form,
                             "enrichments": enrichments}
    return query_tree

def transform_query(query_tree):
    """Convert every node of `query_tree` into an engine node, in place.

    Nodes of type "engine" are left untouched. Every other node for which a
    non-empty clause is produced is replaced by
    `{"type": "engine", "query": <clause>}`:
      - "skg_enriched": non-required edismax clause over the term vector, with
        a required `doc_type` filter appended when a category is present and
        the term vector is non-empty; falls back to the node's surface form
        when no query text results
      - "color" / "brand": required match on `colors_s` / `brand_s`
      - "known_item" / "event": required match on `name_s`
      - "city": required match on `city_t` using the node's name
      - anything else: required edismax clause over the surface form

    Returns:
        The same list object that was passed in.
    """
    for index, node in enumerate(query_tree):
        node_type = node["type"]
        engine_query = ""

        if node_type == "engine":
            pass
        elif node_type == "skg_enriched":
            enrichments = node["enrichments"]
            query_text = enrichments["term_vector"] if "term_vector" in enrichments else ""
            if "category" in enrichments and len(query_text) > 0:
                query_text += f' +doc_type:"{enrichments["category"]}"'
            if len(query_text) == 0:
                query_text = node["surface_form"]
            engine_query = '{!edismax v="' + escape_quotes_in_query(query_text) + '"}'
        elif node_type == "color":
            canonical = node["canonical_form"]
            engine_query = f'+colors_s:"{canonical}"'
        elif node_type in ("known_item", "event"):
            canonical = node["canonical_form"]
            engine_query = f'+name_s:"{canonical}"'
        elif node_type == "city":
            city_name = str(node["name"])
            engine_query = f'+city_t:"{city_name}"'
        elif node_type == "brand":
            canonical = node["canonical_form"]
            engine_query = f'+brand_s:"{canonical}"'
        else:
            engine_query = "+{!edismax v=\"" + escape_quotes_in_query(node["surface_form"]) + "\"}"

        if engine_query:
            query_tree[index] = {"type": "engine", "query": engine_query}
    return query_tree

def process_semantic_functions(query_tree):
    """Resolve every "semantic_function" node of `query_tree`, in place.

    Nodes are visited left to right. For each node of type
    "semantic_function", the expression stored under its "semantic_function"
    key is evaluated with two names in scope: `query`
    (`{"query_tree": query_tree}`) and `position` (the node's index). A node
    whose command is empty, or whose command evaluates to False, is removed
    from the tree.

    After a removal the index is NOT advanced: the following node has moved
    into the current slot and must still be examined. Advancing
    unconditionally would skip it.

    Args:
        query_tree: list of node dicts; edited in place.

    Returns:
        The same list object that was passed in.
    """
    position = 0
    while position < len(query_tree):
        item = query_tree[position]
        # Commands are processed left to right for now; consider other
        # weighting options for ambiguous commands later.
        if item["type"] == "semantic_function":
            command_is_resolved = False
            command = item["semantic_function"]

            if command:
                # The evaluated expression refers to the local names `query`
                # and `position` (e.g. "location_distance(query, position)"),
                # so neither may be renamed. The dict shares the list, letting
                # the command edit the tree directly.
                query = {"query_tree": query_tree}
                # SECURITY: the command text comes from the indexed entity
                # docs and is executed with eval. Those docs MUST be trusted,
                # otherwise this is a code-injection vector.
                command_is_resolved = eval(command)

            if command_is_resolved == False:  # noqa: E712 - only an explicit False removes the node
                # Bad command: drop it for now (it could alternatively be
                # kept and run as a keyword).
                query_tree.pop(position)
                continue

        position += 1

    return query_tree

In [42]:
# Run the pipeline stages one by one for a sample query.
query = "good kimchi near charlotte"
# NOTE(review): tagging goes through engine.tag_query(...) here, whereas
# process_semantic_query uses TextTagger("entities").tag_query(...) — confirm
# that both entry points are meant to be supported.
tagger_data = engine.tag_query("entities", (bytes(query, "utf-8")))
query_tree = generate_query_tree(query, tagger_data)
# enrich and transform_query (as defined in this notebook) edit the list they
# receive and return it, so `query_tree` itself ends up holding the engine nodes.
enriched_query_tree = enrich(reviews_collection,query_tree)
processed_query_tree = transform_query(enriched_query_tree)

### Listing 7.12

In [43]:
# %load -s to_query_string ../webserver/semantic_search/query_tree/__init__.py
def to_query_string(query_tree):
    """Join the "query" value of every node in `query_tree` with single spaces."""
    clauses = (node["query"] for node in query_tree)
    return " ".join(clauses)


In [44]:
# Use the fully transformed tree explicitly instead of relying on `query_tree`
# having been rewritten in place by enrich/transform_query.
query_string = to_query_string(processed_query_tree)
query_string

'+{!func v="mul(if(stars_i,stars_i,0),20)"} {!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean\\""} +{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'

In [45]:
# Send the generated query string to the reviews collection and display the raw response.
reviews_collection = engine.get_collection("reviews")
request = {"query": query_string}
reviews_collection.search(request)

{'responseHeader': {'zkConnected': True,
  'status': 0,
  'QTime': 6,
  'params': {'json': '{"query": "+{!func v=\\"mul(if(stars_i,stars_i,0),20)\\"} {!edismax v=\\"kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\\\\\"Korean\\\\\\"\\"} +{!geofilt d=50 sfield=\\"location_p\\" pt=\\"35.22709,-80.84313\\"}"}'}},
 'response': {'numFound': 15842,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': 'WnLhd38sH80ViWwzyF7yoA',
    'name_t': 'Hibiscus',
    'city_t': 'Charlotte',
    'state_t': 'NC',
    'text_t': "We ate here for dinner and had a very tasty meal of bibimbap and bulgogi. Both dishes were done well and tasty. We had a very pleasant waitress who provided great service as well. Overall great! We'll definitely be back.",
    'stars_i': 5,
    'categories_t': 'Restaurants, Korean, Thai',
    'doc_type': ['Restaurants, Korean, Thai'],
    'location_pt_s': '35.171873,-80.849032',
    'location_p': '35.171873,-80.849032',
    'type_ss': ['Restaurants, Korean, Thai'],
    'la

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [None]:
# Clean up so the web server doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 47169)


## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)