# Semantic Search Application

In [1]:
import sys
sys.path.append('..')
sys.path.append("webserver")
from aips import *
from webserver.semantic_search import *
from webserver.semantic_search.engine import *
from webserver.semantic_search.query_tree import *
engine = get_engine()

## Starting the Reviews Search Web Server and Launching the Search Page

In [20]:
def get_running_webservers():
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    already_running_webservers = get_running_webservers()
    for pid in already_running_webservers:
        print("Stopping webserver (pid: " + pid + ")")
        results = ! xargs kill -9 {pid}

def start_reviews_search_webserver():
    stop_running_webservers() #in case it was already running
    ! pip install staticmap
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [21]:
#Start the web server
start_reviews_search_webserver()




In [22]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>


%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [23]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [24]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [25]:
%%html
<iframe src="http://localhost:2345/search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [26]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [27]:
! cat ../data/reviews/entities.csv

In [2]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

### Listing 7.6

In [29]:
# %load -s tag_query solr.SolrEngine
def tag_query(collection_name, query):
    url_params = "json.nl=map&sort=popularity%20desc&matchText=true&echoParams=all&fl=id,type,canonical_form,surface_form,name,country:countrycode_s,admin_area:admin_code_1_s,popularity,*_p,semantic_function"
    return requests.post(f"{SOLR_URL}/{collection_name}/tag?{url_params}", query).json()

In [3]:
tag_query("entities", "top kimchi near charolette")

{'responseHeader': {'status': 0,
  'QTime': 0,
  'params': {'sort': 'popularity desc',
   'matchText': 'true',
   'json.nl': 'map',
   'field': 'name_tag',
   'echoParams': 'all',
   'fl': 'id,type,canonical_form,name,country:countrycode_s,admin_area:admin_code_1_s,popularity,*_p,semantic_function'}},
 'tagsCount': 2,
 'tags': [{'startOffset': 0, 'endOffset': 3, 'matchText': 'top', 'ids': ['7']},
  {'startOffset': 11,
   'endOffset': 15,
   'matchText': 'near',
   'ids': ['1', '5']}],
 'response': {'numFound': 3,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': '1',
    'canonical_form': '{location_distance}',
    'type': 'semantic_function',
    'popularity': 90,
    'semantic_function': 'location_distance(query, position)'},
   {'id': '5',
    'canonical_form': '{text_distance}',
    'type': 'semantic_function',
    'popularity': 10,
    'semantic_function': 'text_distance(query, position)'},
   {'id': '7',
    'canonical_form': '{popular}',
    'type': 'semantic_function',
 

### Listing 7.7

In [7]:
# %load -s process_semantic_query webserver/semantic_search/__init__
def process_semantic_query(collection, query_bytes):
    text = query_bytes.decode('UTF-8')
    tagger_data = tag_query("entities", query_bytes)
    
    query_tree, tagged_query, enriched_query = generate_query_representations(text, tagger_data)
        
    enriched_query_tree = enrich(collection, query_tree)
    transformed = transform_query(enriched_query_tree)
    query_string = to_query_string(transformed)      

    response = {
        "tagged_query": tagged_query,
        "enriched_query": enriched_query, 
        "transformed_query": query_string,
        "tagger_data": tagger_data
    }

    return response


In [8]:
reviews_collection = engine.get_collection("reviews")
query_bytes = bytes("top kimchi near charlotte", 'UTF-8')
processed_query = process_semantic_query(reviews_collection, query_bytes)
print("Tagged Query:" + processed_query["tagged_query"])
print("\nEnriched Query:" + processed_query["enriched_query"])
print("\nTransformed Query:" + processed_query["transformed_query"])

{'responseHeader': {'zkConnected': True, 'status': 0, 'QTime': 51, 'params': {'json': '{"params": {"fore": "kimchi", "back": "*:*", "df": "text_t"}, "query": "*:*", "limit": 0, "facet": {"term_needing_vector": {"type": "query", "query": "kimchi", "facet": {"related_terms": {"type": "terms", "field": "text_t", "limit": 3, "sort": {"r1": "desc"}, "facet": {"r1": "relatedness($fore,$back)"}}, "doc_type": {"type": "terms", "field": "doc_type", "limit": 1, "sort": {"r2": "desc"}, "facet": {"r2": "relatedness($fore,$back)"}}}}}}'}}, 'response': {'numFound': 192140, 'start': 0, 'numFoundExact': True, 'docs': []}, 'facets': {'count': 192140, 'term_needing_vector': {'count': 193, 'related_terms': {'buckets': [{'val': 'kimchi', 'count': 193, 'r1': {'relatedness': 0.91934, 'foreground_popularity': 0.001, 'background_popularity': 0.001}}, {'val': 'korean', 'count': 95, 'r1': {'relatedness': 0.7069, 'foreground_popularity': 0.00049, 'background_popularity': 0.00366}}, {'val': 'banchan', 'count': 22

### Listing 7.8

In [10]:
# %load -s popularity webserver/semantic_search/semantic_functions/__init__
def popularity(query, position):
    if (len(query["query_tree"]) -1 > position):
        query["query_tree"][position] = {
            "type": "solr",
            "query": '+{!func v="mul(if(stars_i,stars_i,0),20)"}'}
        return True
    else:
        return False


### Listing 7.9

In [12]:
# %load -s location_distance,create_geo_filter webserver/semantic_search/semantic_functions/__init__
def location_distance(query, position):
    if (len(query["query_tree"]) -1 > position):
        next_entity = query["query_tree"][position + 1]
        if (next_entity["type"] == "city"):
            query["query_tree"].pop(position + 1)
            query["query_tree"][position] = {
                "type": "solr",
                "query": create_geo_filter(next_entity['location_p'],
                "location_p", 50)}
            return True
    return False

def create_geo_filter(coordinates, field, distance_in_KM):
    return "+{!geofilt d=" + str(distance_in_KM) + ' sfield="' + field + '" pt="' + coordinates + '"}'


### Listing 7.10

In [18]:
# %load -s traverse_skg webserver/semantic_search/engine/semantic_knowledge_graph
def traverse_skg(collection, keyword):
    query = {
        "params": {"fore": keyword, "back": "*:*", "df": "text_t"},
        "query": "*:*", "limit": 0,
        "facet": {
            "term_needing_vector": {
                "type": "query", "query": keyword,
                "facet": {
                    "related_terms": {
                        "type": "terms", "field": "text_t",
                        "limit": 3, "sort": {"r1": "desc"},
                        "facet": {"r1": "relatedness($fore,$back)"}},
                "doc_type": {
                    "type": "terms", "field": "doc_type",
                    "limit": 1, "sort": {"r2": "desc"},
                    "facet": {"r2": "relatedness($fore,$back)"}}}}}}    
    return collection.search(query)


In [19]:
# %load -s parse_skg_response webserver/semantic_search/engine/semantic_knowledge_graph
def parse_skg_response(skg_response):
    parsed = {}
    related_term_nodes = {}
    if "facets" in skg_response and "term_needing_vector" in skg_response["facets"]:    
        if ("doc_type" in skg_response["facets"]["term_needing_vector"] and 
            "buckets" in skg_response["facets"]["term_needing_vector"]["doc_type"] and
            len(skg_response["facets"]["term_needing_vector"]["doc_type"]["buckets"]) > 0):
            parsed["category"] = skg_response["facets"]["term_needing_vector"]["doc_type"]["buckets"][0]["val"] #just top one for now
    
        if ("related_terms" in skg_response["facets"]["term_needing_vector"] 
            and "buckets" in skg_response["facets"]["term_needing_vector"]["related_terms"] 
          and len(skg_response["facets"]["term_needing_vector"]["related_terms"]["buckets"]) > 0): #at least one entry    
            related_term_nodes = skg_response["facets"]["term_needing_vector"]["related_terms"]["buckets"]
                
    term_vector = ""
    for related_term_node in related_term_nodes:
        if len(term_vector) > 0: term_vector += " " 
        term_vector += related_term_node["val"] + "^" + "{:.4f}".format(related_term_node["r1"]["relatedness"])
    
    parsed["term_vector"] = term_vector

    return parsed


In [20]:
query = "kimchi"
reviews_collection = engine.get_collection("reviews")
skg_response = traverse_skg(reviews_collection, query)
parse_skg_response(skg_response)

{'responseHeader': {'zkConnected': True, 'status': 0, 'QTime': 50, 'params': {'json': '{"params": {"fore": "kimchi", "back": "*:*", "df": "text_t"}, "query": "*:*", "limit": 0, "facet": {"term_needing_vector": {"type": "query", "query": "kimchi", "facet": {"related_terms": {"type": "terms", "field": "text_t", "limit": 3, "sort": {"r1": "desc"}, "facet": {"r1": "relatedness($fore,$back)"}}, "doc_type": {"type": "terms", "field": "doc_type", "limit": 1, "sort": {"r2": "desc"}, "facet": {"r2": "relatedness($fore,$back)"}}}}}}'}}, 'response': {'numFound': 192140, 'start': 0, 'numFoundExact': True, 'docs': []}, 'facets': {'count': 192140, 'term_needing_vector': {'count': 193, 'related_terms': {'buckets': [{'val': 'kimchi', 'count': 193, 'r1': {'relatedness': 0.91934, 'foreground_popularity': 0.001, 'background_popularity': 0.001}}, {'val': 'korean', 'count': 95, 'r1': {'relatedness': 0.7069, 'foreground_popularity': 0.00049, 'background_popularity': 0.00366}}, {'val': 'banchan', 'count': 22

{'term_vector': 'kimchi^0.9193 korean^0.7069 banchan^0.6593'}

In [22]:
def skg(query):
    return parse_skg_response(traverse_skg(reviews_collection, query))

other_queries = ["bbq", "korean bbq", "lasagna", "karaoke", "drive through"]
for query in other_queries: print(f"{query}: {skg(query)}")


{'responseHeader': {'zkConnected': True, 'status': 0, 'QTime': 49, 'params': {'json': '{"params": {"fore": "bbq", "back": "*:*", "df": "text_t"}, "query": "*:*", "limit": 0, "facet": {"term_needing_vector": {"type": "query", "query": "bbq", "facet": {"related_terms": {"type": "terms", "field": "text_t", "limit": 3, "sort": {"r1": "desc"}, "facet": {"r1": "relatedness($fore,$back)"}}, "doc_type": {"type": "terms", "field": "doc_type", "limit": 1, "sort": {"r2": "desc"}, "facet": {"r2": "relatedness($fore,$back)"}}}}}}'}}, 'response': {'numFound': 192140, 'start': 0, 'numFoundExact': True, 'docs': []}, 'facets': {'count': 192140, 'term_needing_vector': {'count': 1512, 'related_terms': {'buckets': [{'val': 'bbq', 'count': 1512, 'r1': {'relatedness': 0.91907, 'foreground_popularity': 0.00787, 'background_popularity': 0.00787}}, {'val': 'ribs', 'count': 223, 'r1': {'relatedness': 0.61857, 'foreground_popularity': 0.00116, 'background_popularity': 0.0041}}, {'val': 'pork', 'count': 381, 'r1'

### Listing 7.11

In [25]:
# %load webserver/semantic_search/query_tree/__init__
def escape_quotes_in_query(query):
    return query.replace('"', '\\"')

def to_query_string(query_tree):
    return " ".join([node["query"] for node in query_tree])

def enrich(collection, query_tree):
    query_tree = process_semantic_functions(query_tree)    
    for i in range(len(query_tree)):
        item = query_tree[i]
        if item["type"] == "keyword":
            skg_response = traverse_skg(collection, item["surface_form"])
            enrichments = parse_skg_response(skg_response)
            query_tree[i] = {"type": "skg_enriched", 
                             "enrichments": enrichments}                    
    return query_tree

def transform_query(query_tree):
    for i in range(len(query_tree)):
        item = query_tree[i]
        additional_query = ""
        match item["type"]:
            case "solr":
                pass
            case "skg_enriched":
                enrichments = item["enrichments"]
                query_string = ""
                
                if "term_vector" in enrichments:
                    query_string = enrichments["term_vector"]
                if "category" in enrichments and len(query_string) > 0:
                    query_string += f' +doc_type:"{enrichments["category"]}"'
                if (len(query_string) == 0):
                    query_string = item["surface_form"]
                    
                additional_query = '{!edismax v="' + escape_quotes_in_query(query_string) + '"}'
            case "color":
                additional_query = f'+colors_s:"{item["canonical_form"]}"'
            case "known_item" | "event":
                additional_query = f'+name_s:"{item["canonical_form"]}"'
            case "city":
                additional_query = f'+city_t:"{str(item["name"])}"'
            case "brand":
                additional_query = f'+brand_s:"{item["canonical_form"]}"'
            case _:
                additional_query = "+{!edismax v=\"" + escape_quotes_in_query(item["surface_form"]) + "\"}"
        if additional_query:
            query_tree[i] = {"type": "solr", "query": additional_query}                    
    return query_tree

def process_semantic_functions(query_tree):
    position = 0
    while position < len(query_tree):
        item = query_tree[position]        
        # process commands. For now, going left to right and then sorting by priority when ambiguous commands occur; 
        # consider other weighting options later.
        if (item['type'] == "semantic_function"):
            commandIsResolved = False
    
            command = item['semantic_function']

            if (command):
                query = {"query_tree": query_tree} #pass by-ref
                commandIsResolved = eval(item['semantic_function']); #Careful... there is code in the docs that is being eval'd. 
                #MUST ENSURE THESE DOCS ARE SECURE, OTHERWISE THIS WILL INTRODUCE A POTENTIAL SECURITY THREAT (CODE INJECTION)
            
            #else:
                #Alert ("Error: " + query.query_tree.canonical_form + " has no command function.");
            
            if (False == commandIsResolved):
                #Bad command. Just remove for now... could alternatively keep it and run as a keyword
                query_tree.pop(position) #.splice(position,1)  

        position += 1

    return query_tree 

In [47]:
query = "good kimchi in charlotte"
tagger_data = engine.tag_query("entities", (bytes(query, "utf-8")))
query_tree = generate_query_tree(query, tagger_data)
enriched_query_tree = enrich(reviews_collection,query_tree)
processed_query_tree = transform_query(enriched_query_tree)

{'responseHeader': {'zkConnected': True, 'status': 0, 'QTime': 1196, 'params': {'json': '{"params": {"fore": "kimchi", "back": "*:*", "df": "text_t"}, "query": "*:*", "limit": 0, "facet": {"term_needing_vector": {"type": "query", "query": "kimchi", "facet": {"related_terms": {"type": "terms", "field": "text_t", "limit": 3, "sort": {"r1": "desc"}, "facet": {"r1": "relatedness($fore,$back)"}}, "doc_type": {"type": "terms", "field": "doc_type", "limit": 1, "sort": {"r2": "desc"}, "facet": {"r2": "relatedness($fore,$back)"}}}}}}'}}, 'response': {'numFound': 192140, 'start': 0, 'numFoundExact': True, 'docs': []}, 'facets': {'count': 192140, 'term_needing_vector': {'count': 193, 'related_terms': {'buckets': [{'val': 'kimchi', 'count': 193, 'r1': {'relatedness': 0.91934, 'foreground_popularity': 0.001, 'background_popularity': 0.001}}, {'val': 'korean', 'count': 95, 'r1': {'relatedness': 0.7069, 'foreground_popularity': 0.00049, 'background_popularity': 0.00366}}, {'val': 'banchan', 'count': 

### Listing 7.12

In [48]:
# %load -s to_query_string webserver/semantic_search/query_tree/__init__
def to_query_string(query_tree):
    return " ".join([node["query"] for node in query_tree])


In [49]:
query_string = to_query_string(query_tree)
query_string

'+{!func v="mul(if(stars_i,stars_i,0),20)"} {!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean, Restaurants\\""} +{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'

In [50]:
reviews_collection = engine.get_collection("reviews")
request = {"query": query_string}
response = reviews_collection.search(request)
response

{'responseHeader': {'zkConnected': True,
  'status': 0,
  'QTime': 26,
  'params': {'json': '{"query": "+{!func v=\\"mul(if(stars_i,stars_i,0),20)\\"} {!edismax v=\\"kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\\\\\"Korean, Restaurants\\\\\\"\\"} +{!geofilt d=50 sfield=\\"location_p\\" pt=\\"35.22709,-80.84313\\"}"}'}},
 'response': {'numFound': 15842,
  'start': 0,
  'numFoundExact': True,
  'docs': [{'id': 'aKzNK6aaOrlL688lkXrXNA',
    'name_t': "Immaculate Mic's",
    'city_t': 'Concord',
    'state_t': 'NC',
    'text_t': "I do dog transports and was rescuing a Cane Corso from Vermont , when she got sick on the way home.  My daughter lives in Concord so I always stop there and decided to look up a detailing service.  I got a referral from Corso(Sweet Girl) had mad quite the mess with explosive diarrhea in the  the back of the car on the front passenger seat.  let's just say it was a god awful mess.  I contacted Immaculate Mic's after the referral and was happy to finally 

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [None]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)