# Semantic Search Application

In [1]:
import sys
sys.path.append('..')
sys.path.append("../webserver")
from aips import *
from webserver.semantic_search import *
from webserver.semantic_search.engine import *
from webserver.semantic_search.query_tree import *
engine = get_engine()

ModuleNotFoundError: No module named 'semantic_search'

## Starting the Reviews Search Web Server and Launching the Search Page

In [None]:
def get_running_webservers():
    """Return the pids of running `start-webserver.py` processes.

    Runs a `ps | grep | awk` pipeline through IPython's `!` capture syntax and
    returns the captured output lines (one pid string per matching process).
    """
    # The '[s]' character class stops grep from matching its own command line;
    # awk keeps only the second column of `ps -ef`, which is used as the pid.
    already_running_webservers = ! ps -ef | grep '[s]tart-webserver.py' | awk '{print $2}'
    return already_running_webservers
    
def stop_running_webservers():
    """Force-kill every running `start-webserver.py` process.

    Prints one "Stopping webserver" line per pid before killing it. A pid that
    can no longer be signalled (for example because the process already
    exited) is reported and skipped rather than raising.
    """
    # Imported locally so this cell does not depend on names leaking in from
    # the notebook's star imports.
    import os
    import signal

    for pid in get_running_webservers():
        pid = pid.strip()
        if not pid.isdigit():
            continue  # ignore blank or non-pid lines in the captured output
        print("Stopping webserver (pid: " + pid + ")")
        try:
            # Equivalent of `kill -9 <pid>`. The pid is signalled directly
            # instead of going through `xargs`, which reads its arguments from
            # stdin and may not run the command at all when stdin is empty.
            os.kill(int(pid), signal.SIGKILL)
        except OSError as error:
            print("Could not stop webserver (pid: " + pid + "): " + str(error))

def start_reviews_search_webserver():
    """Restart the reviews search web server as a background process.

    Stops any instance that is already running, launches `start-webserver.py`
    from `../webserver` in the background, and prints the first pid reported
    by `get_running_webservers()` if at least one server process is found.
    """
    stop_running_webservers() #in case it was already running
    # Rebinds `get_ipython().system` to `os.system`; nothing here restores it.
    # NOTE(review): presumably needed so the trailing `&` below really detaches
    # the server. `os` is not imported explicitly in this notebook's import
    # cell -- confirm it is provided by one of the star imports.
    get_ipython().system = os.system
    ! cd ../webserver && python start-webserver.py &
    # NOTE(review): the check runs immediately after the launch with no wait,
    # and nothing is printed when no process is found -- confirm that a silent
    # failure is acceptable here.
    if len(get_running_webservers()) > 0:
        print("Successfully Started Webserver (pid: " + get_running_webservers()[0] + ")!")

### Listing 7.2

In [None]:
#Start the web server
start_reviews_search_webserver()


Successfully Started Webserver (pid: 47169)!


### Figure 7.2

In [None]:
%%html
<iframe src="http://localhost:2345/search" width=100% height="800"></iframe>


### Figure 7.3

In [None]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+near+charlotte" width=100% height="800"></iframe>

### Figure 7.4

In [None]:
%%html
<iframe src="http://localhost:2345/search?q=bbq+charlotte" width=100% height="800"></iframe>

### Figure 7.5

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+kimchi+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.6

In [None]:
%%html
<iframe src="http://localhost:2345/search?q=top+korean+bbq+near+charlotte&submit=false" width=100% height="800"></iframe>

### Figure 7.7

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=top+korean+bbq+near+charlotte&submit=true" width=100% height="800"></iframe>

### Listing 7.3

In [None]:
!! cat ../data/reviews/entities.csv

['id,surface_form,canonical_form,type,popularity,semantic_function',
 '1,near,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '2,in,{location_distance},semantic_function,100,"location_distance(query, position)"',
 '3,by,{location_distance},semantic_function,90,"location_distance(query, position)"',
 '4,by,{text_within_one_edit_distance},semantic_function,10,"text_within_one_edit_distance(query, position)"',
 '5,near,{text_distance},semantic_function,10,"text_distance(query, position)"',
 '6,popular,{popular},semantic_function,100,"popularity(query, position)"',
 '7,top,{popular},semantic_function,100,"popularity(query, position)"',
 '8,best,{popular},semantic_function,100,"popularity(query, position)"',
 '9,good,{popular},semantic_function,100,"popularity(query, position)"',
 '10,violet,violet,color,100,',
 '11,violet crowne,violet crowne,brand,100,',
 '12,violet crowne charlottesville,violet crowne charlottesville,movie_theater,100,',
 '13,violet crown

### Listing 7.4 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.4).

### Listing 7.5 : [located here](../ch07/1.index-datasets.ipynb#Listing-7.5).

### Listing 7.6

In [None]:
def tag_query(collection_name, query):
    """POST `query` to the `/tag` handler of a collection and return the JSON reply.

    Args:
        collection_name: Name of the collection whose `/tag` handler is called.
        query: Request body sent as-is (the text to be tagged).

    Returns:
        The decoded JSON body of the response.
    """
    url_params = "&".join([
        "json.nl=map",
        "sort=popularity%20desc",
        "matchText=true",
        "echoParams=all",
        "fl=id,type,canonical_form,surface_form,name,country:countrycode_s,admin_area:admin_code_1_s,popularity,*_p,semantic_function",
    ])
    endpoint = f"{SOLR_URL}/{collection_name}/tag?{url_params}"
    tagger_response = requests.post(endpoint, data=query)
    return tagger_response.json()

In [None]:
# Tag a sample query against the "entities" collection; the returned JSON is
# displayed as the cell's result.
query = "top korean bbq near charlotte"
tag_query("entities", query)

TypeError: tag_query() missing 1 required positional argument: 'query'

### Listing 7.7

In [None]:
# %load -s process_semantic_query webserver/semantic_search/__init__
def process_semantic_query(collection, query_bytes):
    """Run a raw query through tagging, enrichment and transformation.

    Args:
        collection: Collection handed to `enrich` for the knowledge-graph lookups.
        query_bytes: The query as UTF-8 encoded bytes; sent unchanged to the
            tagger and decoded to text for `generate_query_tree`.

    Returns:
        A dict with the keys "tagged_query", "enriched_query",
        "transformed_query" (the final query string) and "tagger_data"
        (the raw tagger response).
    """
    decoded_query = query_bytes.decode('UTF-8')
    tagger_data = tag_query("entities", query_bytes)

    query_tree, tagged_query, enriched_query = generate_query_tree(decoded_query, tagger_data)

    # enrich -> transform -> serialise, each step consuming the previous tree.
    transformed_tree = transform_query(enrich(collection, query_tree))

    return {
        "tagged_query": tagged_query,
        "enriched_query": enriched_query,
        "transformed_query": to_query_string(transformed_tree),
        "tagger_data": tagger_data,
    }


In [2]:
# Process a sample query end to end against the "reviews" collection and print
# each stage of the result. `engine` comes from the first cell of the notebook.
reviews_collection = engine.get_collection("reviews")
# process_semantic_query expects the query as UTF-8 bytes.
query_bytes = bytes("top korean bbq near charlotte", 'UTF-8')
processed_query = process_semantic_query(reviews_collection, query_bytes)
print("Tagged Query:" + processed_query["tagged_query"])
print("\nEnriched Query:" + processed_query["enriched_query"])
print("\nTransformed Query:" + processed_query["transformed_query"])

NameError: name 'engine' is not defined

### Listing 7.8

In [None]:
# %load -s popularity webserver/semantic_search/semantic_functions/__init__
def popularity(query, position):
    """Replace the node at `position` with a boost on the `stars_i` field.

    The replacement only happens when at least one node follows `position`
    in `query["query_tree"]`.

    Returns:
        True when the node was replaced, False when `position` is the last
        node (or beyond) and the tree was left untouched.
    """
    tree = query["query_tree"]
    if position >= len(tree) - 1:
        return False

    tree[position] = {
        "type": "solr",
        "query": '+{!func v="mul(if(stars_i,stars_i,0),20)"}',
    }
    return True


### Listing 7.9

In [None]:
# %load -s location_distance,create_geo_filter webserver/semantic_search/semantic_functions/__init__
def location_distance(query, position):
    """Turn "<function> <city>" into a single geo-filter node.

    When the node right after `position` has type "city", that city node is
    removed from `query["query_tree"]` and the node at `position` is replaced
    by a "solr" node filtering on `location_p` within 50 km of the city's
    `location_p` value.

    Returns:
        True when the replacement was made, otherwise False (no following
        node, or the following node is not a city).
    """
    tree = query["query_tree"]
    if position >= len(tree) - 1:
        return False

    following = tree[position + 1]
    if following["type"] != "city":
        return False

    # The city is consumed by the filter, so drop it before rewriting.
    tree.pop(position + 1)
    geo_filter = create_geo_filter(following['location_p'], "location_p", 50)
    tree[position] = {"type": "solr", "query": geo_filter}
    return True

def create_geo_filter(coordinates, field, distance_in_KM):
    """Build a required `{!geofilt}` clause.

    Args:
        coordinates: Point string placed in `pt="..."`.
        field: Field name placed in `sfield="..."`.
        distance_in_KM: Distance placed in `d=`; converted with `str()`.

    Returns:
        The clause as a string, prefixed with `+`.
    """
    pieces = [
        "+{!geofilt d=", str(distance_in_KM),
        ' sfield="', field,
        '" pt="', coordinates,
        '"}',
    ]
    return "".join(pieces)


### Listing 7.10

In [None]:
# %load -s traverse_skg webserver/semantic_search/engine/semantic_knowledge_graph
def traverse_skg(collection, keyword):
    """Query the collection for terms and a doc_type related to `keyword`.

    Builds a facet request whose `term_needing_vector` query facet (on
    `keyword`) has two sibling sub-facets, each sorted by a
    `relatedness($fore,$back)` score:

    * `related_terms` -- up to 3 terms from `text_t`
    * `doc_type`      -- the single top value of `doc_type`

    Returns:
        Whatever `collection.search` returns for that request.
    """
    relatedness = "relatedness($fore,$back)"

    related_terms_facet = {
        "type": "terms", "field": "text_t",
        "limit": 3, "sort": {"r1": "desc"},
        "facet": {"r1": relatedness},
    }
    doc_type_facet = {
        "type": "terms", "field": "doc_type",
        "limit": 1, "sort": {"r2": "desc"},
        "facet": {"r2": relatedness},
    }

    request = {
        "params": {"fore": keyword, "back": "*:*", "df": "text_t"},
        "query": "*:*",
        "limit": 0,
        "facet": {
            "term_needing_vector": {
                "type": "query",
                "query": keyword,
                "facet": {
                    "related_terms": related_terms_facet,
                    "doc_type": doc_type_facet,
                },
            },
        },
    }
    return collection.search(request)


In [None]:
# %load -s parse_skg_response webserver/semantic_search/engine/semantic_knowledge_graph
def parse_skg_response(skg_response):
    """Reduce a `traverse_skg` response to a category and a boosted term vector.

    Returns:
        A dict that always has "term_vector": the related terms joined by
        spaces, each written as `<val>^<relatedness to 4 decimals>` (empty
        string when there are none). "category" is added, holding the `val`
        of the first `doc_type` bucket, only when such a bucket exists.
    """
    parsed = {}
    term_buckets = []

    if "facets" in skg_response and "term_needing_vector" in skg_response["facets"]:
        node = skg_response["facets"]["term_needing_vector"]

        if "doc_type" in node and "buckets" in node["doc_type"]:
            doc_type_buckets = node["doc_type"]["buckets"]
            if len(doc_type_buckets) > 0:
                # Only the top-ranked doc_type is used.
                parsed["category"] = doc_type_buckets[0]["val"]

        if "related_terms" in node and "buckets" in node["related_terms"]:
            candidate_buckets = node["related_terms"]["buckets"]
            if len(candidate_buckets) > 0:
                term_buckets = candidate_buckets

    parsed["term_vector"] = " ".join(
        bucket["val"] + "^" + "{:.4f}".format(bucket["r1"]["relatedness"])
        for bucket in term_buckets
    )
    return parsed


In [3]:
# Look up related terms and the top doc_type for a sample phrase in the
# "reviews" collection; the parsed result is displayed as the cell's output.
query = "korean bbq"
reviews_collection = engine.get_collection("reviews")
skg_response = traverse_skg(reviews_collection, query)
parse_skg_response(skg_response)

NameError: name 'engine' is not defined

In [None]:
def skg(query):
    """Return the parsed knowledge-graph result for `query`.

    Uses the notebook-level `reviews_collection`, so that variable must be
    defined before this function is called.
    """
    raw_response = traverse_skg(reviews_collection, query)
    return parse_skg_response(raw_response)

# Print the parsed category / term vector for a few more sample queries.
# Note: the loop rebinds the notebook-level `query` variable.
other_queries = ["bbq", "korean bbq", "lasagna", "karaoke", "drive through"]
for query in other_queries: print(f"{query}: {skg(query)}")


bbq: {'category': 'Barbeque', 'term_vector': 'bbq^0.9191 ribs^0.6186 pork^0.5991'}
korean bbq: {'category': 'Korean', 'term_vector': 'bbq^0.9052 korean^0.8641 pork^0.6079'}
lasagna: {'category': 'Italian', 'term_vector': 'lasagna^0.9193 alfredo^0.3992 pasta^0.3909'}
karaoke: {'category': 'Karaoke', 'term_vector': 'karaoke^0.9193 sing^0.6423 songs^0.5256'}
drive through: {'category': 'Fast Food', 'term_vector': 'through^0.8999 drive^0.8613 thru^0.6118'}


### Listing 7.11

In [None]:
# %load webserver/semantic_search/query_tree/__init__
def escape_quotes_in_query(query):
    """Return `query` with every double quote prefixed by a backslash."""
    # Splitting on the quote and re-joining with the escaped form touches
    # exactly the same characters as a direct replace.
    return '\\"'.join(query.split('"'))

def to_query_string(query_tree):
    """Join the "query" value of every node in `query_tree` with single spaces."""
    clauses = []
    for node in query_tree:
        clauses.append(node["query"])
    return " ".join(clauses)

def enrich(collection, query_tree):
    """Resolve semantic functions, then enrich every keyword node.

    After `process_semantic_functions` has run, each node of type "keyword"
    is replaced by a node of type "skg_enriched" carrying the parsed
    knowledge-graph result for the keyword's `surface_form`.

    Args:
        collection: Collection passed to `traverse_skg`.
        query_tree: List of node dicts; modified in place.

    Returns:
        The (same) query tree.
    """
    query_tree = process_semantic_functions(query_tree)
    for i, item in enumerate(query_tree):
        if item["type"] == "keyword":
            skg_response = traverse_skg(collection, item["surface_form"])
            enrichments = parse_skg_response(skg_response)
            # Keep the original text on the node: transform_query falls back
            # to item["surface_form"] when the term vector comes back empty,
            # which raised KeyError when the key was dropped here.
            query_tree[i] = {"type": "skg_enriched",
                             "surface_form": item["surface_form"],
                             "enrichments": enrichments}
    return query_tree

def transform_query(query_tree):
    """Rewrite every non-"solr" node of `query_tree` into a "solr" node.

    Nodes already of type "solr" are left as they are. Every other node is
    replaced in place by `{"type": "solr", "query": <clause>}`, where the
    clause depends on the node type:

    * "skg_enriched": an optional edismax clause built from the term vector,
      plus a required `doc_type` clause when a category is present; falls
      back to the node's `surface_form` when the term vector is empty.
    * "color", "known_item"/"event", "brand": a required match of
      `canonical_form` on `colors_s`, `name_s`, `brand_s` respectively.
    * "city": a required match of `name` on `city_t`.
    * anything else: a required edismax clause on `surface_form`.

    Returns:
        The (same) query tree.
    """
    for index, node in enumerate(query_tree):
        node_type = node["type"]

        if node_type == "solr":
            continue  # already in final form

        if node_type == "skg_enriched":
            enrichments = node["enrichments"]
            expanded = enrichments["term_vector"] if "term_vector" in enrichments else ""
            if "category" in enrichments and len(expanded) > 0:
                expanded += f' +doc_type:"{enrichments["category"]}"'
            if len(expanded) == 0:
                expanded = node["surface_form"]
            clause = '{!edismax v="' + escape_quotes_in_query(expanded) + '"}'
        elif node_type == "color":
            clause = f'+colors_s:"{node["canonical_form"]}"'
        elif node_type in ("known_item", "event"):
            clause = f'+name_s:"{node["canonical_form"]}"'
        elif node_type == "city":
            clause = f'+city_t:"{str(node["name"])}"'
        elif node_type == "brand":
            clause = f'+brand_s:"{node["canonical_form"]}"'
        else:
            clause = "+{!edismax v=\"" + escape_quotes_in_query(node["surface_form"]) + "\"}"

        query_tree[index] = {"type": "solr", "query": clause}
    return query_tree

def process_semantic_functions(query_tree):
    """Evaluate every "semantic_function" node of `query_tree`, left to right.

    For each node of type "semantic_function", the expression stored under
    its "semantic_function" key is evaluated. The expression may rewrite the
    tree through the local `query` dict (`query["query_tree"]` is the tree
    itself) and is expected to return True on success. A node whose
    expression is empty or evaluates to False is removed from the tree.

    Args:
        query_tree: List of node dicts; modified in place.

    Returns:
        The (same) query tree.
    """
    position = 0
    while position < len(query_tree):
        item = query_tree[position]
        if item['type'] == "semantic_function":
            command = item['semantic_function']
            command_is_resolved = False

            if command:
                # The evaluated expression refers to the local names `query`
                # and `position`, so these two locals must keep those names.
                query = {"query_tree": query_tree}  # shares the list, so edits are visible here
                # SECURITY: this evaluates code stored with the entity data.
                # The entity documents MUST be trusted, otherwise this is a
                # code-injection vector.
                command_is_resolved = eval(command)

            # Compared against False explicitly so that an expression
            # returning None keeps its node, as before.
            if command_is_resolved == False:
                # Unresolvable command: drop it. The next node has now moved
                # into `position`, so do not advance -- advancing here used
                # to skip that node without ever examining it.
                query_tree.pop(position)
                continue

        position += 1

    return query_tree

In [None]:
# Walk a sample query through the pipeline one step at a time:
# tag -> build tree -> enrich -> transform.
query = "good korean bbq near charlotte"
tagger_data = engine.tag_query("entities", (bytes(query, "utf-8")))
# NOTE(review): process_semantic_query unpacks generate_query_tree(...) into
# three values (query_tree, tagged_query, enriched_query), while a single
# value is bound here -- confirm which form matches the imported function.
query_tree = generate_query_tree(query, tagger_data)
enriched_query_tree = enrich(reviews_collection,query_tree)
processed_query_tree = transform_query(enriched_query_tree)

### Listing 7.12

In [None]:
# %load -s to_query_string webserver/semantic_search/query_tree/__init__
def to_query_string(query_tree):
    """Join the "query" value of every node in `query_tree` with single spaces."""
    clauses = []
    for node in query_tree:
        clauses.append(node["query"])
    return " ".join(clauses)


In [None]:
# Serialise the tree into one query string and display it.
# NOTE(review): this reads `query_tree`, not `processed_query_tree`; that only
# works because enrich/transform_query rewrite the list in place -- confirm
# generate_query_tree returned that same list.
query_string = to_query_string(query_tree)
query_string

'+{!func v="mul(if(stars_i,stars_i,0),20)"} {!edismax v="kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\"Korean\\""} +{!geofilt d=50 sfield="location_p" pt="35.22709,-80.84313"}'

In [None]:
# Run the generated query string against the "reviews" collection and display
# the raw response.
reviews_collection = engine.get_collection("reviews")
request = {"query": query_string}
response = reviews_collection.search(request)
response

{'responseHeader': {'zkConnected': True,
  'status': 0,
  'QTime': 10,
  'params': {'json': '{"query": "+{!func v=\\"mul(if(stars_i,stars_i,0),20)\\"} {!edismax v=\\"kimchi^0.9193 korean^0.7069 banchan^0.6593 +doc_type:\\\\\\"Korean\\\\\\"\\"} +{!geofilt d=50 sfield=\\"location_p\\" pt=\\"35.22709,-80.84313\\"}"}'}},
 'response': {'numFound': 15842,
  'start': 0,
  'docs': [{'id': 'WnLhd38sH80ViWwzyF7yoA',
    'name_t': 'Hibiscus',
    'city_t': 'Charlotte',
    'state_t': 'NC',
    'text_t': "We ate here for dinner and had a very tasty meal of bibimbap and bulgogi. Both dishes were done well and tasty. We had a very pleasant waitress who provided great service as well. Overall great! We'll definitely be back.",
    'stars_i': 5,
    'categories_t': 'Restaurants, Korean, Thai',
    'doc_type': ['Restaurants, Korean, Thai'],
    'location_pt_s': '35.171873,-80.849032',
    'location_p': '35.171873,-80.849032',
    'type_ss': ['Restaurants, Korean, Thai'],
    'latitude_d': 35.171873,
  

In [None]:
%%html
<iframe src="http://localhost:2345/semantic-search?q=good+kimchi+near+charlotte&submit=true" width=100% height="800"></iframe>

In [None]:
#Cleanup so webserver doesn't keep running after you're done
stop_running_webservers()

Stopping webserver (pid: 47169)


## Success!

Up next: Chapter 8 - [Signals Boosting Models](../ch08/1.signals-boosting.ipynb)